In [2]:
import os
import cv2
import easyocr
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
from matplotlib import pyplot as plt
from PIL import Image
import numpy as np
import sys
import io
import warnings
import logging
import csv
from datetime import datetime

# Quiet the console: every Python warning is ignored and only ERROR-level
# (or more severe) log records are allowed through.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "TRANSFORMERS_VERBOSITY": "error",
    }
)
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)
for _noisy_logger in ("transformers", "easyocr"):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)
del _noisy_logger

# Suppress specific warning types
def suppress_specific_warnings():
    """Restrict the transformers library's own logging to errors only.

    The import is kept local so the library is only touched when this
    function is actually called.
    """
    import transformers

    # No monkeypatching of BatchEncoding here: a wrapper that merely forwards
    # to the original constructor silences nothing, and it would be stacked
    # once more every time this cell is re-run.
    transformers.logging.set_verbosity_error()


suppress_specific_warnings()

# Define a context manager to suppress stdout/stderr
class SuppressOutput:
    """Context manager that hides everything written to stdout and stderr.

    While the ``with`` body runs, both streams point at throwaway in-memory
    buffers; the streams that were active on entry are put back on exit.
    Exceptions raised inside the body are not swallowed.
    """

    def __init__(self):
        # Populated by __enter__ with the streams to restore later.
        self.stdout = self.stderr = None

    def __enter__(self):
        self.stdout, self.stderr = sys.stdout, sys.stderr
        sys.stdout, sys.stderr = io.StringIO(), io.StringIO()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout, sys.stderr = self.stdout, self.stderr

# Define model constants
MODEL_NAME = "microsoft/layoutlm-base-uncased"  # Use consistent model name

# Define Paths for SROIE dataset
task1_folder = r"D:/Fraud Detection/data/SROIE Dataset/SROIE2019/SROIE2019/task1_train/"
task2_folder = r"D:/Fraud Detection/data/SROIE Dataset/SROIE2019/SROIE2019/task2_train/"
output_path = r"D:\Fraud Detection\outputs"  # Folder to save the OCR results and processed images
results_path = os.path.join(output_path, "results")  # Subfolder for CSV results

# Ensure output directories exist. results_path is nested inside output_path,
# so one recursive call creates both; exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(results_path, exist_ok=True)

# Step 1: OCR Processing with EasyOCR
# Shared EasyOCR reader; building one loads the recognition models, so it is
# created lazily on first use and then reused for every image.
_OCR_READER = None


def _get_ocr_reader():
    """Return the shared English EasyOCR reader, creating it on first use."""
    global _OCR_READER
    if _OCR_READER is None:
        _OCR_READER = easyocr.Reader(['en'])
    return _OCR_READER


def perform_ocr(image_path):
    """Run EasyOCR on an image and split the detections into boxes and text.

    Args:
        image_path: Path of the image to read.

    Returns:
        A ``(boxes, text)`` tuple of parallel lists: the box coordinates of
        each detection and the string recognised inside it.
    """
    result = _get_ocr_reader().readtext(image_path)

    # Each detection starts with (box coordinates, recognised text).
    boxes = [detection[0] for detection in result]
    text = [detection[1] for detection in result]

    return boxes, text

# Step 2: Layout Analysis (visualize OCR results)
def visualize_ocr_results(image_path, boxes, text):
    """Draw every OCR box and its text on the image and save the annotated copy.

    The annotated image is written into ``output_path`` under the same file
    name as the input image.

    Args:
        image_path: Path of the image that was passed to OCR.
        boxes: One polygon per detection, each a sequence of (x, y) points.
        text: Recognised strings, parallel to ``boxes``.

    Raises:
        ValueError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise ValueError(f"Could not read image for visualization: {image_path}")

    for box, t in zip(boxes, text):
        pts = np.array(box, dtype=np.int32).reshape((-1, 1, 2))
        cv2.polylines(image, [pts], isClosed=True, color=(0, 255, 0), thickness=2)
        # putText needs plain Python ints for its origin; reuse the first
        # corner after the int32 cast so float coordinates cannot break it.
        x, y = (int(v) for v in pts[0, 0])
        cv2.putText(image, t, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

    output_image_path = os.path.join(output_path, os.path.basename(image_path))
    if not cv2.imwrite(output_image_path, image):
        # imwrite returns False on failure instead of raising.
        print(f"Warning: could not write annotated image to {output_image_path}")

# Step 3: Prepare data for LayoutLM (Tokenizing)
# Tokenizers keyed by model name so each one is loaded only once.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name):
    """Return the tokenizer for ``model_name``, loading it on first use only."""
    if model_name not in _TOKENIZER_CACHE:
        with SuppressOutput():
            _TOKENIZER_CACHE[model_name] = AutoTokenizer.from_pretrained(model_name)
    return _TOKENIZER_CACHE[model_name]


def prepare_layoutlm_input(image_path, boxes, text):
    """Tokenize the OCR text and derive axis-aligned boxes for one image.

    Args:
        image_path: Path of the source image.
        boxes: OCR polygons, each a sequence of (x, y) points.
        text: OCR strings, parallel to ``boxes``.

    Returns:
        ``(encoding, pil_image, text, normalized_boxes)`` where ``encoding``
        is the tokenizer output for a single sequence built from all OCR
        strings, ``pil_image`` is the image in RGB mode, ``text`` is returned
        unchanged and ``normalized_boxes`` holds one ``[x0, y0, x1, y1]``
        pixel box per OCR polygon (not rescaled).
    """
    # Load inside a context manager so the underlying file is closed;
    # convert() returns an independent in-memory copy.
    with Image.open(image_path) as source_image:
        pil_image = source_image.convert("RGB")

    # Collapse each polygon to its enclosing axis-aligned rectangle.
    normalized_boxes = []
    for box in boxes:
        xs = [point[0] for point in box]
        ys = [point[1] for point in box]
        normalized_boxes.append(
            [int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))]
        )

    tokenizer = _get_tokenizer(MODEL_NAME)

    # Layout boxes are deliberately not passed to the tokenizer (see the
    # compatibility note in the original code). The OCR strings are encoded
    # as ONE pre-split sequence: passing the bare list would create a batch
    # of len(text) padded sequences, of which downstream code reads only the
    # first row.
    with SuppressOutput():
        encoding = tokenizer(
            text,
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    return encoding, pil_image, text, normalized_boxes

# Step 4: LayoutLM Model Inference
# Token-classification models keyed by model name so weights load only once.
_MODEL_CACHE = {}


def _get_token_classifier(model_name):
    """Return the token-classification model for ``model_name``, loading it once."""
    if model_name not in _MODEL_CACHE:
        with SuppressOutput():
            model = AutoModelForTokenClassification.from_pretrained(model_name)
        model.eval()  # inference only
        _MODEL_CACHE[model_name] = model
    return _MODEL_CACHE[model_name]


def run_layoutlm_inference(encoding):
    """Run the token-classification model on a tokenizer encoding.

    Args:
        encoding: Tokenizer output; it is unpacked as keyword arguments to
            the model.

    Returns:
        The raw model output (its ``logits`` are consumed downstream).
    """
    # NOTE(review): MODEL_NAME looks like a base checkpoint; presumably the
    # classification head is freshly initialised unless fine-tuned weights
    # are supplied, which would make the predicted classes arbitrary - confirm.
    model = _get_token_classifier(MODEL_NAME)

    with SuppressOutput(), torch.no_grad():
        outputs = model(**encoding)

    return outputs

# Step 5: Process the output
def process_layoutlm_output(outputs):
    """Return the highest-scoring class index for every position in ``outputs.logits``.

    The arg-max is taken over the last dimension, so the result has the shape
    of the logits minus that final axis.
    """
    return outputs.logits.argmax(dim=-1)

# Function to save predictions to CSV
def _aligned_word_classes(text, predicted_classes, encoding):
    """Give each word the class of its first sub-token, using tokenizer word ids.

    Returns None when alignment is not applicable (no encoding, more than one
    sequence in the batch, or a tokenizer without word ids) so the caller can
    fall back to positional pairing.
    """
    from collections import Counter

    if encoding is None or len(predicted_classes) != 1:
        return None
    try:
        word_ids = encoding.word_ids(0)
    except (AttributeError, ValueError, IndexError):
        return None

    token_classes = predicted_classes[0].tolist()
    first_class = {}
    for word_id, token_class in zip(word_ids, token_classes):
        # Special and padding tokens carry a word id of None.
        if word_id is not None:
            first_class.setdefault(word_id, token_class)
    if not first_class:
        return None

    # Words lost to truncation have no token; give them the most common class.
    common_class = Counter(first_class.values()).most_common(1)[0][0]
    return [first_class.get(index, common_class) for index in range(len(text))]


def _positional_word_classes(text, predicted_classes):
    """Pair word i with prediction i of the first sequence, padding with the most common class."""
    from collections import Counter

    pred_list = predicted_classes[0].tolist()
    word_classes = pred_list[:len(text)]
    missing = len(text) - len(word_classes)
    if missing > 0:
        common_class = Counter(pred_list).most_common(1)[0][0]
        word_classes.extend([common_class] * missing)
    return word_classes


def _overall_class(predicted_classes):
    """Return 1 if any prediction equals 1, else 0; 0 if that cannot be determined."""
    try:
        return 1 if 1 in predicted_classes.unique().tolist() else 0
    except Exception:
        # Deliberate best effort: this is already the last-resort path.
        return 0


def save_predictions(image_path, text, predicted_classes, encoding=None):
    """Write one CSV row per OCR word with its predicted class.

    Args:
        image_path: Source image; its base name becomes part of the CSV name.
        text: OCR words/strings to label.
        predicted_classes: Tensor of class ids; the first row is used.
        encoding: Optional tokenizer output. When it exposes word ids for a
            single sequence, predictions are aligned to words through them;
            otherwise word i is paired with prediction i.

    Returns:
        ``(csv_path, suspicious_count, total_words)`` where
        ``suspicious_count`` is the number of rows whose class is 1.
    """
    # Unique file name: <image base name>_<timestamp>_predictions.csv
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_name = os.path.splitext(os.path.basename(image_path))[0]
    csv_path = os.path.join(results_path, f"{base_name}_{timestamp}_predictions.csv")

    try:
        word_classes = _aligned_word_classes(text, predicted_classes, encoding)
        if word_classes is None:
            word_classes = _positional_word_classes(text, predicted_classes)
    except Exception as e:
        # Last resort: label every word with one overall prediction.
        print(f"Warning: Error mapping predictions to words: {e}")
        word_classes = [_overall_class(predicted_classes)] * len(text)

    # Rows are built only after the mapping succeeded or fell back, so a
    # failure halfway through can never leave duplicated partial rows.
    results = [
        {
            "word": word,
            "predicted_class": word_class,
            "class_label": "Suspicious" if word_class == 1 else "Normal",
        }
        for word, word_class in zip(text, word_classes)
    ]

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['word', 'predicted_class', 'class_label'])
        writer.writeheader()
        writer.writerows(results)

    suspicious_count = sum(1 for r in results if r["predicted_class"] == 1)
    total_words = len(results)

    return csv_path, suspicious_count, total_words

# Function to iterate through all image files in the dataset folders
# Image extensions picked up from the dataset folders (matched case-insensitively).
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def _collect_image_files():
    """Return the sorted paths of all images under the two SROIE task folders."""
    image_files = []
    for folder in (task1_folder, task2_folder):
        if not os.path.exists(folder):
            continue
        for root, _dirs, files in os.walk(folder):
            image_files.extend(
                os.path.join(root, file)
                for file in files
                # Lower-case first so ".JPG" / ".PNG" files are not skipped.
                if file.lower().endswith(_IMAGE_EXTENSIONS)
            )
    # os.walk order is platform dependent; sort for reproducible runs.
    return sorted(image_files)


def _process_image(image_path):
    """Run OCR, visualization, inference and CSV export for a single image."""
    # Step 1: Perform OCR on the image
    boxes, text = perform_ocr(image_path)
    if not text:
        print(f"No text detected in {image_path}, skipping...")
        return

    # Step 2: Visualize the OCR results
    visualize_ocr_results(image_path, boxes, text)

    # Step 3: Prepare data for LayoutLM
    encoding, _pil_image, text, _normalized_boxes = prepare_layoutlm_input(image_path, boxes, text)

    # Steps 4-5: Run inference and reduce the logits to class ids
    predicted_class = process_layoutlm_output(run_layoutlm_inference(encoding))

    # Step 6: Save predictions to CSV without displaying them in terminal
    csv_path, suspicious_count, total_words = save_predictions(image_path, text, predicted_class, encoding)

    # Print a summary instead of raw predictions. total_words cannot be zero
    # inside the first branch because suspicious_count <= total_words.
    if suspicious_count > 0:
        risk_level = "HIGH" if suspicious_count / total_words > 0.2 else "MEDIUM"
        print(f"Result: {risk_level} RISK - {suspicious_count}/{total_words} suspicious elements")
    else:
        print("Result: LOW RISK - No suspicious elements found")

    print(f"Saved detailed analysis to: {csv_path}")


def process_sroie_dataset():
    """Run the full pipeline over every image in the SROIE folders.

    When neither dataset folder exists, a single hard-coded sample image is
    processed instead (if present). A failure on one image is reported with
    its traceback and does not stop the remaining images.
    """
    import traceback

    dataset_exists = os.path.exists(task1_folder) or os.path.exists(task2_folder)

    if not dataset_exists:
        print("Dataset folders not found. Using a sample image instead.")
        sample_image = r"C:\Users\Tanmay\Downloads\crisis result.png"
        if not os.path.exists(sample_image):
            print("Sample image not found. Please check the file path.")
            return
        image_files = [sample_image]
    else:
        image_files = _collect_image_files()
        if not image_files:
            print("No image files found in the dataset directories.")
            return

    for image_path in image_files:
        print(f"Processing: {image_path}")
        try:
            _process_image(image_path)
        except Exception as e:
            # Top-level boundary: report and carry on with the next image.
            print(f"Error processing {image_path}: {e}")
            traceback.print_exc()

# Main function to execute the pipeline
def main():
    """Run the OCR and layout-analysis pipeline, printing a start and end banner."""
    print("Starting OCR and Layout Analysis Pipeline...")
    process_sroie_dataset()
    print("OCR and Layout Analysis Complete.")

# Only run the pipeline when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()


Starting OCR and Layout Analysis Pipeline...
Processing: D:/Fraud Detection/data/SROIE Dataset/SROIE2019/SROIE2019/task1_train/X00016469612.jpg


KeyboardInterrupt: 

In [6]:
import os
import pandas as pd
import numpy as np
import re
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import warnings
import logging
from collections import Counter
from tqdm import tqdm

# Suppress UserWarning-category warnings and keep transformers logging at ERROR
warnings.filterwarnings("ignore", category=UserWarning)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
logging.getLogger("transformers").setLevel(logging.ERROR)

# Define paths
# RESULTS_PATH: scanned by NERProcessor for "*_predictions.csv" input files.
# OUTPUT_PATH:  where the NER summary and per-document entity CSVs are written.
# MODEL_PATH:   passed as cache_dir when the tokenizer and model are loaded.
# DATA_PATH:    not referenced by the code visible in this cell.
RESULTS_PATH = r"D:/Fraud Detection/outputs/results"
OUTPUT_PATH = r"D:/Fraud Detection/outputs/ner_results"
MODEL_PATH = r"D:/Fraud Detection/models"
DATA_PATH = r"D:/Fraud Detection/data"

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Define the NER model (Hugging Face model id loaded by NERProcessor).
# NOTE: this rebinds MODEL_NAME, which the OCR cell also defines (as the
# LayoutLM checkpoint) and reads at call time; re-run that cell before
# calling its functions again.
# NOTE(review): the run output reports 9 entity labels; presumably the
# CoNLL-2003 tag set (PER/ORG/LOC/MISC), which has no MONEY or DATE - confirm
# against the model card.
MODEL_NAME = "dslim/bert-base-NER"  # token-classification NER checkpoint

class NERProcessor:
    """Run a token-classification NER model over OCR prediction CSVs.

    The processor reads every ``*_predictions.csv`` file in ``RESULTS_PATH``,
    joins its ``word`` column into one text, extracts word-level entities and
    writes summary and per-document CSVs into ``OUTPUT_PATH``.
    """

    # Entity types counted towards ``risk_entity_count``.
    # NOTE(review): the loaded model reports 9 labels; presumably it never
    # emits MONEY or DATE, in which case only ORG contributes - confirm.
    HIGH_RISK_ENTITIES = ('ORG', 'MONEY', 'DATE')

    # File-name suffix that marks an OCR prediction file.
    _PREDICTION_SUFFIX = '_predictions.csv'

    def __init__(self):
        """Load the tokenizer and model named by ``MODEL_NAME`` (cached in ``MODEL_PATH``)."""
        print("Initializing NER Processor...")
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=MODEL_PATH)
        self.model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, cache_dir=MODEL_PATH)
        self.model.eval()  # Set model to evaluation mode

        # ID to label mapping for this NER model
        self.id2label = self.model.config.id2label
        print(f"Model loaded with {len(self.id2label)} entity labels")
        print("NER Processor initialized successfully!")

    def process_text(self, text):
        """Extract word-level entities from ``text``.

        Sub-word pieces (tokens starting with ``##``) are glued back onto the
        word they continue. Each tagged word is reported separately; adjacent
        words of one multi-word entity are not merged.

        Args:
            text: The text to analyse.

        Returns:
            ``(entities, entity_counts)``: a dict mapping entity type to the
            list of matched words, and a dict mapping entity type to the
            number of matches.
        """
        print(f"Processing text with length: {len(text)}")

        # truncation=True cuts the input at the model's maximum length, so
        # entities in the tail of a very long text are not reported.
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=2)

        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        labels = [self.id2label[label_id] for label_id in predictions[0].tolist()]

        # Special tokens ([CLS], [SEP], [PAD], ...) also receive a prediction
        # but are not part of the text, so they must never become an entity.
        special_tokens = set(getattr(self.tokenizer, "all_special_tokens", ()))

        entities = {}
        current_entity = None
        current_text = ""

        for token, label in zip(tokens, labels):
            if token.startswith("##"):
                # Continuation piece: extend the word currently being built.
                if current_entity:
                    current_text += token[2:]
                continue

            # A new word starts here, so the previous one (if any) is complete.
            if current_entity and current_text:
                entities.setdefault(current_entity, []).append(current_text)

            if token not in special_tokens and label.startswith(("B-", "I-")):
                current_entity = label[2:]  # drop the B-/I- prefix
                current_text = token
            else:
                current_entity = None
                current_text = ""

        # Don't forget the last entity
        if current_entity and current_text:
            entities.setdefault(current_entity, []).append(current_text)

        entity_counts = {entity: len(items) for entity, items in entities.items()}

        print(f"Found {sum(entity_counts.values())} entities across {len(entity_counts)} categories")
        return entities, entity_counts

    @classmethod
    def _document_id(cls, file_name):
        """Return the document id encoded in a prediction file name.

        Files are expected to be named ``<doc>_<YYYYMMDD>_<HHMMSS>_predictions.csv``;
        the timestamp is stripped from the right so ids that themselves contain
        underscores stay intact. Any other name falls back to the text before
        the first underscore.
        """
        stem = file_name[:-len(cls._PREDICTION_SUFFIX)]
        parts = stem.rsplit('_', 2)
        if len(parts) == 3 and parts[0] and parts[1].isdigit() and parts[2].isdigit():
            return parts[0]
        return file_name.split('_')[0]

    def process_prediction_files(self):
        """Run NER over every OCR prediction file and save the results.

        Returns:
            A list with one result dict per successfully processed file
            (empty when no prediction files are available).
        """
        print(f"Looking for OCR prediction files in {RESULTS_PATH}...")
        if os.path.isdir(RESULTS_PATH):
            # Sorted so runs are reproducible; os.listdir order is arbitrary.
            prediction_files = sorted(
                f for f in os.listdir(RESULTS_PATH) if f.endswith(self._PREDICTION_SUFFIX)
            )
        else:
            prediction_files = []

        if not prediction_files:
            print("No prediction files found. Please ensure OCR has been run first.")
            return []

        print(f"Found {len(prediction_files)} prediction files to process")
        results = []

        for file in tqdm(prediction_files, desc="Processing files"):
            file_path = os.path.join(RESULTS_PATH, file)
            doc_id = self._document_id(file)

            try:
                # keep_default_na=False: OCR words such as "NA" or "null" must
                # stay text instead of becoming NaN (and later "nan").
                df = pd.read_csv(file_path, keep_default_na=False)

                if 'word' not in df.columns:
                    print(f"Warning: File {file} doesn't have the expected 'word' column")
                    continue

                # Combine all non-empty words into a single text
                text = ' '.join(word for word in df['word'].astype(str) if word)

                entities, entity_counts = self.process_text(text)

                risk_entity_count = sum(
                    len(entities.get(entity_type, [])) for entity_type in self.HIGH_RISK_ENTITIES
                )

                results.append({
                    'document_id': doc_id,
                    'total_entities': sum(entity_counts.values()),
                    'entity_types': len(entity_counts),
                    # Number of distinct entity types found in the document.
                    'entity_complexity': len(entities),
                    'risk_entity_count': risk_entity_count,
                    'entities': entities,
                    'entity_counts': entity_counts
                })

            except Exception as e:
                # Per-file boundary: report and continue with the next file.
                print(f"Error processing {file}: {str(e)}")

        if results:
            self.save_results(results)

        return results

    def save_results(self, results):
        """Write the summary CSV and one entity CSV per document.

        Args:
            results: Result dicts as produced by ``process_prediction_files``.
        """
        print(f"Saving NER analysis results for {len(results)} documents...")

        summary_columns = (
            'document_id',
            'total_entities',
            'entity_types',
            'entity_complexity',
            'risk_entity_count',
        )
        df = pd.DataFrame([{column: r[column] for column in summary_columns} for r in results])

        output_file = os.path.join(OUTPUT_PATH, "ner_analysis_results.csv")
        df.to_csv(output_file, index=False)

        # Save detailed entity information for each document. Results that
        # share a document id write to the same file; the last one wins.
        for result in results:
            entity_records = [
                {'entity_type': entity_type, 'value': value}
                for entity_type, values in result['entities'].items()
                for value in values
            ]

            if entity_records:
                entity_file = os.path.join(OUTPUT_PATH, f"{result['document_id']}_entities.csv")
                pd.DataFrame(entity_records).to_csv(entity_file, index=False)

        print(f"Results saved to {OUTPUT_PATH}")
        print(f"Main results file: {output_file}")

def main():
    """Run NER over all OCR prediction files and print summary statistics."""
    print("Starting NER Processing System...")
    results = NERProcessor().process_prediction_files()

    # Summary is only printed when at least one document was processed.
    if results:
        total_entities = sum(r['total_entities'] for r in results)
        avg_entities = total_entities / len(results)
        # max() returns the first document holding the highest count.
        busiest = max(results, key=lambda r: r['total_entities'])
        max_entities = busiest['total_entities']
        max_doc = busiest['document_id']

        print("\n===== NER Processing Complete =====")
        print(f"Processed {len(results)} documents")
        print(f"Found {total_entities} total entities (avg: {avg_entities:.1f} per document)")
        print(f"Document with most entities: {max_doc} ({max_entities} entities)")
        print(f"Results saved to: {OUTPUT_PATH}")

    print("NER Processing completed!")


if __name__ == "__main__":
    main()

Starting NER Processing System...
Initializing NER Processor...
Model loaded with 9 entity labels
NER Processor initialized successfully!
Looking for OCR prediction files in D:/Fraud Detection/outputs/results...
Found 122 prediction files to process


Processing files:   0%|          | 0/122 [00:00<?, ?it/s]

Processing text with length: 455


Processing files:   1%|          | 1/122 [00:00<01:59,  1.01it/s]

Found 5 entities across 2 categories
Processing text with length: 553


Processing files:   2%|▏         | 2/122 [00:01<01:23,  1.45it/s]

Found 3 entities across 1 categories
Processing text with length: 706


Processing files:   2%|▏         | 3/122 [00:02<01:24,  1.41it/s]

Found 8 entities across 2 categories
Processing text with length: 554


Processing files:   3%|▎         | 4/122 [00:02<01:13,  1.60it/s]

Found 1 entities across 1 categories
Processing text with length: 716


Processing files:   4%|▍         | 5/122 [00:03<01:21,  1.43it/s]

Found 12 entities across 1 categories
Processing text with length: 366


Processing files:   5%|▍         | 6/122 [00:04<01:14,  1.56it/s]

Found 5 entities across 1 categories
Processing text with length: 863


Processing files:   6%|▌         | 7/122 [00:04<01:19,  1.44it/s]

Found 2 entities across 2 categories
Processing text with length: 441


Processing files:   7%|▋         | 8/122 [00:05<01:18,  1.45it/s]

Found 7 entities across 1 categories
Processing text with length: 853


Processing files:   7%|▋         | 9/122 [00:06<01:20,  1.41it/s]

Found 6 entities across 2 categories
Processing text with length: 806


Processing files:   8%|▊         | 10/122 [00:06<01:16,  1.46it/s]

Found 24 entities across 2 categories
Processing text with length: 583


Processing files:   9%|▉         | 11/122 [00:07<01:13,  1.50it/s]

Found 3 entities across 1 categories
Processing text with length: 764


Processing files:  10%|▉         | 12/122 [00:08<01:16,  1.44it/s]

Found 3 entities across 3 categories
Processing text with length: 566


Processing files:  11%|█         | 13/122 [00:08<01:12,  1.49it/s]

Found 4 entities across 2 categories
Processing text with length: 515


Processing files:  11%|█▏        | 14/122 [00:09<01:09,  1.55it/s]

Found 10 entities across 2 categories
Processing text with length: 1015


Processing files:  12%|█▏        | 15/122 [00:10<01:14,  1.44it/s]

Found 5 entities across 2 categories
Processing text with length: 566


Processing files:  13%|█▎        | 16/122 [00:11<01:15,  1.41it/s]

Found 4 entities across 2 categories
Processing text with length: 566


Processing files:  14%|█▍        | 17/122 [00:11<01:15,  1.39it/s]

Found 4 entities across 2 categories
Processing text with length: 566


Processing files:  15%|█▍        | 18/122 [00:12<01:08,  1.51it/s]

Found 4 entities across 2 categories
Processing text with length: 566


Processing files:  16%|█▌        | 19/122 [00:13<01:08,  1.50it/s]

Found 4 entities across 2 categories
Processing text with length: 911


Processing files:  16%|█▋        | 20/122 [00:13<01:09,  1.46it/s]

Found 4 entities across 2 categories
Processing text with length: 919


Processing files:  17%|█▋        | 21/122 [00:14<01:11,  1.41it/s]

Found 7 entities across 1 categories
Processing text with length: 911


Processing files:  18%|█▊        | 22/122 [00:15<01:11,  1.39it/s]

Found 4 entities across 2 categories
Processing text with length: 540


Processing files:  19%|█▉        | 23/122 [00:15<01:05,  1.51it/s]

Found 4 entities across 2 categories
Processing text with length: 591


Processing files:  20%|█▉        | 24/122 [00:16<01:03,  1.54it/s]

Found 4 entities across 1 categories
Processing text with length: 663


Processing files:  20%|██        | 25/122 [00:17<01:05,  1.48it/s]

Found 5 entities across 1 categories
Processing text with length: 507


Processing files:  21%|██▏       | 26/122 [00:17<01:01,  1.56it/s]

Found 12 entities across 3 categories
Processing text with length: 653


Processing files:  22%|██▏       | 27/122 [00:18<01:01,  1.55it/s]

Found 4 entities across 2 categories
Processing text with length: 694


Processing files:  23%|██▎       | 28/122 [00:19<01:00,  1.55it/s]

Found 5 entities across 3 categories
Processing text with length: 707


Processing files:  24%|██▍       | 29/122 [00:19<01:04,  1.45it/s]

Found 4 entities across 2 categories
Processing text with length: 707


Processing files:  25%|██▍       | 30/122 [00:20<00:59,  1.53it/s]

Found 4 entities across 2 categories
Processing text with length: 984


Processing files:  25%|██▌       | 31/122 [00:21<01:08,  1.33it/s]

Found 2 entities across 1 categories
Processing text with length: 984


Processing files:  26%|██▌       | 32/122 [00:22<01:13,  1.22it/s]

Found 2 entities across 1 categories
Processing text with length: 695


Processing files:  27%|██▋       | 33/122 [00:23<01:11,  1.24it/s]

Found 5 entities across 2 categories
Processing text with length: 427


Processing files:  28%|██▊       | 34/122 [00:23<01:03,  1.38it/s]

Found 3 entities across 1 categories
Processing text with length: 427


Processing files:  29%|██▊       | 35/122 [00:24<00:59,  1.45it/s]

Found 3 entities across 1 categories
Processing text with length: 570


Processing files:  30%|██▉       | 36/122 [00:24<00:53,  1.60it/s]

Found 2 entities across 2 categories
Processing text with length: 570


Processing files:  30%|███       | 37/122 [00:25<00:50,  1.67it/s]

Found 2 entities across 2 categories
Processing text with length: 542


Processing files:  31%|███       | 38/122 [00:25<00:47,  1.77it/s]

Found 12 entities across 3 categories
Processing text with length: 542


Processing files:  32%|███▏      | 39/122 [00:26<00:50,  1.63it/s]

Found 12 entities across 3 categories
Processing text with length: 607


Processing files:  33%|███▎      | 40/122 [00:27<00:49,  1.67it/s]

Found 3 entities across 1 categories
Processing text with length: 560


Processing files:  34%|███▎      | 41/122 [00:27<00:50,  1.60it/s]

Found 9 entities across 2 categories
Processing text with length: 561


Processing files:  34%|███▍      | 42/122 [00:28<00:50,  1.57it/s]

Found 15 entities across 3 categories
Processing text with length: 1247


Processing files:  35%|███▌      | 43/122 [00:29<00:55,  1.43it/s]

Found 7 entities across 3 categories
Processing text with length: 563


Processing files:  36%|███▌      | 44/122 [00:29<00:56,  1.39it/s]

Found 13 entities across 3 categories
Processing text with length: 548


Processing files:  37%|███▋      | 45/122 [00:30<00:52,  1.45it/s]

Found 10 entities across 3 categories
Processing text with length: 557


Processing files:  38%|███▊      | 46/122 [00:31<00:47,  1.59it/s]

Found 3 entities across 1 categories
Processing text with length: 586


Processing files:  39%|███▊      | 47/122 [00:31<00:48,  1.55it/s]

Found 15 entities across 2 categories
Processing text with length: 656


Processing files:  39%|███▉      | 48/122 [00:32<00:49,  1.51it/s]

Found 23 entities across 4 categories
Processing text with length: 438


Processing files:  40%|████      | 49/122 [00:33<00:47,  1.53it/s]

Found 3 entities across 2 categories
Processing text with length: 452


Processing files:  41%|████      | 50/122 [00:33<00:47,  1.52it/s]

Found 2 entities across 2 categories
Processing text with length: 630


Processing files:  42%|████▏     | 51/122 [00:34<00:46,  1.54it/s]

Found 11 entities across 1 categories
Processing text with length: 543


Processing files:  43%|████▎     | 52/122 [00:35<00:47,  1.48it/s]

Found 6 entities across 2 categories
Processing text with length: 602


Processing files:  43%|████▎     | 53/122 [00:35<00:46,  1.50it/s]

Found 14 entities across 3 categories
Processing text with length: 598


Processing files:  44%|████▍     | 54/122 [00:36<00:44,  1.54it/s]

Found 13 entities across 3 categories
Processing text with length: 584


Processing files:  45%|████▌     | 55/122 [00:37<00:45,  1.47it/s]

Found 3 entities across 2 categories
Processing text with length: 192


Processing files:  46%|████▌     | 56/122 [00:37<00:40,  1.65it/s]

Found 2 entities across 1 categories
Processing text with length: 723


Processing files:  47%|████▋     | 57/122 [00:38<00:41,  1.55it/s]

Found 7 entities across 2 categories
Processing text with length: 582


Processing files:  48%|████▊     | 58/122 [00:38<00:41,  1.55it/s]

Found 6 entities across 1 categories
Processing text with length: 610


Processing files:  48%|████▊     | 59/122 [00:39<00:41,  1.51it/s]

Found 21 entities across 2 categories
Processing text with length: 575


Processing files:  49%|████▉     | 60/122 [00:40<00:42,  1.47it/s]

Found 9 entities across 2 categories
Processing text with length: 575


Processing files:  50%|█████     | 61/122 [00:41<00:42,  1.44it/s]

Found 9 entities across 2 categories
Processing text with length: 559


Processing files:  51%|█████     | 62/122 [00:41<00:42,  1.40it/s]

Found 2 entities across 2 categories
Processing text with length: 559


Processing files:  52%|█████▏    | 63/122 [00:42<00:44,  1.31it/s]

Found 2 entities across 2 categories
Processing text with length: 564


Processing files:  52%|█████▏    | 64/122 [00:43<00:41,  1.41it/s]

Found 13 entities across 2 categories
Processing text with length: 564


Processing files:  53%|█████▎    | 65/122 [00:44<00:40,  1.42it/s]

Found 13 entities across 2 categories
Processing text with length: 481


Processing files:  54%|█████▍    | 66/122 [00:44<00:36,  1.52it/s]

Found 5 entities across 3 categories
Processing text with length: 481


Processing files:  55%|█████▍    | 67/122 [00:45<00:38,  1.42it/s]

Found 5 entities across 3 categories
Processing text with length: 556


Processing files:  56%|█████▌    | 68/122 [00:45<00:35,  1.54it/s]

Found 11 entities across 3 categories
Processing text with length: 556


Processing files:  57%|█████▋    | 69/122 [00:46<00:33,  1.57it/s]

Found 11 entities across 3 categories
Processing text with length: 569


Processing files:  57%|█████▋    | 70/122 [00:47<00:31,  1.65it/s]

Found 9 entities across 3 categories
Processing text with length: 514


Processing files:  58%|█████▊    | 71/122 [00:47<00:31,  1.64it/s]

Found 11 entities across 2 categories
Processing text with length: 559


Processing files:  59%|█████▉    | 72/122 [00:48<00:33,  1.49it/s]

Found 12 entities across 2 categories
Processing text with length: 359


Processing files:  60%|█████▉    | 73/122 [00:48<00:29,  1.65it/s]

Found 11 entities across 1 categories
Processing text with length: 705


Processing files:  61%|██████    | 74/122 [00:49<00:29,  1.64it/s]

Found 18 entities across 2 categories
Processing text with length: 843


Processing files:  61%|██████▏   | 75/122 [00:50<00:31,  1.50it/s]

Found 13 entities across 2 categories
Processing text with length: 404


Processing files:  62%|██████▏   | 76/122 [00:50<00:29,  1.57it/s]

Found 4 entities across 2 categories
Processing text with length: 528


Processing files:  63%|██████▎   | 77/122 [00:51<00:29,  1.53it/s]

Found 9 entities across 1 categories
Processing text with length: 489


Processing files:  64%|██████▍   | 78/122 [00:52<00:28,  1.54it/s]

Found 7 entities across 2 categories
Processing text with length: 529


Processing files:  65%|██████▍   | 79/122 [00:52<00:27,  1.56it/s]

Found 4 entities across 2 categories
Processing text with length: 685


Processing files:  66%|██████▌   | 80/122 [00:53<00:27,  1.52it/s]

Found 8 entities across 3 categories
Processing text with length: 608


Processing files:  66%|██████▋   | 81/122 [00:54<00:27,  1.51it/s]

Found 12 entities across 1 categories
Processing text with length: 545


Processing files:  67%|██████▋   | 82/122 [00:54<00:26,  1.50it/s]

Found 10 entities across 2 categories
Processing text with length: 415


Processing files:  68%|██████▊   | 83/122 [00:55<00:24,  1.58it/s]

Found 6 entities across 3 categories
Processing text with length: 488


Processing files:  69%|██████▉   | 84/122 [00:56<00:24,  1.58it/s]

Found 12 entities across 2 categories
Processing text with length: 676


Processing files:  70%|██████▉   | 85/122 [00:56<00:23,  1.61it/s]

Found 8 entities across 2 categories
Processing text with length: 863


Processing files:  70%|███████   | 86/122 [00:57<00:24,  1.47it/s]

Found 18 entities across 4 categories
Processing text with length: 565


Processing files:  71%|███████▏  | 87/122 [00:58<00:22,  1.54it/s]

Found 7 entities across 1 categories
Processing text with length: 507


Processing files:  72%|███████▏  | 88/122 [00:58<00:21,  1.55it/s]

Found 5 entities across 2 categories
Processing text with length: 580


Processing files:  73%|███████▎  | 89/122 [00:59<00:19,  1.66it/s]

Found 9 entities across 2 categories
Processing text with length: 505


Processing files:  74%|███████▍  | 90/122 [00:59<00:18,  1.71it/s]

Found 1 entities across 1 categories
Processing text with length: 740


Processing files:  75%|███████▍  | 91/122 [01:00<00:19,  1.62it/s]

Found 5 entities across 1 categories
Processing text with length: 613


Processing files:  75%|███████▌  | 92/122 [01:01<00:18,  1.62it/s]

Found 4 entities across 1 categories
Processing text with length: 795


Processing files:  76%|███████▌  | 93/122 [01:01<00:18,  1.58it/s]

Found 4 entities across 2 categories
Processing text with length: 589


Processing files:  77%|███████▋  | 94/122 [01:02<00:17,  1.60it/s]

Found 6 entities across 1 categories
Processing text with length: 529


Processing files:  78%|███████▊  | 95/122 [01:03<00:18,  1.49it/s]

Found 5 entities across 3 categories
Processing text with length: 903


Processing files:  79%|███████▊  | 96/122 [01:03<00:18,  1.42it/s]

Found 8 entities across 2 categories
Processing text with length: 902


Processing files:  80%|███████▉  | 97/122 [01:04<00:17,  1.41it/s]

Found 1 entities across 1 categories
Processing text with length: 510


Processing files:  80%|████████  | 98/122 [01:05<00:16,  1.49it/s]

Found 10 entities across 3 categories
Processing text with length: 541


Processing files:  81%|████████  | 99/122 [01:05<00:14,  1.58it/s]

Found 17 entities across 2 categories
Processing text with length: 717


Processing files:  82%|████████▏ | 100/122 [01:06<00:13,  1.57it/s]

Found 5 entities across 2 categories
Processing text with length: 717


Processing files:  83%|████████▎ | 101/122 [01:07<00:14,  1.42it/s]

Found 5 entities across 2 categories
Processing text with length: 510


Processing files:  84%|████████▎ | 102/122 [01:07<00:12,  1.55it/s]

Found 10 entities across 1 categories
Processing text with length: 510


Processing files:  84%|████████▍ | 103/122 [01:08<00:11,  1.63it/s]

Found 10 entities across 1 categories
Processing text with length: 526


Processing files:  85%|████████▌ | 104/122 [01:08<00:10,  1.78it/s]

Found 8 entities across 1 categories
Processing text with length: 526


Processing files:  86%|████████▌ | 105/122 [01:09<00:09,  1.86it/s]

Found 8 entities across 1 categories
Processing text with length: 549


Processing files:  87%|████████▋ | 106/122 [01:09<00:08,  1.81it/s]

Found 4 entities across 3 categories
Processing text with length: 549


Processing files:  88%|████████▊ | 107/122 [01:10<00:08,  1.77it/s]

Found 4 entities across 3 categories
Processing text with length: 919


Processing files:  89%|████████▊ | 108/122 [01:11<00:08,  1.65it/s]

Found 12 entities across 2 categories
Processing text with length: 1233


Processing files:  89%|████████▉ | 109/122 [01:12<00:08,  1.45it/s]

Found 10 entities across 2 categories
Processing text with length: 654


Processing files:  90%|█████████ | 110/122 [01:12<00:08,  1.44it/s]

Found 14 entities across 3 categories
Processing text with length: 654


Processing files:  91%|█████████ | 111/122 [01:13<00:07,  1.46it/s]

Found 14 entities across 3 categories
Processing text with length: 654


Processing files:  92%|█████████▏| 112/122 [01:14<00:06,  1.50it/s]

Found 14 entities across 3 categories
Processing text with length: 654


Processing files:  93%|█████████▎| 113/122 [01:14<00:05,  1.55it/s]

Found 14 entities across 3 categories
Processing text with length: 654


Processing files:  93%|█████████▎| 114/122 [01:15<00:04,  1.62it/s]

Found 14 entities across 3 categories
Processing text with length: 624


Processing files:  94%|█████████▍| 115/122 [01:15<00:04,  1.51it/s]

Found 10 entities across 1 categories
Processing text with length: 624


Processing files:  95%|█████████▌| 116/122 [01:16<00:03,  1.52it/s]

Found 10 entities across 1 categories
Processing text with length: 624


Processing files:  96%|█████████▌| 117/122 [01:17<00:03,  1.51it/s]

Found 10 entities across 1 categories
Processing text with length: 624


Processing files:  97%|█████████▋| 118/122 [01:17<00:02,  1.57it/s]

Found 10 entities across 1 categories
Processing text with length: 610


Processing files:  98%|█████████▊| 119/122 [01:18<00:01,  1.51it/s]

Found 12 entities across 2 categories
Processing text with length: 610


Processing files:  98%|█████████▊| 120/122 [01:19<00:01,  1.45it/s]

Found 12 entities across 2 categories
Processing text with length: 610


Processing files:  99%|█████████▉| 121/122 [01:19<00:00,  1.47it/s]

Found 12 entities across 2 categories
Processing text with length: 610


Processing files: 100%|██████████| 122/122 [01:20<00:00,  1.51it/s]

Found 12 entities across 2 categories
Saving NER analysis results for 122 documents...





Results saved to D:/Fraud Detection/outputs/ner_results
Main results file: D:/Fraud Detection/outputs/ner_results\ner_analysis_results.csv

===== NER Processing Complete =====
Processed 122 documents
Found 968 total entities (avg: 7.9 per document)
Document with most entities: X51005230617 (24 entities)
Results saved to: D:/Fraud Detection/outputs/ner_results
NER Processing completed!


In [None]:
import os
import pandas as pd
import numpy as np
import re
import json
import spacy
import networkx as nx
from tqdm import tqdm
import matplotlib.pyplot as plt
import warnings
import logging
from pathlib import Path
from collections import defaultdict, Counter

# Suppress unnecessary warnings and logs
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
logging.getLogger("matplotlib").setLevel(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Define paths - ensure these are correct for your environment
BASE_PATH = r"D:/Fraud Detection"
NER_RESULTS_PATH = os.path.join(BASE_PATH, "outputs/ner_results")
RELATIONSHIP_OUTPUT_PATH = os.path.join(BASE_PATH, "outputs/relationship_results")
VISUALIZATION_PATH = os.path.join(BASE_PATH, "outputs/visualizations")

# Ensure directories exist
os.makedirs(RELATIONSHIP_OUTPUT_PATH, exist_ok=True)
os.makedirs(VISUALIZATION_PATH, exist_ok=True)

# Load spaCy model for linguistic analysis
print("Loading NLP models...")
try:
    nlp = spacy.load("en_core_web_sm")
    print("SpaCy model loaded successfully")
except OSError:
    # spaCy raises OSError when the model package is not installed. Catching
    # only that keeps Ctrl-C and unrelated failures from silently triggering
    # a download.
    print("Downloading SpaCy model (one-time setup)...")
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
    print("SpaCy model loaded successfully")

class RelationshipExtractor:
    """Derive relationships between NER entities and chart the result.

    On construction the NER results are loaded from disk. `process_documents`
    then extracts relationships per document (regex rules plus entity
    co-occurrence within a sentence), accumulates them in a directed graph,
    and writes CSV, JSON and PNG outputs.
    """

    def __init__(self):
        """Set up the pattern tables, load the NER results and create the graph."""
        print("Initializing Relationship Extractor...")
        
        # Define relationship patterns
        # NOTE: this table is not read by any method of this class.
        self.relationship_patterns = {
            "OWNERSHIP": [
                {"POS": ["NOUN", "PROPN"], "DEP": ["nsubj", "compound"]},
                {"LEMMA": ["own", "possess", "have", "hold", "acquire"]},
                {"POS": ["NOUN", "PROPN"], "DEP": ["dobj", "attr"]}
            ],
            "EMPLOYMENT": [
                {"POS": ["NOUN", "PROPN"], "DEP": ["nsubj", "compound"]},
                {"LEMMA": ["work", "employ", "hire", "contract"]},
                {"LEMMA": ["for", "with", "at", "by"]},
                {"POS": ["NOUN", "PROPN"], "DEP": ["pobj"]}
            ],
            "TRANSACTION": [
                {"POS": ["NOUN", "PROPN"], "DEP": ["nsubj", "compound"]},
                {"LEMMA": ["pay", "transfer", "send", "receive", "deposit", "withdraw"]},
                {"LEMMA": ["to", "from"]},
                {"POS": ["NOUN", "PROPN"], "DEP": ["pobj"]}
            ],
            "LOCATION": [
                {"POS": ["NOUN", "PROPN"], "DEP": ["nsubj", "compound"]},
                {"LEMMA": ["locate", "situate", "base", "headquarter", "live"]},
                {"LEMMA": ["in", "at", "near"]},
                {"POS": ["NOUN", "PROPN"], "DEP": ["pobj"]}
            ]
        }
        
        # Dictionary to store high-confidence relationships (these are rule-based)
        self.high_confidence_patterns = {
            r'(.*) is (employed|hired|working) (at|by|with) (.*)': 'EMPLOYMENT',
            r'(.*) works for (.*)': 'EMPLOYMENT',
            r'(.*) owns (.*)': 'OWNERSHIP', 
            r'(.*) is owned by (.*)': 'OWNERSHIP',
            r'(.*) paid (.*) to (.*)': 'TRANSACTION',
            r'(.*) transferred (.*) to (.*)': 'TRANSACTION',
            r'(.*) is located (in|at) (.*)': 'LOCATION',
            r'(.*) is based (in|at) (.*)': 'LOCATION'
        }
        
        # Entity type mappings for relationships
        self.entity_relation_mapping = {
            ('PER', 'ORG'): ['EMPLOYMENT', 'OWNERSHIP'],
            ('ORG', 'ORG'): ['OWNERSHIP', 'TRANSACTION'],
            ('PER', 'PER'): ['TRANSACTION'],
            ('ORG', 'LOC'): ['LOCATION'],
            ('PER', 'LOC'): ['LOCATION'],
            ('ORG', 'MONEY'): ['TRANSACTION'],
            ('PER', 'MONEY'): ['TRANSACTION']
        }
        
        # Load NER results
        self.ner_results_file = os.path.join(NER_RESULTS_PATH, "ner_analysis_results.csv")
        self.documents = self.load_ner_results()
        print(f"Loaded data for {len(self.documents)} documents")
        
        # Prepare relationship graph
        self.global_graph = nx.DiGraph()
        
        print("Relationship Extractor initialized successfully!")
    
    def load_ner_results(self):
        """Load NER results from the main CSV and the per-document entity files.

        Entity files are looked up as ``<document_id>_entities.csv`` in the
        directory that holds ``self.ner_results_file``. Documents without an
        entity file are skipped. Rows with a missing entity type or value are
        dropped, and every value is converted to ``str`` so later text
        processing never sees NaN or numeric cells.

        Returns:
            A list of dicts, one per document, with the summary columns of the
            main CSV plus ``'entities'`` mapping entity type -> list of values.
            An empty list is returned when the main CSV does not exist.
        """
        if not os.path.exists(self.ner_results_file):
            print(f"ERROR: NER results file not found at {self.ner_results_file}")
            print("Please run the NER processor first.")
            return []
        
        # Load main results
        main_df = pd.read_csv(self.ner_results_file)
        print(f"Found {len(main_df)} documents with NER analysis")
        
        # Entity files sit next to the main results file.
        results_dir = os.path.dirname(self.ner_results_file)
        
        documents = []
        for _, row in main_df.iterrows():
            doc_id = row['document_id']
            entity_file = os.path.join(results_dir, f"{doc_id}_entities.csv")
            if not os.path.exists(entity_file):
                continue
            
            # Load detailed entity information, ignoring incomplete rows
            entity_df = pd.read_csv(entity_file).dropna(subset=['entity_type', 'value'])
            
            # Group entities by type, keeping first-appearance order
            entities = {}
            for entity_type, value in zip(entity_df['entity_type'], entity_df['value']):
                entities.setdefault(entity_type, []).append(str(value))
            
            documents.append({
                'document_id': doc_id,
                'total_entities': row['total_entities'],
                'entity_types': row['entity_types'],
                'entity_complexity': row['entity_complexity'],
                'risk_entity_count': row['risk_entity_count'],
                'entities': entities
            })
        
        return documents
    
    def extract_relationships_from_text(self, text, entities_by_type):
        """Extract relationships from text using regex rules and co-occurrence.

        Args:
            text: The text to analyse.
            entities_by_type: Mapping of entity type -> iterable of entity
                values. Missing (None/NaN) and blank values are ignored;
                other values are converted to ``str``.

        Returns:
            A list of dicts with keys ``'type'``, ``'source'``, ``'target'``
            and ``'confidence'`` (``'HIGH'`` for regex matches, ``'MEDIUM'``
            for co-occurrence), de-duplicated case-insensitively on
            (type, source, target).
        """
        relationships = []
        
        # Process text with spaCy (used for sentence segmentation below)
        doc = nlp(text)
        
        # Check for rule-based high confidence patterns first
        for pattern, rel_type in self.high_confidence_patterns.items():
            for match in re.findall(pattern, text, re.IGNORECASE):
                if isinstance(match, tuple):
                    # Multi-group match: first group is the source, last the target
                    relationships.append({
                        'type': rel_type,
                        'source': match[0].strip(),
                        'target': match[-1].strip(),
                        'confidence': 'HIGH'
                    })
                elif isinstance(match, str):
                    # Single group match: first word is the source, the rest the target
                    parts = match.split(' ')
                    if len(parts) >= 2:
                        relationships.append({
                            'type': rel_type,
                            'source': parts[0].strip(),
                            'target': ' '.join(parts[1:]).strip(),
                            'confidence': 'HIGH'
                        })
        
        # Map lower-cased entity text -> entity type
        flat_entities = {}
        for ent_type, ents in entities_by_type.items():
            for ent in ents:
                # NaN is the only value that is unequal to itself
                if ent is None or ent != ent:
                    continue
                name = str(ent)
                if not name.strip():
                    # A blank entity would be a substring of every sentence
                    continue
                flat_entities[name.lower()] = ent_type
        
        # Find entity co-occurrences within sentences
        for sent in doc.sents:
            sent_text = sent.text.lower()
            found_entities = [
                (entity, entity_type)
                for entity, entity_type in flat_entities.items()
                if entity in sent_text
            ]
            
            # At least 2 entities in a sentence might be related
            for idx, (ent1, type1) in enumerate(found_entities):
                for ent2, type2 in found_entities[idx + 1:]:
                    # NOTE: the lookup is order-sensitive; a pair is only
                    # checked in the order the entities were collected.
                    possible_rels = self.entity_relation_mapping.get((type1, type2))
                    if not possible_rels:
                        continue
                    
                    # For now, use the first possible relationship type
                    relationships.append({
                        'type': possible_rels[0],
                        'source': ent1,
                        'target': ent2,
                        'confidence': 'MEDIUM'
                    })
        
        # Filter out duplicate relationships, keeping the first occurrence
        unique_relationships = []
        seen = set()
        for rel in relationships:
            key = (rel['type'], rel['source'].lower(), rel['target'].lower())
            if key not in seen:
                seen.add(key)
                unique_relationships.append(rel)
        
        return unique_relationships
    
    def process_documents(self):
        """Extract relationships for every loaded document and persist them.

        Each document's text is reconstructed from its entities as a series
        of ``"<value> is a <type>. "`` sentences. The results are added to
        ``self.global_graph``, saved, and visualised.

        Returns:
            A dict with ``'total_relationships'``, ``'by_type'`` (count per
            relationship type) and ``'document_count'``.
        """
        print(f"\nProcessing {len(self.documents)} documents for relationship extraction...")
        
        all_relationships = []
        document_relationships = {}
        
        for doc in tqdm(self.documents, desc="Extracting relationships"):
            doc_id = doc['document_id']
            entities = doc['entities']
            
            # We need text to extract relationships - reconstruct a simplified version
            text = "".join(
                f"{value} is a {entity_type}. "
                for entity_type, values in entities.items()
                for value in values
            )
            
            relationships = self.extract_relationships_from_text(text, entities)
            
            document_relationships[doc_id] = relationships
            all_relationships.extend(relationships)
            
            # add_edge creates missing nodes; the relationship type is an edge attribute
            for rel in relationships:
                self.global_graph.add_edge(rel['source'], rel['target'], type=rel['type'])
        
        # Analyze and save relationship data
        self.save_relationships(document_relationships, all_relationships)
        self.visualize_relationship_network()
        
        # Return relationship stats
        rel_types = Counter(rel['type'] for rel in all_relationships)
        return {
            'total_relationships': len(all_relationships),
            'by_type': dict(rel_types),
            'document_count': len(document_relationships)
        }
    
    def save_relationships(self, document_relationships, all_relationships):
        """Write relationships to CSV files and the graph to a JSON file.

        Args:
            document_relationships: Mapping of document id -> list of
                relationship dicts; one CSV is written per non-empty list.
            all_relationships: Flat list of all relationship dicts, written
                to ``all_relationships.csv``.
        """
        print(f"Saving relationship results to {RELATIONSHIP_OUTPUT_PATH}...")
        
        # Explicit columns keep a header row even when nothing was extracted,
        # so the file stays readable with pd.read_csv.
        columns = ['type', 'source', 'target', 'confidence']
        relationships_df = pd.DataFrame(all_relationships, columns=columns)
        all_relationships_file = os.path.join(RELATIONSHIP_OUTPUT_PATH, "all_relationships.csv")
        relationships_df.to_csv(all_relationships_file, index=False)
        
        # Save relationships by document
        for doc_id, relationships in document_relationships.items():
            if relationships:  # Only save if there are relationships
                doc_rel_df = pd.DataFrame(relationships)
                doc_file = os.path.join(RELATIONSHIP_OUTPUT_PATH, f"{doc_id}_relationships.csv")
                doc_rel_df.to_csv(doc_file, index=False)
        
        # Save network data as JSON for visualization
        network_data = {
            'nodes': list(self.global_graph.nodes()),
            'edges': [{'source': u, 'target': v, 'type': d['type']} 
                      for u, v, d in self.global_graph.edges(data=True)]
        }
        
        network_file = os.path.join(RELATIONSHIP_OUTPUT_PATH, "relationship_network.json")
        with open(network_file, 'w') as f:
            json.dump(network_data, f)
    
    def visualize_relationship_network(self):
        """Render the relationship graph to ``relationship_network.png``.

        Does nothing except print a notice when the graph has no nodes. Also
        triggers the statistics charts via `create_stats_visualizations`.
        """
        print("\nGenerating network visualizations...")
        
        if len(self.global_graph) == 0:
            print("No relationships found for visualization.")
            return
        
        # Sort the types so each one gets the same colour on every run
        rel_types = sorted({d['type'] for _, _, d in self.global_graph.edges(data=True)})
        colors = plt.cm.tab10.colors
        color_map = {rel_type: colors[i % len(colors)] for i, rel_type in enumerate(rel_types)}
        
        # Edge colours in the graph's own edge order
        edge_colors = [color_map[rel_type]
                       for _, _, rel_type in self.global_graph.edges(data='type')]
        
        # Create main visualization
        plt.figure(figsize=(14, 10))
        pos = nx.spring_layout(self.global_graph, seed=42)
        
        # Draw network
        nx.draw_networkx_nodes(self.global_graph, pos, alpha=0.8, node_size=500)
        nx.draw_networkx_labels(self.global_graph, pos, font_size=10)
        nx.draw_networkx_edges(self.global_graph, pos, width=2, alpha=0.7, edge_color=edge_colors)
        
        # Add legend (one proxy line per relationship type)
        for rel_type, color in color_map.items():
            plt.plot([0], [0], color=color, label=rel_type, linewidth=3)
        
        plt.legend(title="Relationship Types", loc="upper right")
        plt.title("Entity Relationship Network", size=15)
        plt.axis('off')
        
        # Save visualization
        network_viz_file = os.path.join(VISUALIZATION_PATH, "relationship_network.png")
        plt.savefig(network_viz_file, dpi=300, bbox_inches='tight')
        plt.close()
        
        # Create additional stats visualizations
        self.create_stats_visualizations()
        
        print(f"Network visualization saved to {network_viz_file}")
    
    def create_stats_visualizations(self):
        """Save bar charts of relationship-type counts and the top-10 node degrees."""
        # Count relationship types
        rel_types = Counter(d['type'] for _, _, d in self.global_graph.edges(data=True))
        
        # Relationship types distribution
        plt.figure(figsize=(10, 6))
        plt.bar(list(rel_types.keys()), list(rel_types.values()),
                color=plt.cm.tab10.colors[:len(rel_types)])
        plt.title("Distribution of Relationship Types", size=15)
        plt.xlabel("Relationship Type")
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(VISUALIZATION_PATH, "relationship_types.png"), dpi=300)
        plt.close()
        
        # Node connectivity (degree)
        node_degrees = dict(self.global_graph.degree())
        top_nodes = sorted(node_degrees.items(), key=lambda x: x[1], reverse=True)[:10]
        
        plt.figure(figsize=(10, 6))
        plt.bar([n[0] for n in top_nodes], [n[1] for n in top_nodes], 
                color=plt.cm.viridis(np.linspace(0, 1, len(top_nodes))))
        plt.title("Top 10 Most Connected Entities", size=15)
        plt.xlabel("Entity")
        plt.ylabel("Number of Connections")
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(VISUALIZATION_PATH, "top_connected_entities.png"), dpi=300)
        plt.close()

def main():
    """Run the relationship extraction end to end and print a summary."""
    print("Starting Relationship Extraction System...")

    # Build the extractor, then process every loaded document
    summary = RelationshipExtractor().process_documents()

    # Report what was found
    print("\n===== Relationship Extraction Complete =====")
    print(f"Processed {summary['document_count']} documents")
    print(f"Found {summary['total_relationships']} relationships")
    print("\nRelationship types distribution:")
    counts_by_type = summary['by_type']
    for name in counts_by_type:
        print(f"  - {name}: {counts_by_type[name]}")

    # Tell the user where the artefacts were written
    print(f"\nResults saved to: {RELATIONSHIP_OUTPUT_PATH}")
    print(f"Visualizations saved to: {VISUALIZATION_PATH}")

    print("\nRelationship Extraction completed!")


if __name__ == "__main__":
    main()

Loading NLP models...
Downloading SpaCy model (one-time setup)...
SpaCy model loaded successfully
Starting Relationship Extraction System...
Initializing Relationship Extractor...
Found 122 documents with NER analysis
Loaded data for 122 documents
Relationship Extractor initialized successfully!

Processing 122 documents for relationship extraction...


Extracting relationships: 100%|██████████| 122/122 [00:03<00:00, 34.05it/s]


Saving relationship results to D:/Fraud Detection\outputs/relationship_results...

Generating network visualizations...
Network visualization saved to D:/Fraud Detection\outputs/visualizations\relationship_network.png

===== Relationship Extraction Complete =====
Processed 122 documents
Found 80 relationships

Relationship types distribution:
  - LOCATION: 12
  - OWNERSHIP: 68

Results saved to: D:/Fraud Detection\outputs/relationship_results
Visualizations saved to: D:/Fraud Detection\outputs/visualizations

Relationship Extraction completed!


In [2]:
import streamlit as st
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px
import os
import json
from datetime import datetime
import numpy as np
from pathlib import Path

# Streamlit page setup: title, icon and a wide layout
st.set_page_config(
    layout="wide",
    page_icon="🔍",
    page_title="Fraud Detection Dashboard",
)

# Locations of the pipeline outputs the dashboard reads
BASE_PATH = r"D:/Fraud Detection"  # Direct path for notebook compatibility
NER_RESULTS_PATH = os.path.join(BASE_PATH, "outputs", "ner_results")
RELATIONSHIP_OUTPUT_PATH = os.path.join(BASE_PATH, "outputs", "relationship_results")

# Create the output directories when they are missing
for _output_dir in (RELATIONSHIP_OUTPUT_PATH, NER_RESULTS_PATH):
    os.makedirs(_output_dir, exist_ok=True)

class FraudDetectionDashboard:
    """Load relationship and NER CSV outputs and derive dashboard views.

    Attributes set by `load_data`:
        relationships: DataFrame of all per-document relationship rows.
        ner_results: DataFrame of all per-document entity rows.
        G: Directed graph with one edge per (source, target) pair.
    """

    # Aggregate file that sits alongside the per-document relationship files.
    # Its name also ends in "_relationships.csv", so it must be skipped or
    # every relationship would be loaded twice.
    _AGGREGATE_RELATIONSHIPS_FILE = "all_relationships.csv"

    # Risk contribution per relationship type; unknown types count as 1.0
    _RISK_WEIGHTS = {
        'TRANSACTION': 2.0,
        'OWNERSHIP': 1.5,
        'EMPLOYMENT': 1.0,
        'LOCATION': 0.5
    }

    def __init__(self):
        """Load all data immediately so the instance is ready for rendering."""
        self.load_data()
        
    def load_data(self):
        """Load all necessary data for the dashboard"""
        # Load relationship data
        self.relationships = self._load_relationship_data()
        
        # Load NER results
        self.ner_results = self._load_ner_results()
        
        # Create network graph
        self.G = self._create_network_graph()

    @staticmethod
    def _read_csv_files(directory, suffix, exclude=()):
        """Concatenate every CSV in *directory* whose name ends with *suffix*.

        Files named in *exclude* and files without any content are skipped.
        Returns an empty DataFrame when nothing could be read.
        """
        frames = []
        if os.path.isdir(directory):
            # Sorted so the row order does not depend on the OS listing order
            for file_name in sorted(os.listdir(directory)):
                if not file_name.endswith(suffix) or file_name in exclude:
                    continue
                try:
                    frames.append(pd.read_csv(os.path.join(directory, file_name)))
                except pd.errors.EmptyDataError:
                    # A zero-byte file carries no rows; ignore it
                    continue
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        
    def _load_relationship_data(self, directory=None):
        """Load the per-document relationship CSV files.

        Args:
            directory: Folder to read from; defaults to
                ``RELATIONSHIP_OUTPUT_PATH``.

        Returns:
            One DataFrame with the rows of every ``*_relationships.csv`` file
            except the aggregate ``all_relationships.csv``.
        """
        if directory is None:
            directory = RELATIONSHIP_OUTPUT_PATH
        return self._read_csv_files(
            directory,
            '_relationships.csv',
            exclude=(self._AGGREGATE_RELATIONSHIPS_FILE,),
        )
    
    def _load_ner_results(self, directory=None):
        """Load the per-document ``*_entities.csv`` files.

        Args:
            directory: Folder to read from; defaults to ``NER_RESULTS_PATH``.
        """
        if directory is None:
            directory = NER_RESULTS_PATH
        return self._read_csv_files(directory, '_entities.csv')
    
    def _create_network_graph(self):
        """Create a directed graph with one edge per relationship row."""
        G = nx.DiGraph()
        
        if not self.relationships.empty:
            for _, row in self.relationships.iterrows():
                G.add_edge(
                    row['source'],
                    row['target'],
                    relationship_type=row['type'],
                    confidence=row['confidence']
                )
        
        return G
    
    def plot_network_graph(self):
        """Create an interactive plotly figure of the relationship graph.

        Returns:
            The figure, or ``None`` (after showing a Streamlit warning) when
            the graph has no edges.
        """
        if self.G.number_of_edges() == 0:
            st.warning("No relationships found to visualize.")
            return None
        
        # Fixed seed keeps the layout stable across Streamlit reruns
        pos = nx.spring_layout(self.G, seed=42)
        
        # Each edge contributes three points (start, end, gap), so the hover
        # text needs three entries per edge to stay aligned with them.
        edge_x = []
        edge_y = []
        edge_text = []
        
        for source, target, attrs in self.G.edges(data=True):
            x0, y0 = pos[source]
            x1, y1 = pos[target]
            label = f"{attrs['relationship_type']} ({attrs['confidence']})"
            edge_x.extend([x0, x1, None])
            edge_y.extend([y0, y1, None])
            edge_text.extend([label, label, ""])
        
        edge_trace = go.Scatter(
            x=edge_x, y=edge_y,
            line=dict(width=0.5, color='#888'),
            hoverinfo='text',
            mode='lines',
            text=edge_text
        )
        
        # Create node trace
        node_x = []
        node_y = []
        node_text = []
        node_connections = []
        
        for node in self.G.nodes():
            x, y = pos[node]
            node_x.append(x)
            node_y.append(y)
            node_text.append(node)
            # degree counts incoming and outgoing edges; DiGraph.neighbors
            # would only see the outgoing ones
            node_connections.append(self.G.degree(node))
        
        node_trace = go.Scatter(
            x=node_x, y=node_y,
            mode='markers+text',
            hoverinfo='text',
            text=node_text,
            textposition="top center",
            marker=dict(
                showscale=True,
                colorscale='YlGnBu',
                size=10,
                color=node_connections,
                colorbar=dict(
                    thickness=15,
                    title='Node Connections',
                    xanchor='left'
                )
            )
        )
        
        # Create figure with updated layout
        fig = go.Figure(data=[edge_trace, node_trace])
        fig.update_layout(
            title='Relationship Network',
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20,l=5,r=5,t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
        )
        
        return fig
    
    def get_relationship_stats(self):
        """Summarise the loaded relationships.

        Returns:
            A two-column DataFrame (``Metric``, ``Value``), or an empty
            DataFrame when no relationships are loaded.
        """
        if self.relationships.empty:
            return pd.DataFrame()
        
        total = len(self.relationships)
        # Every distinct value appearing as a source or a target
        entities = (set(self.relationships['source'].unique())
                    | set(self.relationships['target'].unique()))
        
        stats = {
            'Total Relationships': total,
            'Unique Entities': len(entities),
            'Relationship Types': self.relationships['type'].nunique(),
            'High Confidence Relationships': int((self.relationships['confidence'] == 'HIGH').sum()),
            'Average Relationships per Entity': total / len(entities) if entities else 0
        }
        
        return pd.DataFrame(list(stats.items()), columns=['Metric', 'Value'])
    
    def get_entity_risk_scores(self):
        """Score entities by the relationships they take part in.

        Each relationship adds its type weight (x1.5 when the confidence is
        ``'HIGH'``) to both its source and its target.

        Returns:
            A DataFrame with columns ``Entity`` and ``Risk Score`` sorted by
            descending score, or an empty DataFrame when no relationships
            are loaded.
        """
        if self.relationships.empty:
            return pd.DataFrame()
        
        risk_scores = {}
        for _, row in self.relationships.iterrows():
            base_risk = self._RISK_WEIGHTS.get(row['type'], 1.0)
            confidence_multiplier = 1.5 if row['confidence'] == 'HIGH' else 1.0
            contribution = base_risk * confidence_multiplier
            
            for entity in (row['source'], row['target']):
                risk_scores[entity] = risk_scores.get(entity, 0) + contribution
        
        # Convert to DataFrame
        risk_df = pd.DataFrame(list(risk_scores.items()), columns=['Entity', 'Risk Score'])
        risk_df = risk_df.sort_values('Risk Score', ascending=False)
        
        return risk_df

def main():
    """Render the fraud detection dashboard page.

    Creates a ``FraudDetectionDashboard``, lets the user pick one of three
    views in the sidebar and draws the selected view.
    """
    st.title("🔍 Fraud Detection Dashboard")

    dashboard = FraudDetectionDashboard()

    # Sidebar: view selection
    st.sidebar.title("Dashboard Controls")
    available_views = ["Network Graph", "Risk Analysis", "Entity Analysis"]
    view_option = st.sidebar.selectbox("Select View", available_views)

    if view_option == "Network Graph":
        st.header("Relationship Network Visualization")
        network_fig = dashboard.plot_network_graph()
        if network_fig is not None:
            st.plotly_chart(network_fig, use_container_width=True)

        # Summary table is shown whether or not a graph could be drawn.
        st.subheader("Relationship Statistics")
        st.dataframe(dashboard.get_relationship_stats(), use_container_width=True)
        return

    if view_option == "Risk Analysis":
        st.header("Entity Risk Analysis")
        risk_df = dashboard.get_entity_risk_scores()
        if risk_df.empty:
            st.warning("No risk analysis data available.")
            return

        # Bar chart of the ten highest-scoring entities
        top_entities = risk_df.head(10)
        risk_fig = px.bar(
            top_entities,
            x='Entity',
            y='Risk Score',
            title='Top 10 High-Risk Entities',
        )
        st.plotly_chart(risk_fig, use_container_width=True)

        # Full table below the chart
        st.subheader("Detailed Risk Scores")
        st.dataframe(risk_df, use_container_width=True)
        return

    # Remaining option: Entity Analysis
    st.header("Entity Analysis")
    if dashboard.ner_results.empty:
        st.warning("No entity data available for analysis.")
        return

    # Pie chart of how often each entity type occurs
    entity_types = dashboard.ner_results['entity_type'].value_counts()
    type_fig = px.pie(
        values=entity_types.values,
        names=entity_types.index,
        title='Entity Type Distribution',
    )
    st.plotly_chart(type_fig, use_container_width=True)

    st.subheader("Entity Details")
    st.dataframe(dashboard.ner_results, use_container_width=True)

# Render the dashboard only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()



In [None]:
import os
import subprocess
import sys

def run_dashboard():
    """Launch ``dashboard.py`` with Streamlit in a child process.

    The dashboard script is looked up next to this file, or in the current
    working directory when ``__file__`` is not defined (e.g. in a notebook).
    The call blocks until the Streamlit process exits. Problems are reported
    with ``print`` rather than raised.

    Returns:
        None
    """
    try:
        # Get the directory of the current script, fallback to cwd if __file__ is not defined
        current_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        current_dir = os.getcwd()

    # Path to the dashboard script
    dashboard_path = os.path.join(current_dir, "dashboard.py")

    # Fail early with a specific message: otherwise a missing script only
    # shows up as a non-zero Streamlit exit code and the misleading
    # "install dependencies" hint below.
    if not os.path.isfile(dashboard_path):
        print(f"Dashboard script not found: {dashboard_path}")
        return

    # Ensure we're in the correct directory
    os.chdir(current_dir)

    # Run the dashboard using streamlit
    try:
        # Invoke streamlit through the current interpreter so the same
        # environment (and its installed packages) is used.
        streamlit_cmd = [sys.executable, "-m", "streamlit", "run", dashboard_path]
        print(f"Running command: {' '.join(streamlit_cmd)}")
        subprocess.run(streamlit_cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running dashboard: {e}")
        print("Make sure you have installed all required dependencies:")
        print("pip install -r requirements.txt")
    except Exception as e:
        # Top-level boundary: report anything unexpected instead of crashing.
        print(f"Unexpected error: {e}")

# Launch the dashboard only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    run_dashboard()


Running command: c:\Users\Tanmay\AppData\Local\Programs\Python\Python311\python.exe -m streamlit run d:\Fraud Detection\Scripts\dashboard.py


In [1]:
import streamlit as st
import cv2
import numpy as np
from PIL import Image
import easyocr
import torch
from transformers import LayoutLMForSequenceClassification, LayoutLMTokenizer
import pandas as pd
import plotly.express as px
import tempfile
import os

# Initialize EasyOCR reader
# English-only reader, created once at module level and used by process_image().
reader = easyocr.Reader(['en'])

# Initialize LayoutLM model and tokenizer
# Tokenizer and model are loaded from the same checkpoint name and are used
# by analyze_layout().
# NOTE(review): the recorded output of this cell reports that
# 'classifier.weight' / 'classifier.bias' are newly initialized for this
# checkpoint, i.e. the classification head is untrained — its predictions
# should not be relied on until the model is fine-tuned.
model_name = "microsoft/layoutlm-base-uncased"
tokenizer = LayoutLMTokenizer.from_pretrained(model_name)
model = LayoutLMForSequenceClassification.from_pretrained(model_name)

def process_image(image):
    """Run OCR on an image and keep only the confident detections.

    Args:
        image: Anything ``np.array`` can convert (the caller passes a PIL image).

    Returns:
        tuple: ``(extracted_text, boxes, img_array)`` where the first two are
        parallel lists holding the text and bounding box of every detection
        whose confidence is above 0.5, and ``img_array`` is the converted image.
    """
    img_array = np.array(image)

    # Each OCR detection is a (bounding box, text, confidence) triple.
    detections = reader.readtext(img_array)

    # Drop low-confidence detections once, then split into parallel lists.
    confident = [(bbox, text) for bbox, text, prob in detections if prob > 0.5]
    extracted_text = [text for _, text in confident]
    boxes = [bbox for bbox, _ in confident]

    return extracted_text, boxes, img_array

def analyze_layout(text, boxes, image):
    """Classify the OCR text with the LayoutLM sequence classifier.

    Args:
        text: List of OCR text strings.
        boxes: Bounding boxes of the text; accepted for interface
            compatibility but not used by this function.
        image: The source image; accepted but not used by this function.

    Returns:
        The argmax over the last axis of the model logits, or ``None`` when
        ``text`` is empty (nothing to classify).
        NOTE(review): because a list of strings is passed to the tokenizer,
        this presumably yields one class index per string rather than one
        per document — confirm against the tokenizer's batching rules.
    """
    # OCR can legitimately find no text; an empty batch cannot be encoded,
    # so report "no analysis" instead of failing inside the tokenizer/model.
    if not text:
        return None

    # Prepare input for LayoutLM
    encoding = tokenizer(
        text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512
    )

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**encoding)
        predictions = outputs.logits.argmax(-1)

    return predictions

def detect_fraud(text, layout_analysis):
    """Perform fraud detection based on extracted text and layout analysis.

    This is a simplified example; only two of the three indicators are
    ever set here ('suspicious_patterns' always stays False).

    Args:
        text: Iterable of OCR text strings.
        layout_analysis: Tensor of layout predictions, or ``None`` to skip
            the layout check.

    Returns:
        dict: indicator name -> bool with the keys 'amount_mismatch',
        'suspicious_patterns' and 'layout_anomalies'.
    """
    # Local import: this cell's import block does not bring in `re`.
    import re

    fraud_indicators = {
        'amount_mismatch': False,
        'suspicious_patterns': False,
        'layout_anomalies': False
    }

    # Collect dollar amounts from every string that mentions '$'.
    dollar_amount = re.compile(r'\$\s*(\d[\d,]*(?:\.\d+)?)')
    amounts = []
    for t in text:
        if '$' not in t:
            continue
        try:
            # Fast path: the whole string is a number once '$' is removed.
            amounts.append(float(t.replace('$', '')))
        except ValueError:
            # OCR strings such as "Total: $1,234.56" are not bare numbers;
            # pull out the amounts instead of crashing, and skip strings
            # that contain none.
            amounts.extend(
                float(found.replace(',', '')) for found in dollar_amount.findall(t)
            )

    # Flag documents whose dollar amounts are spread over more than 100.
    if len(amounts) > 1 and max(amounts) - min(amounts) > 100:
        fraud_indicators['amount_mismatch'] = True

    # Any non-zero layout prediction counts as an anomaly.
    if layout_analysis is not None and torch.any(layout_analysis != 0):
        fraud_indicators['layout_anomalies'] = True

    return fraud_indicators

def main():
    """Streamlit page: upload a document image, analyse it and show the results."""
    st.title("Document Fraud Detection System")
    st.write("Upload a document image to detect potential fraud")

    uploaded_file = st.file_uploader("Choose an image file", type=['png', 'jpg', 'jpeg'])
    if uploaded_file is None:
        # Nothing uploaded yet: only the title and uploader are shown.
        return

    # Echo the uploaded document back to the user
    image = Image.open(uploaded_file)
    st.image(image, caption='Uploaded Document', use_column_width=True)

    with st.spinner('Processing document...'):
        # OCR -> layout classification -> fraud heuristics
        extracted_text, boxes, img_array = process_image(image)
        layout_analysis = analyze_layout(extracted_text, boxes, img_array)
        fraud_indicators = detect_fraud(extracted_text, layout_analysis)

        st.subheader("Extracted Text")
        st.write(extracted_text)

        st.subheader("Fraud Detection Results")

        # One row per indicator for the bar chart
        indicator_names = list(fraud_indicators.keys())
        indicator_flags = list(fraud_indicators.values())
        fraud_df = pd.DataFrame({
            'Indicator': indicator_names,
            'Detected': indicator_flags
        })

        fig = px.bar(
            fraud_df,
            x='Indicator',
            y='Detected',
            title='Fraud Detection Indicators',
            color='Detected',
            color_discrete_map={True: 'red', False: 'green'},
        )
        st.plotly_chart(fig)

        # Plain-text summary, one line per indicator
        st.subheader("Detailed Analysis")
        for indicator, detected in fraud_indicators.items():
            if detected:
                status = "⚠️ Detected"
            else:
                status = "✅ Not Detected"
            label = indicator.replace('_', ' ').title()
            st.write(f"{label}: {status}")

# Run the app only when this module is executed directly, not when imported.
# NOTE(review): the recorded output of this cell ends with Streamlit printing
# a `streamlit run ...` command hint, which suggests main() was executed
# inside the notebook kernel (where this guard is also true) rather than
# through `streamlit run` — confirm how this cell is meant to be launched.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Some weights of LayoutLMForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-16 02:25:03.185 
  command:

    streamlit run C:\Users\Tanmay\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [6]:
import streamlit as st
import cv2
import numpy as np
from PIL import Image
import easyocr
import torch
from transformers import LayoutLMForSequenceClassification, LayoutLMTokenizer
import pandas as pd
import plotly.express as px
import tempfile
import os
import re
from datetime import datetime

# Initialize EasyOCR reader
# English-only reader, created once at module level and used by process_image().
reader = easyocr.Reader(['en'])

# Initialize LayoutLM model and tokenizer
# Tokenizer and model are loaded from the same checkpoint name and are used
# by analyze_layout().
# NOTE(review): the recorded output of this cell reports that
# 'classifier.weight' / 'classifier.bias' are newly initialized for this
# checkpoint, i.e. the classification head is untrained — its predictions
# should not be relied on until the model is fine-tuned.
model_name = "microsoft/layoutlm-base-uncased"
tokenizer = LayoutLMTokenizer.from_pretrained(model_name)
model = LayoutLMForSequenceClassification.from_pretrained(model_name)

def process_image(image):
    """Run OCR on an image and keep only the confident detections.

    Args:
        image: Anything ``np.array`` can convert (the caller passes a PIL image).

    Returns:
        tuple: ``(extracted_text, boxes, img_array)`` where the first two are
        parallel lists holding the text and bounding box of every detection
        whose confidence is above 0.5, and ``img_array`` is the converted image.
    """
    img_array = np.array(image)

    # Each OCR detection is a (bounding box, text, confidence) triple.
    detections = reader.readtext(img_array)

    # Drop low-confidence detections once, then split into parallel lists.
    confident = [(bbox, text) for bbox, text, prob in detections if prob > 0.5]
    extracted_text = [text for _, text in confident]
    boxes = [bbox for bbox, _ in confident]

    return extracted_text, boxes, img_array

def analyze_layout(text, boxes, image):
    """Classify the OCR text with the LayoutLM sequence classifier.

    Args:
        text: List of OCR text strings.
        boxes: Bounding boxes of the text; accepted for interface
            compatibility but not used by this function.
        image: The source image; accepted but not used by this function.

    Returns:
        The argmax over the last axis of the model logits, or ``None`` when
        ``text`` is empty (nothing to classify).
        NOTE(review): because a list of strings is passed to the tokenizer,
        this presumably yields one class index per string rather than one
        per document — confirm against the tokenizer's batching rules.
    """
    # OCR can legitimately find no text; an empty batch cannot be encoded,
    # so report "no analysis" instead of failing inside the tokenizer/model.
    if not text:
        return None

    # Prepare input for LayoutLM
    encoding = tokenizer(
        text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512
    )

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**encoding)
        predictions = outputs.logits.argmax(-1)

    return predictions

# Optional '$', then either a comma-grouped number (1,234,567) or a plain run
# of digits, then an optional decimal part. The plain-digits alternative keeps
# numbers such as "1234.56" in one piece instead of splitting them after
# three digits.
_AMOUNT_PATTERN = re.compile(r'\$?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?')


def extract_amounts(text):
    """Extract monetary amounts from text.

    Matches patterns like ``$100``, ``100.00``, ``$1,234.56`` and ``1234.56``.

    Args:
        text: Iterable of strings to scan.

    Returns:
        list[float]: every amount found, in order of appearance.
    """
    amounts = []
    for t in text:
        for match in _AMOUNT_PATTERN.findall(t):
            # After dropping '$' and ',' a match is always digits with an
            # optional fractional part, so float() cannot fail here.
            amounts.append(float(match.replace('$', '').replace(',', '')))
    return amounts

def extract_dates(text):
    """Collect date-like substrings from a sequence of strings.

    Three numeric layouts are recognised, differing only in the separator
    ('/', '-' or '.'). Results are grouped per input string and, within a
    string, ordered by pattern (slash, dash, dot) rather than by position.

    Args:
        text: Iterable of strings to scan.

    Returns:
        list[str]: the matched substrings, unparsed.
    """
    date_patterns = (
        r'\d{1,2}/\d{1,2}/\d{2,4}',   # MM/DD/YYYY
        r'\d{1,2}-\d{1,2}-\d{2,4}',   # MM-DD-YYYY
        r'\d{1,2}\.\d{1,2}\.\d{2,4}'  # MM.DD.YYYY
    )
    return [
        found
        for fragment in text
        for pattern in date_patterns
        for found in re.findall(pattern, fragment)
    ]

def detect_fraud(text, layout_analysis):
    """Perform heuristic fraud detection on OCR text and layout predictions.

    Args:
        text: List of OCR text strings.
        layout_analysis: Tensor of layout predictions (any shape), or
            ``None`` to skip the layout check.

    Returns:
        dict: indicator name -> bool for 'amount_mismatch',
        'suspicious_patterns', 'layout_anomalies', 'date_inconsistency',
        'amount_format_anomaly', 'missing_crucial_info' and
        'duplicate_amounts'.
    """
    fraud_indicators = {
        'amount_mismatch': False,
        'suspicious_patterns': False,
        'layout_anomalies': False,
        'date_inconsistency': False,
        'amount_format_anomaly': False,
        'missing_crucial_info': False,
        'duplicate_amounts': False
    }

    # Several checks look at the document as one string; build it once.
    full_text = ' '.join(text)

    # --- Amount Analysis ---
    amounts = extract_amounts(text)
    if amounts:
        # Heuristic: the largest amount is the total, all other amounts
        # are line items that should add up to it.
        total_amount = max(amounts)
        line_items = [amt for amt in amounts if amt != total_amount]
        line_sum = sum(line_items)

        # Allow for a 5% mismatch (due to rounding, taxes, etc.).
        # A zero total is skipped to avoid dividing by zero.
        if (line_items and total_amount != 0
                and abs(line_sum - total_amount) / total_amount > 0.05):
            fraud_indicators['amount_mismatch'] = True

        if len(set(amounts)) != len(amounts):
            fraud_indicators['duplicate_amounts'] = True

        if any(amt < 0 for amt in amounts):
            fraud_indicators['amount_format_anomaly'] = True

    # --- Date Analysis ---
    date_formats = ['%m/%d/%Y', '%m-%d-%Y', '%m.%d.%Y', '%m/%d/%y', '%m-%d-%y', '%m.%d.%y']
    parsed_dates = []
    for d in extract_dates(text):
        for fmt in date_formats:
            try:
                parsed_dates.append(datetime.strptime(d, fmt))
            except ValueError:
                continue
            # First format that parses wins.
            break

    # Dates are expected to appear in non-decreasing order.
    if any(earlier > later for earlier, later in zip(parsed_dates, parsed_dates[1:])):
        fraud_indicators['date_inconsistency'] = True

    # --- Layout Anomalies ---
    # The prediction tensor may hold more than one element (one per text
    # string), so `.item()` would raise; flag if any prediction is non-zero.
    if layout_analysis is not None and bool(
            torch.any(torch.as_tensor(layout_analysis) != 0)):
        fraud_indicators['layout_anomalies'] = True

    # --- Missing Information ---
    crucial_keywords = ['total', 'date', 'invoice', 'bill', 'amount']
    text_lower = full_text.lower()
    missing = [word for word in crucial_keywords if word not in text_lower]
    if len(missing) >= 3:
        fraud_indicators['missing_crucial_info'] = True

    # --- Suspicious Patterns ---
    suspicious_patterns = [
        r'\d{16}',               # credit card
        r'\d{3}-\d{2}-\d{4}',    # SSN
        r'\bvoid\b',             # Voided
        r'\bcopy\b',             # Copy
        r'\bduplicate\b'         # Duplicate
    ]
    if any(re.search(pat, full_text, re.IGNORECASE) for pat in suspicious_patterns):
        fraud_indicators['suspicious_patterns'] = True

    return fraud_indicators


def main():
    """Streamlit page: upload a document image, analyse it and show the results."""
    st.title("Document Fraud Detection System")
    st.write("Upload a document image to detect potential fraud")

    uploaded_file = st.file_uploader("Choose an image file", type=['png', 'jpg', 'jpeg'])
    if uploaded_file is None:
        # Nothing uploaded yet: only the title and uploader are shown.
        return

    # Echo the uploaded document back to the user
    image = Image.open(uploaded_file)
    st.image(image, caption='Uploaded Document', use_column_width=True)

    with st.spinner('Processing document...'):
        # OCR -> layout classification -> fraud heuristics
        extracted_text, boxes, img_array = process_image(image)
        layout_analysis = analyze_layout(extracted_text, boxes, img_array)
        fraud_indicators = detect_fraud(extracted_text, layout_analysis)

        st.subheader("Extracted Text")
        st.write(extracted_text)

        st.subheader("Fraud Detection Results")

        # One row per indicator for the bar chart
        indicator_names = list(fraud_indicators.keys())
        indicator_flags = list(fraud_indicators.values())
        fraud_df = pd.DataFrame({
            'Indicator': indicator_names,
            'Detected': indicator_flags
        })

        fig = px.bar(
            fraud_df,
            x='Indicator',
            y='Detected',
            title='Fraud Detection Indicators',
            color='Detected',
            color_discrete_map={True: 'red', False: 'green'},
        )
        st.plotly_chart(fig)

        # Plain-text summary, one line per indicator
        st.subheader("Detailed Analysis")
        for indicator, detected in fraud_indicators.items():
            if detected:
                status = "⚠️ Detected"
            else:
                status = "✅ Not Detected"
            label = indicator.replace('_', ' ').title()
            st.write(f"{label}: {status}")


# Run the app only when this module is executed directly, not when imported.
if __name__ == "__main__":
    main()

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Some weights of LayoutLMForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
