In [7]:




import os
import PyPDF2
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import Binarizer
import numpy as np

# Load the small English spaCy pipeline once at module level so that
# extract_features can reuse it for every document (it reads `doc.ents`).
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Extraction is best-effort: any error while opening or parsing the file is
    reported on stdout and whatever text was collected before the failure
    (possibly the empty string) is returned instead of raising.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The page texts joined back-to-back with no separator; pages for which
        the reader yields no text are skipped.
    """
    page_texts = []
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # Iterate the pages directly instead of indexing by position.
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
    except Exception as e:
        # Deliberately broad: a single unreadable invoice must not abort the
        # whole run. The caller treats an empty result as "no content".
        print(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)

def extract_features(text):
    """Group the named entities spaCy finds in *text* into invoice fields.

    The text is run through the module-level ``nlp`` pipeline and each entity
    is filed by its label: ``DATE`` -> ``'date'``, ``MONEY`` -> ``'amount'``,
    ``CARDINAL`` -> ``'invoice_number'``. Entities with any other label are
    ignored.

    Returns:
        A dict with the keys ``'invoice_number'``, ``'date'`` and ``'amount'``
        (in that order), each mapping to the list of matching entity strings
        in document order.
    """
    # spaCy entity label -> name of the feature list that collects it.
    bucket_for_label = {
        'DATE': 'date',
        'MONEY': 'amount',
        'CARDINAL': 'invoice_number',
    }
    features = {'invoice_number': [], 'date': [], 'amount': []}

    for entity in nlp(text).ents:
        bucket = bucket_for_label.get(entity.label_)
        if bucket is not None:
            features[bucket].append(entity.text)

    return features

def flatten_features(features):
    """Collapse a feature dict into one space-separated string.

    The value lists are visited in the dict's iteration order and their items
    are joined with single spaces; an empty dict (or only empty lists) yields
    the empty string.
    """
    tokens = []
    for values in features.values():
        tokens.extend(values)
    return " ".join(tokens)

def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    Both texts are vectorized together with English stop words removed.

    Args:
        text1: First document.
        text2: Second document.

    Returns:
        The similarity as a float. ``0.0`` is returned when either text is
        empty/whitespace-only, or when no usable token survives vectorization
        in either text.
    """
    if not text1.strip() or not text2.strip():
        return 0.0

    try:
        tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform([text1, text2])
    except ValueError:
        # The vectorizer raises on an empty vocabulary, e.g. when both texts
        # hold only stop words or single-character tokens such as "1 5".
        # Nothing comparable remains, so report no similarity.
        return 0.0

    # Row slices keep the 2-D shape cosine_similarity expects and avoid
    # converting the sparse matrix to a dense array.
    return float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

def calculate_jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of the token sets of two texts.

    Each text is reduced to a 0/1 presence vector over the joint vocabulary
    and the score is |shared tokens| / |all tokens|.

    Args:
        text1: First document.
        text2: Second document.

    Returns:
        The similarity as a float. ``0.0`` is returned when either text is
        empty/whitespace-only, or when neither text contains a token the
        vectorizer accepts.
    """
    if not text1.strip() or not text2.strip():
        return 0.0

    try:
        # binary=True already produces 0/1 presence indicators, so no extra
        # binarization step is needed.
        presence = CountVectorizer(binary=True).fit_transform([text1, text2]).toarray()
    except ValueError:
        # Raised on an empty vocabulary (e.g. only single-character tokens);
        # a column-count check after the fact would never be reached.
        return 0.0

    return float(jaccard_score(presence[0], presence[1]))

def calculate_combined_similarity(text1, text2, weight_cosine=0.7, weight_jaccard=0.3):
    """Blend the cosine and Jaccard similarity of two texts into one score.

    Returns ``weight_cosine * cosine + weight_jaccard * jaccard``. The weights
    are applied as given; they are not normalized or validated here.
    """
    weighted_cosine = weight_cosine * calculate_cosine_similarity(text1, text2)
    weighted_jaccard = weight_jaccard * calculate_jaccard_similarity(text1, text2)
    return weighted_cosine + weighted_jaccard

def evaluate_similarity(database, test_features, weight_cosine, weight_jaccard):
    """Find the database entry whose features best match *test_features*.

    Args:
        database: Mapping of invoice name -> dict holding a ``'features'``
            string.
        test_features: Flattened feature string of the invoice being matched.
        weight_cosine: Weight given to the cosine similarity.
        weight_jaccard: Weight given to the Jaccard similarity.

    Returns:
        ``(name, score)`` of the best entry. Only scores strictly above zero
        count and ties keep the earlier entry, so ``(None, 0)`` is returned
        for an empty database or when nothing scores above zero.
    """
    winner, top_score = None, 0

    for name in database:
        score = calculate_combined_similarity(
            test_features,
            database[name]['features'],
            weight_cosine,
            weight_jaccard,
        )
        if score > top_score:
            winner, top_score = name, score

    return winner, top_score

# Paths to training and test folders
train_folder = "./train"
test_folder = "./test"

# List of training and test invoices
train_invoices = ["2024.03.15_0954.pdf", "2024.03.15_1145.pdf", "Faller_8.pdf", "invoice_77073.pdf", "invoice_102856.pdf"]
test_invoices = ["invoice_77098.pdf", "invoice_102857.pdf"]

# Reference set: training invoice file name -> its raw text and flattened features
database = {}

# Extract and store training invoices
for invoice in train_invoices:
    pdf_path = os.path.join(train_folder, invoice)
    text = extract_text_from_pdf(pdf_path)

    # Invoices that yield no text (unreadable or empty PDF) are left out.
    if not text.strip():
        continue

    features = extract_features(text)
    flattened_features = flatten_features(features)
    if not flattened_features.strip():
        print(f"Warning: Flattened features for {invoice} are empty.")
    database[invoice] = {'text': text, 'features': flattened_features}

# Candidate cosine weights 0.1, 0.2, ..., 0.9; the Jaccard weight is derived
# from each one so that every pair sums to 1.
weights = np.arange(0.1, 1.0, 0.1)

# (weight_cosine, weight_jaccard) -> best similarity, for the invoice being processed
results = {}

# Compare test invoices to training invoices
for test_invoice in test_invoices:
    pdf_path = os.path.join(test_folder, test_invoice)
    test_text = extract_text_from_pdf(pdf_path)

    if not test_text.strip():
        print(f"Test Invoice: {test_invoice} has empty or invalid content.")
        continue

    test_features = extract_features(test_text)
    test_flattened_features = flatten_features(test_features)

    if not test_flattened_features.strip():
        print(f"Warning: Flattened features for {test_invoice} are empty.")

    # Start from a clean slate so scores recorded for a previous test invoice
    # cannot be picked as this invoice's best weights.
    results = {}
    matches = {}

    # Evaluate combinations of weights
    for weight in weights:
        # np.arange accumulates float error (e.g. 0.30000000000000004), so an
        # exact `a + b == 1` test would silently drop valid pairs. Round to
        # the grid and derive the complementary weight instead.
        weight_cosine = round(float(weight), 1)
        weight_jaccard = round(1.0 - weight_cosine, 1)

        match, score = evaluate_similarity(
            database,
            test_flattened_features,
            weight_cosine,
            weight_jaccard,
        )
        results[(weight_cosine, weight_jaccard)] = score
        matches[(weight_cosine, weight_jaccard)] = match

    # Find the best weight combination for the current test invoice and report
    # the match and score that belong to that combination (not the last one
    # evaluated).
    # NOTE(review): the combined score is linear in the weights, so maximizing
    # it simply favors whichever metric is larger; confirm this is the
    # intended selection criterion.
    best_weights = max(results, key=results.get)
    best_match = matches[best_weights]
    highest_similarity = results[best_weights]

    print(f"Test Invoice: {test_invoice}")
    print(f"Best Weights: Cosine: {best_weights[0]}, Jaccard: {best_weights[1]}")
    print(f"Most Similar Invoice: {best_match}")
    print(f"Similarity Score: {highest_similarity}\n")


Test Invoice: invoice_77098.pdf
Best Weights: Cosine: 0.9, Jaccard: 0.1
Most Similar Invoice: invoice_77073.pdf
Similarity Score: 0.671950599397238

Test Invoice: invoice_102857.pdf
Best Weights: Cosine: 0.9, Jaccard: 0.1
Most Similar Invoice: invoice_102856.pdf
Similarity Score: 0.4593476126495024

