In [4]:
import os
import PyPDF2
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reference ("train") invoices: the directory and the PDF file names in it.
train_folder = "./train"
train_invoices = [
    "2024.03.15_0954.pdf",
    "2024.03.15_1145.pdf",
    "Faller_8.pdf",
    "invoice_77073.pdf",
    "invoice_102856.pdf",
]

# Filled in below: maps a training file name to its extracted text/features.
database = dict()

# Query ("test") invoices that get matched against the reference set.
test_folder = "./test"
test_invoices = [
    "invoice_77098.pdf",
    "invoice_102857.pdf",
]


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        A single string made of each page's extracted text, with no
        separator inserted between pages.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # The join runs inside the ``with`` so pages are read while the file
        # is still open. ``or ""`` is a defensive guard in case a page yields
        # no text (None) -- presumably possible for image-only pages; verify
        # against the installed PyPDF2 version.
        return "".join(page.extract_text() or "" for page in reader.pages)


def extract_features(text):
    """Pull the labelled invoice fields out of *text* with regular expressions.

    Args:
        text: Raw invoice text to search.

    Returns:
        A dict with the keys ``'invoice_number'``, ``'date'`` and
        ``'amount'``. Each value is the list of every captured match for
        that field (an empty list when the label does not occur).
    """
    field_patterns = (
        ('invoice_number', r'Invoice Number: (\d+)'),
        ('date', r'Date: (\d{2}/\d{2}/\d{4})'),
        ('amount', r'Amount: (\d+\.\d{2})'),
    )
    return {name: re.findall(pattern, text) for name, pattern in field_patterns}


def calculate_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two documents, so
    the score depends only on this pair.

    Args:
        text1: First document.
        text2: Second document.

    Returns:
        The cosine similarity of the two TF-IDF vectors as a float, or
        ``0.0`` when the vectorizer cannot build a vocabulary from the pair
        (for example, both texts are empty).
    """
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # fit_transform raises ValueError when no token survives tokenization
        # (empty vocabulary). Treat such a pair as having nothing in common
        # instead of aborting the whole matching run.
        return 0.0
    # cosine_similarity accepts the sparse matrix directly, so there is no
    # need to densify it first. Entry [0, 1] is text1 compared with text2.
    return float(cosine_similarity(tfidf_matrix)[0, 1])





# Build the reference index: for each training invoice, store its raw text
# and the regex-extracted fields under the invoice's file name.
for invoice in train_invoices:
    pdf_path = os.path.join(train_folder, invoice)
    text = extract_text_from_pdf(pdf_path)
    features = extract_features(text)
    database.update({invoice: {'text': text, 'features': features}})


# For each test invoice, report the training invoice whose text is most
# similar to it. A match is only reported when its score is strictly
# greater than 0; otherwise the match stays None and the score stays 0.
for test_invoice in test_invoices:
    pdf_path = os.path.join(test_folder, test_invoice)
    test_text = extract_text_from_pdf(pdf_path)
    test_features = extract_features(test_text)

    # Score against every indexed invoice, in insertion order.
    scores = {
        train_invoice: calculate_similarity(test_text, data['text'])
        for train_invoice, data in database.items()
    }

    best_match, highest_similarity = None, 0
    if scores:
        # max() keeps the first entry among equal scores, the same winner a
        # strict ">" comparison in a sequential scan would pick.
        top_invoice = max(scores, key=scores.get)
        if scores[top_invoice] > highest_similarity:
            best_match = top_invoice
            highest_similarity = scores[top_invoice]

    print(f"Test Invoice: {test_invoice}")
    print(f"Most Similar Invoice: {best_match}")
    print(f"Similarity Score: {highest_similarity}\n")




Test Invoice: invoice_77098.pdf
Most Similar Invoice: invoice_77073.pdf
Similarity Score: 0.7791324628540709

Test Invoice: invoice_102857.pdf
Most Similar Invoice: invoice_102856.pdf
Similarity Score: 0.7283994123857916

