In [2]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, with a newline after each page.

    Args:
        pdf_path: Whatever ``pdfplumber.open`` accepts (here: a path string).

    Returns:
        str: The page texts in page order, each followed by a newline.
        A page that yields no text contributes only its newline; a PDF
        with no pages yields an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can return None instead of "" for a page without
        # extractable text (e.g. a scanned image); coerce it so the
        # concatenation below cannot raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Single join instead of repeated `+=` on a growing string.
    return "".join(page_text + "\n" for page_text in page_texts)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_features(documents):
    """Fit a TF-IDF vectorizer on ``documents`` and transform them.

    The vectorizer is created with ``stop_words='english'``.

    Args:
        documents: Iterable of text documents to fit on and transform.

    Returns:
        tuple: ``(matrix, vectorizer)`` where ``matrix`` is the result of
        ``fit_transform(documents)`` and ``vectorizer`` is the fitted
        ``TfidfVectorizer`` instance that produced it.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    return tfidf.fit_transform(documents), tfidf


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(query_vector, database_vectors):
    """Cosine similarity between the query vector(s) and the database vectors.

    Thin wrapper: the arguments are passed straight to scikit-learn's
    ``cosine_similarity`` and its result is returned unchanged.

    Args:
        query_vector: First argument to ``cosine_similarity``.
        database_vectors: Second argument to ``cosine_similarity``.

    Returns:
        Whatever ``cosine_similarity(query_vector, database_vectors)`` returns.
    """
    return cosine_similarity(query_vector, database_vectors)


In [None]:
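# Template (left commented out): replace the placeholder paths with real
# invoice PDFs and uncomment to set `query_pdf` / `database_pdfs`.
# query_pdf = '/absolute/path/to/query_invoice.pdf'
# database_pdfs = [
#     '/absolute/path/to/invoice1.pdf',
#     '/absolute/path/to/invoice2.pdf',
#     '/absolute/path/to/invoice3.pdf'
# ]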


In [11]:
def find_most_similar_invoice(query_pdf_path, database_pdfs):
    """Find the database PDF whose text is most similar to the query PDF.

    Text is extracted from every PDF, all texts (query first) are vectorized
    together via ``extract_features``, and the query row is compared against
    the database rows via ``calculate_similarity``.

    Args:
        query_pdf_path: Path of the PDF to look up.
        database_pdfs: Iterable of paths of the PDFs to compare against.
            Must contain at least one entry.

    Returns:
        tuple: ``(index, score)`` where ``index`` is the position in
        ``database_pdfs`` of the best match (the first one on ties) and
        ``score`` is its similarity to the query, as built-in ``int`` and
        ``float``.

    Raises:
        ValueError: If ``database_pdfs`` is empty.
    """
    # Materialize once so generators are accepted and emptiness can be checked
    # before any PDF is parsed.
    database_pdfs = list(database_pdfs)
    if not database_pdfs:
        raise ValueError(
            "database_pdfs must contain at least one PDF path to compare against"
        )

    query_text = extract_text_from_pdf(query_pdf_path)
    database_texts = [extract_text_from_pdf(pdf) for pdf in database_pdfs]

    # Vectorize query and database together so every row shares the same
    # feature space; the fitted vectorizer itself is not needed afterwards.
    X, _ = extract_features([query_text] + database_texts)

    # Row 0 is the query (kept 2-D via the slice); the rest are the database.
    query_vector = X[0:1]
    database_vectors = X[1:]

    similarities = calculate_similarity(query_vector, database_vectors)

    # One query row -> the scores against each database entry are in row 0.
    scores = similarities[0]
    most_similar_index = int(scores.argmax())
    most_similar_score = float(scores[most_similar_index])

    return most_similar_index, most_similar_score

# Example usage: compare one query invoice against a one-element "database".
# Both paths are relative, so the files are looked up relative to the
# kernel's current working directory.
query_pdf = 'query_invoice.pdf'
database_pdfs = ['invoice1.pdf']

# With a single database entry the returned index can only be 0; the score is
# the informative part of the result here.
index, score = find_most_similar_invoice(query_pdf, database_pdfs)
print(f"Most similar invoice index: {index}, Similarity score: {score}")


Most similar invoice index: 0, Similarity score: 0.029014676924131705
