In [5]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text


In [6]:
import re

def extract_features(text):
    features = {}
    
    # Extract invoice number
    invoice_number = re.search(r'Invoice Number:\s*(\S+)', text)
    features['invoice_number'] = invoice_number.group(1) if invoice_number else ""
    
    # Extract dates
    date = re.search(r'Date:\s*([\d/]+)', text)
    features['date'] = date.group(1) if date else ""
    
    # Extract amounts
    amount = re.search(r'Total Amount:\s*([\d,.]+)', text)
    features['amount'] = amount.group(1) if amount else ""
    
    # Extract keywords (you can use more sophisticated methods like TF-IDF)
    keywords = re.findall(r'\b\w+\b', text)
    features['keywords'] = set(keywords)
    
    return features


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2])
    similarity = cosine_similarity(vectors[0:1], vectors[1:2])
    return similarity[0][0]


In [8]:
def calculate_jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0


In [10]:
from pdf2image import convert_from_path

def pdf_to_images(pdf_path):
    return convert_from_path(pdf_path)


In [20]:
invoice_database = {
    'invoice1.pdf': extract_features(extract_text_from_pdf('invoice1.pdf')),
    'invoice2.pdf': extract_features(extract_text_from_pdf('invoice2.pdf')),
    # Add more invoices as needed
}

def find_most_similar_invoice(input_invoice_path):
    input_text = extract_text_from_pdf(input_invoice_path)
    input_features = extract_features(input_text)
    
    best_similarity = 0
    most_similar_invoice = None
    
    for invoice_path, features in invoice_database.items():
        database_text = extract_text_from_pdf(invoice_path)
        
        # Calculate similarity
        similarity = calculate_cosine_similarity(input_text, database_text)
        
        if similarity > best_similarity:
            best_similarity = similarity
            most_similar_invoice = invoice_path
    
    return most_similar_invoice, best_similarity


In [21]:
input_invoice_path = 'invoice1.pdf'
most_similar_invoice, similarity_score = find_most_similar_invoice(input_invoice_path)

print(f'Most similar invoice: {most_similar_invoice}')
print(f'Similarity score: {similarity_score}')


Most similar invoice: invoice1.pdf
Similarity score: 1.0000000000000004


In [31]:
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    print("Extracted Text:\n", text)  # Print the text to debug
    return text


In [32]:
def extract_invoice_number(text):
    """Extracts the invoice number from the text."""
    match = re.search(r'Invoice\s*Number[:\s]*([\w\-]+)', text, re.IGNORECASE)
    return match.group(1) if match else None


In [33]:
def extract_date(text):
    """Extracts the date from the text."""
    match = re.search(r'\b(?:Date|Issue Date|Invoice Date):?\s*([\d/]{8,10}|\d{4}-\d{2}-\d{2})', text, re.IGNORECASE)
    return match.group(1) if match else None


In [34]:
def extract_amount(text):
    """Extracts the total amount from the text."""
    match = re.search(r'Total\s*Amount[:\s]*[\$]?([\d,\.]+)', text, re.IGNORECASE)
    return match.group(1) if match else None


In [36]:
sample_text = """
Invoice Number: INV-12345
Date: 07/30/2024
Total Amount: $1,234.56
"""

print("Sample Text:\n", sample_text)

invoice_number = extract_invoice_number(sample_text)
date = extract_date(sample_text)
amount = extract_amount(sample_text)

print(f'Invoice Number: {invoice_number}')
print(f'Date: {date}')
print(f'Amount: {amount}')


Sample Text:
 
Invoice Number: INV-12345
Date: 07/30/2024
Total Amount: $1,234.56

Invoice Number: INV-12345
Date: 07/30/2024
Amount: 1,234.56


In [37]:
import pdfplumber
import re

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    print("Extracted Text:\n", text)  # Print the text to debug
    return text

def extract_invoice_number(text):
    """Extracts the invoice number from the text."""
    match = re.search(r'Invoice\s*Number[:\s]*([\w\-]+)', text, re.IGNORECASE)
    return match.group(1) if match else None

def extract_date(text):
    """Extracts the date from the text."""
    match = re.search(r'\b(?:Date|Issue Date|Invoice Date):?\s*([\d/]{8,10}|\d{4}-\d{2}-\d{2})', text, re.IGNORECASE)
    return match.group(1) if match else None

def extract_amount(text):
    """Extracts the total amount from the text."""
    match = re.search(r'Total\s*Amount[:\s]*[\$]?([\d,\.]+)', text, re.IGNORECASE)
    return match.group(1) if match else None

def extract_invoice_details(pdf_path):
    """Extracts invoice details (number, date, amount) directly from a PDF."""
    text = extract_text_from_pdf(pdf_path)
    
    invoice_number = extract_invoice_number(text)
    date = extract_date(text)
    amount = extract_amount(text)
    
    return {
        'Invoice Number': invoice_number,
        'Date': date,
        'Amount': amount
    }

# Example usage
pdf_path = 'query_invoice.pdf'  # Update this path to your actual PDF file
details = extract_invoice_details(pdf_path)

print(f'Invoice Number: {details["Invoice Number"]}')
print(f'Date: {details["Date"]}')
print(f'Amount: {details["Amount"]}')


Extracted Text:
 Invoice
Invoice Number INV-3337
From:
DEMO - Sliced Invoices Order Number 12345
Suite 5A-1204 Invoice Date January 25, 2016
123 Somewhere Street
Due Date January 31, 2016
Your City AZ 12345
Total Due $93.50
admin@slicedinvoices.com
To:
Test Business
123 Somewhere St
d
Melbourne, VIC 3000
test@test.com
i
Hrs/Qty Service Rate/Price Adjust Sub Total
a
Web Design
1.00 $85.00 0.00% $85.00
This is a sample description...
P
Sub Total $85.00
Tax $8.50
Total $93.50
ANZ Bank
ACC # 1234 1234
BSB # 4321 432
Payment is due within 30 days from date of invoice. Late payment is subject to fees of 5% per month.
Thanks for choosing DEMO - Sliced Invoices | admin@slicedinvoices.com
Page 1/1
Invoice Number: INV-3337
Date: None
Amount: None
