In [44]:
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import os
import re

In [45]:
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    doc.close()
    return text

In [46]:
def perform_ocr(image_path):
    return pytesseract.image_to_string(Image.open(image_path))


In [47]:
def parse_invoice_text(text):
    invoice_data = {
        "Invoice Number": "",
        "Customer Name": "",
        "Ship To": "",
        "Date": "",
        "Ship Mode": "",
        "Balance Due": "",
        "Item": "",
        "Quantity": "",
        "Rate": "",
        "Amount": ""
    }
    
    # Extracting invoice number
    invoice_number = re.search(r'INVOICE\s*#\s*(\d+)', text)
    if invoice_number:
        invoice_data["Invoice Number"] = invoice_number.group(1)
    
    # Extracting customer name
    customer_name = re.search(r'Bill\s*To:\s*([\w\s]+)', text)
    if customer_name:
        invoice_data["Customer Name"] = customer_name.group(1)
    
    # Extracting ship to address
    ship_to = re.search(r'Ship\s*To:\s*([\w\s\,]+)', text)
    if ship_to:
        invoice_data["Ship To"] = ship_to.group(1)
    
    # Extracting date
    date = re.search(r'Date:\s*([\w\s]+)', text)
    if date:
        invoice_data["Date"] = date.group(1)
    
    # Extracting ship mode
    ship_mode = re.search(r'Ship\s*Mode:\s*([\w\s]+)', text)
    if ship_mode:
        invoice_data["Ship Mode"] = ship_mode.group(1)
    
    # Extracting balance due
    balance_due = re.search(r'Balance\s*Due:\s*\$([\d\.]+)', text)
    if balance_due:
        invoice_data["Balance Due"] = balance_due.group(1)
    
    # Extracting item details (assuming one item per invoice for simplicity)
    item_details = re.search(r'(\d+)\s+(\S.*?)(?:\s+\$\s*(\d+\.\d{2})\s+\$\s*(\d+\.\d{2}))', text)
    if item_details:
        invoice_data["Quantity"] = item_details.group(1)
        invoice_data["Item"] = item_details.group(2)
        invoice_data["Rate"] = item_details.group(3)
        invoice_data["Amount"] = item_details.group(4)
    
    return invoice_data


Example usage 

In [48]:
pdf_path = 'C:\\Users\\Rishika\\Desktop\\AIDEIT\\data\\1000+ PDF_Invoice_Folder\\invoice_Aaron Bergman_36258.pdf'
extracted_text = extract_text_from_pdf(pdf_path)
print(extracted_text)

INVOICE
# 36258
SuperStore
Bill To:
Aaron Bergman
Ship To:
98103, Seattle,
Washington, United
States
Mar 06 2012
First Class
$50.10
Date:
Ship Mode:
Balance Due:
Item
Quantity
Rate
Amount
Global Push Button Manager's Chair, Indigo
1
$48.71
$48.71
Chairs, Furniture, FUR-CH-4421
$48.71
$9.74
$11.13
$50.10
Subtotal:
Discount (20%):
Shipping:
Total:
Notes:
Thanks for your business!
Terms:
Order ID : CA-2012-AB10015140-40974



In [49]:
# Function to process all PDFs in a directory
def process_pdfs_in_directory(directory):
    data = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory, filename)
            extracted_text = extract_text_from_pdf(pdf_path)
            invoice_data = parse_invoice_text(extracted_text)
            data.append(invoice_data)
    return data

In [50]:
# Directory containing your PDF invoices
pdf_directory = r'C:\Users\Rishika\Desktop\AIDEIT\data\1000+ PDF_Invoice_Folder'

In [51]:
# Process PDFs and store data in a list of dictionaries
extracted_data = process_pdfs_in_directory(pdf_directory)

In [52]:
import pandas as pd

In [53]:
# Convert data to DataFrame
df = pd.DataFrame(extracted_data)

In [54]:
# Save DataFrame to CSV
csv_file = 'extracted_invoices.csv'
df.to_csv(csv_file, index=False)

In [55]:
extracted_data

[{'Invoice Number': '36258',
  'Customer Name': 'Aaron Bergman\nShip To',
  'Ship To': '98103, Seattle,\nWashington, United\nStates\nMar 06 2012\nFirst Class\n',
  'Date': 'Ship Mode',
  'Ship Mode': 'Balance Due',
  'Balance Due': '',
  'Item': 'Chairs, Furniture, FUR-CH-4421',
  'Quantity': '71',
  'Rate': '48.71',
  'Amount': '9.74'},
 {'Invoice Number': '36259',
  'Customer Name': 'Aaron Bergman\nShip To',
  'Ship To': '98103, Seattle,\nWashington, United\nStates\nMar 06 2012\nFirst Class\n',
  'Date': 'Ship Mode',
  'Ship Mode': 'Balance Due',
  'Balance Due': '',
  'Item': '3',
  'Quantity': '330',
  'Rate': '17.94',
  'Amount': '53.82'},
 {'Invoice Number': '',
  'Customer Name': 'Jun 5',
  'Ship To': '',
  'Date': 'Balance Due',
  'Ship Mode': '',
  'Balance Due': '',
  'Item': '',
  'Quantity': '',
  'Rate': '',
  'Amount': ''},
 {'Invoice Number': '39519',
  'Customer Name': 'Aaron Bergman\nShip To',
  'Ship To': '76017, Arlington,\nTexas, United States\nFeb 19 2012\nStandard