In [150]:
import pandas as pd
from nltk.tokenize import word_tokenize
from pypdf import PdfReader
import re

# Initialize reader and datasets
# PDF to parse; the relative path is resolved against the current working directory.
reader = PdfReader("img/test.pdf")

# Empty product table with the three columns the parsing loop fills in.
product_data = pd.DataFrame(columns=["Product_Name", "Qnt", "Price"])

# Order totals and order metadata. dict.fromkeys gives every key the value
# None, which stays in place until the matching line is found in the PDF text.
totals_data = dict.fromkeys(("Subtotal", "Savings", "Tax", "Total"))
meta_data = dict.fromkeys(("Date", "Order_Number"))

# Helper function to extract product details
def extract_details(line):
    """Pull the price and the quantity out of a product entry.

    Args:
        line: Text of one product entry (may be several PDF lines joined
            together).

    Returns:
        tuple[str, str]: ``(price, qnt)``.
        ``price`` is the first dollar amount found in ``line``, including the
        leading ``$`` (for example ``"$3.42"`` or ``"$1,299.99"``), or
        ``"Unknown"`` when there is no dollar amount.
        ``qnt`` is the first run of digits that is followed by whitespace and
        a ``$``, or ``"1"`` when no such number exists.
        Both values are returned as strings.
    """
    # Try the comma-grouped form first so "$1,299.99" is not cut off at "$1".
    # The (?!\d) guard rejects a partial grouping such as "$1,234" inside
    # "$1,2345"; that input then falls through to the plain-digits form.
    price_match = re.search(
        r"\$\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\$\d+(?:\.\d+)?", line
    )
    price = price_match.group(0) if price_match else "Unknown"
    # Match quantity (number directly before the price, separated by whitespace)
    qnt_match = re.search(r"(\d+)\s+\$", line)
    qnt = qnt_match.group(1) if qnt_match else "1"
    return price, qnt

def clean_product_name(buffer):
    """Return the product name portion of a buffered product entry.

    The name is everything from the start of ``buffer`` up to the first
    occurrence of ``Weight-adjustedQty``, ``ShoppedQty`` or ``Qty``. When none
    of these markers is present the text up to the end of the string is used.
    If the pattern cannot match at all, the whole buffer is used instead.
    In every case the result has surrounding whitespace removed.

    Args:
        buffer: Accumulated text of one product entry.

    Returns:
        str: The trimmed product name.
    """
    found = re.search(r"^(.*?)(?:(?:Weight-adjustedQty|ShoppedQty|Qty|$))", buffer)
    raw_name = buffer if found is None else found.group(1)
    return raw_name.strip()

# Parsing logic
# Walk every page line by line. Until a "Subtotal" line is seen, lines are
# treated as product text: they are accumulated in `buffer`, and an entry is
# considered complete as soon as a line containing a dollar amount arrives.
# After "Subtotal", lines are only inspected for the remaining totals.
product_rows = []  # [name, qnt, price] lists; turned into a DataFrame once at the end
buffer = ""
processing_products = True

for j, page in enumerate(reader.pages):
    text = page.extract_text()
    for i, line in enumerate(text.splitlines()):
        if i == 0:
            continue  # Skip the first (header) line of every page
        line = line.strip()
        if not line or "http" in line or "Order details" in line:
            continue  # Skip headers and irrelevant lines

        # Detect and stop product processing when totals start
        if "Subtotal" in line:
            processing_products = False
            totals_data["Subtotal"] = line.split("$")[-1].strip()
            continue

        if "Order#" in line:
            order_match = re.search(r'Order#\s*(\d+-?\d*)', line)
            if order_match:
                meta_data["Order_Number"] = order_match.group(1)
            continue

        # Extract date (a later match overwrites an earlier one).
        # NOTE(review): a line carrying a date is not skipped afterwards, so
        # while products are still being read it also lands in `buffer`.
        date_match = re.search(r'([A-Za-z]{3}\s+\d{1,2},\s+\d{4})', line)
        if date_match:
            meta_data["Date"] = date_match.group(1)

        if not processing_products:
            # Extract totals: the first matching keyword wins, and the value
            # is whatever follows the last "$" on the line.
            for total_key in ("Savings", "Tax", "Total"):
                if total_key in line:
                    totals_data[total_key] = line.split("$")[-1].strip()
                    break
            continue

        # Accumulate lines for multi-line product descriptions; a line with a
        # dollar amount closes the current entry.
        buffer += " " + line
        if re.search(r"\$\d+(\.\d+)?", line):
            price, qnt = extract_details(buffer)
            item_name = clean_product_name(buffer)
            product_rows.append([item_name, qnt, price])
            buffer = ""

# Ensure the last buffered product is processed
if buffer:
    price, qnt = extract_details(buffer)
    item_name = clean_product_name(buffer)
    product_rows.append([item_name, qnt, price])

# Build the table in one step instead of concatenating inside the loop, which
# copies the whole frame for every added row. `product_data` is created empty
# earlier in this cell, so only its column layout is carried over.
product_data = pd.DataFrame(product_rows, columns=product_data.columns)

print("\nTotals Data:")
print(totals_data)
print("\nMeta Data:")
print(meta_data)



Totals Data:
{'Subtotal': '190.85', 'Savings': '5.05', 'Tax': '2.44', 'Total': '188.24'}

Meta Data:
{'Date': 'Nov 19, 2024', 'Order_Number': '2000127-21418074'}


In [151]:
# Show the parsed product table as this cell's output.
product_data

Unnamed: 0,Product_Name,Qnt,Price
0,"Fresh Cauliflower, Each",1,$3.42
1,Fresh Green Seedless Grapes (2.25 lbs/Bag Est.),1,$4.02
2,"Fresh Whole Carrots, 1 lb Bag",2,$1.96
3,"Fresh Roma Tomato, Each",23,$5.34
4,"Marketside Fresh Organic Bananas, Bunch",1,$2.12
5,"Fresh Yellow Onions, 3 lb Bag",2,$5.36
6,"Fresh Banana, Each",11,$1.88
7,"Fresh Gala Apples, 3 lb Bag",1,$3.48
8,"Great Value Aluminum Foil, 25 sq ft",1,$1.42
9,"Fresh Hass Avocados, Each",5,$2.90
