In [1]:
import re
from pathlib import Path
from dateutil.parser import parse as parse_date
import pdfplumber
from datetime import date
import pandas as pd
import numpy as np
def extract_total(text: str):
    """Extract Total or Grand Total."""
    m = re.search(r"Grand Total\s*\$([\d,\.]+)", text, flags=re.IGNORECASE)
    if not m:
        matches = re.findall(r"Total\s*\$([\d,\.]+)", text, flags=re.IGNORECASE)
        if not matches:
            return None
        amount_str = matches[-1]
    else:
        amount_str = m.group(1)
    return float(amount_str.replace(",", ""))


def extract_service_date(text: str):
    """Extract date from 'Date: Dec 5, 2025'."""
    m = re.search(r"Date\s*:\s*(.+)", text, flags=re.IGNORECASE)
    if not m:
        return None
    try:
        return parse_date(m.group(1)).date().isoformat()
    except:
        return None


def extract_listing(text: str):
    """
    Extract listing/property name from item line:
    '01 593 Poulsbo  $150.00'
    """
    m = re.search(r"^\s*\d+\s+(.+?)\s+\$\s*[\d,\.]+", text, flags=re.MULTILINE)
    if not m:
        return None
    
    listing = m.group(1).strip()
    parts = listing.split()
    if len(parts) == 2 and parts[0].isdigit():
        num, name = parts
        return f"{name} {num}"
    if listing=="Bainbridge":
        return "Bainbridge 11431"
    if listing=="Longbranch":
        return "Longbranch 6821"
    else:
        return listing

def infer_service_type(text: str):
    """Determine whether invoice contains hot tub service."""
    if re.search(r"hot\s*tub", text, flags=re.IGNORECASE):
        return "Cleaning_Hottub"
    return "Cleaning"


def extract_invoice_number(pdf_path: Path):
    """
    Extract invoice number from filename.
    Example: 'INV065.pdf' → 'INV065'
    """
    name = pdf_path.stem  # 'INV065'
    m = re.search(r"INV\d+", name, flags=re.IGNORECASE)
    return m.group(0).upper() if m else None


def summarize_invoice(pdf_path: Path) -> dict:
    """Parse invoice PDF and return structured summary."""
    with pdfplumber.open(pdf_path) as pdf:
        pages_text = [page.extract_text() or "" for page in pdf.pages]

    text = "\n".join(pages_text)

    # First non-empty line is service provider name
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    service_provider = lines[0] if lines else "UnknownProvider"
    service_provider = np.where("camila" in service_provider.lower(), "Camila", service_provider)

    invoice_number = extract_invoice_number(pdf_path)
    total_amount = extract_total(text)
    listing = extract_listing(text)
    date_of_service = pd.to_datetime(extract_service_date(text)) if extract_service_date(text) else date.today()
    service_type = infer_service_type(text)

    invoicecontent = f"{date_of_service.month}.{date_of_service.day:02d}_{service_type}"

    # Build standard output filename
    today_str = date.today().strftime("%Y%m%d")
    amount_str = f"{total_amount:.2f}".rstrip("0").rstrip(".") if total_amount else "NA"

    filename = (
        f"{today_str}_{listing}_{service_provider}_{invoice_number}_"
        f"{invoicecontent}_{amount_str}"
    )

    return {
        "invoice_number": invoice_number,
        "total_amount": total_amount,
        "listing": listing,
        "date_of_service": date_of_service,
        "service_type": service_type,
        "service_provider": service_provider,
        "invoicecontent": invoicecontent,
        "filename": filename,
        "source_file": pdf_path.name,
    }


In [2]:
INPUT_DIR = Path("/Users/ylin/Downloads/Invoice/")  # <-- EDIT THIS
OUTPUT_FILE = "/Users/ylin/Downloads/Invoice/service_summary.xlsx"
Mapping = pd.read_excel("/Users/ylin/My Drive/Cohost/Data and Reporting/04-Accounting/InvoicePayment/InvoiceTracking.xlsx",sheet_name='Mapping')


In [3]:
rows = []

for path in INPUT_DIR.rglob("*"):
    if not path.is_file():
        continue

    if path.suffix.lower() != ".pdf":
        # Skip non-PDFs
        continue

    row = summarize_invoice(path)
    if row:
        row["source_file"] = path.name
        rows.append(row)

df = pd.DataFrame(
        rows,
        columns=[
            "service_provider",
            "invoice_number",
            "total_amount", 
            "listing",
            "invoicecontent",
            "source_file",
            "filename"
        ],
    )

In [4]:
df.to_excel(OUTPUT_FILE, index=False)

In [5]:
df = df.loc[df['service_provider']=="Camila"]
df

Unnamed: 0,service_provider,invoice_number,total_amount,listing,invoicecontent,source_file,filename
0,Camila,INV058,175.0,Poulsbo 3866,12.26_Cleaning,INV058.pdf,20251226_Poulsbo 3866_Camila_INV058_12.26_Clea...
1,Camila,INV059,205.0,Poulsbo 3956,12.26_Cleaning_Hottub,INV059.pdf,20251226_Poulsbo 3956_Camila_INV059_12.26_Clea...
2,Camila,INV057,200.0,Bainbridge 11431,12.24_Cleaning,INV057.pdf,20251226_Bainbridge 11431_Camila_INV057_12.24_...


In [6]:
# change filenames for Camila invoices
for _, row in df.iterrows():
    old_path = INPUT_DIR / row["source_file"]
    new_name = row["filename"] + ".pdf"
    new_path = old_path.with_name(new_name)

    if old_path.exists():
        print(f"Renaming: {old_path.name} → {new_name}")
        old_path.rename(new_path)
    else:
        print(f"File not found: {old_path}")
        

Renaming: INV058.pdf → 20251226_Poulsbo 3866_Camila_INV058_12.26_Cleaning_175.pdf
Renaming: INV059.pdf → 20251226_Poulsbo 3956_Camila_INV059_12.26_Cleaning_Hottub_205.pdf
Renaming: INV057.pdf → 20251226_Bainbridge 11431_Camila_INV057_12.24_Cleaning_200.pdf


In [None]:
# change names based on excel sheet
df = pd.read_excel("/Users/ylin/My Drive/Cohost/Data and Reporting/04-Accounting/InvoicePayment/InvoiceTracking.xlsx")
df = df.loc[df["Date"]=='2025-12-19']
for _, row in df.iterrows():
    old_path = INPUT_DIR / row["fileName"]
    name_ext =row["fileName"].split(".")[-1]
    new_name = f"{row['filename']}.{name_ext}"
    new_path = old_path.with_name(new_name)
    if not old_path.exists():
        print(f"Skipping (not found): {old_path}")
        continue

    print(f"Renaming: {old_path.name} → {new_name}")
    old_path.rename(new_path)

Skipping (not found): /Users/ylin/Downloads/Invoice/INV054.pdf
Skipping (not found): /Users/ylin/Downloads/Invoice/INV052.pdf
Skipping (not found): /Users/ylin/Downloads/Invoice/INV051.pdf
Renaming: 20251217_001527-COLLAGE_Original.jpeg → 20251219_Mercer 3627_Jackson Livy_11.9_WindowGlassReplacement_198.36.jpeg
Renaming: 20251217_000803-COLLAGE_Original.jpeg → 20251219_Longbranch 6821_Jackson Livy_11.13_3 Space Heaters Emergency Purchase and Delivery_127.88.jpeg
Renaming: Diswasher_Repair_Invoice.pdf → 20251219_Kirkland 10219_Jackson Livy_11.9_Dishwasher Repair_200.pdf
Skipping (not found): /Users/ylin/Downloads/Invoice/Diswasher_Repair_Invoice.pdf
Renaming: IMG_6836.jpeg → 20251219_Redmond 7579_Jackson Livy_12.14_Emergency Bathroom Mat Purchase and Delivery_20.59.jpeg
Renaming: IMG_6833.jpeg → 20251219_Elektra 1108_Jackson Livy_9.22_2 Copy Keys Replacement_10.97.jpeg
Renaming: IMG_6830.jpeg → 20251219_Seattle 1502_Jackson Livy_11.8_Emergency PaperTowel Delivery_7.79.jpeg
Renaming: IMG

TypeError: unsupported operand type(s) for /: 'PosixPath' and 'float'