In [5]:
import re
from pathlib import Path
from dateutil.parser import parse as parse_date
import pdfplumber
from datetime import date
import pandas as pd
import numpy as np
def extract_total(text: str):
    """Extract Total or Grand Total."""
    m = re.search(r"Grand Total\s*\$([\d,\.]+)", text, flags=re.IGNORECASE)
    if not m:
        matches = re.findall(r"Total\s*\$([\d,\.]+)", text, flags=re.IGNORECASE)
        if not matches:
            return None
        amount_str = matches[-1]
    else:
        amount_str = m.group(1)
    return float(amount_str.replace(",", ""))


def extract_service_date(text: str):
    """Extract date from 'Date: Dec 5, 2025'."""
    m = re.search(r"Date\s*:\s*(.+)", text, flags=re.IGNORECASE)
    if not m:
        return None
    try:
        return parse_date(m.group(1)).date().isoformat()
    except:
        return None


def extract_listing(text: str):
    """
    Extract listing/property name from item line:
    '01 593 Poulsbo  $150.00'
    """
    m = re.search(r"^\s*\d+\s+(.+?)\s+\$\s*[\d,\.]+", text, flags=re.MULTILINE)
    if not m:
        return None
    
    listing = m.group(1).strip()
    parts = listing.split()
    if len(parts) == 2 and parts[0].isdigit():
        num, name = parts
        return f"{name} {num}"
    if listing=="Bainbridge":
        return "Bainbridge 11431"
    if listing=="Longbranch":
        return "Longbranch 6821"
    else:
        return listing

def infer_service_type(text: str):
    """Determine whether invoice contains hot tub service."""
    if re.search(r"hot\s*tub", text, flags=re.IGNORECASE):
        return "Cleaning_Hottub"
    return "Cleaning"


def extract_invoice_number(pdf_path: Path):
    """
    Extract invoice number from filename.
    Example: 'INV065.pdf' → 'INV065'
    """
    name = pdf_path.stem  # 'INV065'
    m = re.search(r"INV\d+", name, flags=re.IGNORECASE)
    return m.group(0).upper() if m else None


def summarize_invoice(pdf_path: Path) -> dict:
    """Parse invoice PDF and return structured summary."""
    with pdfplumber.open(pdf_path) as pdf:
        pages_text = [page.extract_text() or "" for page in pdf.pages]

    text = "\n".join(pages_text)

    # First non-empty line is service provider name
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    service_provider = lines[0] if lines else "UnknownProvider"
    service_provider = np.where("camila" in service_provider.lower(), "Camila", service_provider)

    invoice_number = extract_invoice_number(pdf_path)
    total_amount = extract_total(text)
    listing = extract_listing(text)
    date_of_service = pd.to_datetime(extract_service_date(text)) if extract_service_date(text) else date.today()
    service_type = infer_service_type(text)

    invoicecontent = f"{date_of_service.month}.{date_of_service.day:02d}_{service_type}"

    # Build standard output filename
    today_str = date.today().strftime("%Y%m%d")
    amount_str = f"{total_amount:.2f}".rstrip("0").rstrip(".") if total_amount else "NA"

    filename = (
        f"{today_str}_{listing}_{service_provider}_{invoice_number}_"
        f"{invoicecontent}_{amount_str}"
    )

    return {
        "invoice_number": invoice_number,
        "total_amount": total_amount,
        "listing": listing,
        "date_of_service": date_of_service,
        "service_type": service_type,
        "service_provider": service_provider,
        "invoicecontent": invoicecontent,
        "filename": filename,
        "source_file": pdf_path.name,
    }


In [6]:
INPUT_DIR = Path("/Users/ylin/Downloads/Invoice/")  # <-- EDIT THIS
OUTPUT_FILE = "/Users/ylin/Downloads/Invoice/service_summary.xlsx"
Mapping = pd.read_excel("/Users/ylin/My Drive/Cohost/Data and Reporting/04-Accounting/InvoicePayment/InvoiceTracking.xlsx",sheet_name='Mapping')


In [7]:
rows = []

for path in INPUT_DIR.rglob("*"):
    if not path.is_file():
        continue

    if path.suffix.lower() != ".pdf":
        # Skip non-PDFs
        continue

    row = summarize_invoice(path)
    if row:
        row["source_file"] = path.name
        rows.append(row)

df = pd.DataFrame(
        rows,
        columns=[
            "service_provider",
            "invoice_number",
            "total_amount", 
            "listing",
            "invoicecontent",
            "source_file",
            "filename"
        ],
    )

In [8]:
df.to_excel(OUTPUT_FILE, index=False)

In [11]:
df = df.loc[df['service_provider']=="Camila"]
df

Unnamed: 0,service_provider,invoice_number,total_amount,listing,invoicecontent,source_file,filename
2,Camila,INV049,200.0,Poulsbo 3956,12.11_Cleaning_Hottub,INV049.pdf,20251212_Poulsbo 3956_Camila_INV049_12.11_Clea...
3,Camila,INV048,175.0,Poulsbo 3866,12.11_Cleaning,INV048.pdf,20251212_Poulsbo 3866_Camila_INV048_12.11_Clea...
8,Camila,INV046,150.0,Poulsbo 563,12.09_Cleaning,INV046.pdf,20251212_Poulsbo 563_Camila_INV046_12.09_Clean...
9,Camila,INV045,230.0,Longbranch 6821,12.09_Cleaning_Hottub,INV045.pdf,20251212_Longbranch 6821_Camila_INV045_12.09_C...
10,Camila,INV050,200.0,Bainbridge 11431,12.12_Cleaning,INV050.pdf,20251212_Bainbridge 11431_Camila_INV050_12.12_...


In [16]:

for _, row in df.iterrows():
    old_path = INPUT_DIR / row["source_file"]
    new_name = row["filename"] + ".pdf"
    new_path = old_path.with_name(new_name)

    if old_path.exists():
        print(f"Renaming: {old_path.name} → {new_name}")
        old_path.rename(new_path)
    else:
        print(f"File not found: {old_path}")
        

Renaming: INV049.pdf → 20251212_Poulsbo 3956_Camila_INV049_12.11_Cleaning_Hottub_200.pdf
Renaming: INV048.pdf → 20251212_Poulsbo 3866_Camila_INV048_12.11_Cleaning_175.pdf
Renaming: INV046.pdf → 20251212_Poulsbo 563_Camila_INV046_12.09_Cleaning_150.pdf
Renaming: INV045.pdf → 20251212_Longbranch 6821_Camila_INV045_12.09_Cleaning_Hottub_230.pdf
Renaming: INV050.pdf → 20251212_Bainbridge 11431_Camila_INV050_12.12_Cleaning_200.pdf


In [15]:
for file in df['source_file']:
    name_no_ext = file
    new_name = df.loc[df['source_file'] == file, 'filename'].values[0] + ".pdf"
    new_path = file.with_name(new_name) 
    print(f"Renaming: {file.name} → {new_name}")
    file.rename(new_path)

AttributeError: 'str' object has no attribute 'with_name'