In [40]:
import re
import os
from pathlib import Path
from dateutil.parser import parse as parse_date
import pandas as pd
import pdfplumber
from datetime import date

# ========= CONFIG =========
# Change this to your *local* folder that contains all the invoices (PDFs only)
INPUT_DIR = Path("/Users/ylin/Google Drive/My Drive/Company Transactions/2026/Valta Homes Unprocessed Invoices/")  # <-- EDIT THIS
# Excel workbook that receives the parsed summary (one row per invoice).
OUTPUT_FILE = "/Users/ylin/Downloads/Invoice/service_summary.xlsx"
# 'Mapping' sheet of the tracking workbook. Its "InvoiceAbbr" and "Listing" columns are
# used to translate the abbreviation found in an invoice file name into a listing name.
Mapping = pd.read_excel("/Users/ylin/Google Drive/My Drive/Data and Reporting/04-Accounting/InvoicePayment/InvoiceTracking.xlsx",sheet_name='Mapping')

# ========= HELPER FUNCTIONS =========

def infer_service_type(text: str) -> str:
    """Classify an invoice's service type with a simple keyword lookup.

    The text to classify is chosen as follows:
      1. the remainder of the first ``Service - <description>`` line, if any;
      2. otherwise the whole text, provided it mentions "cleaning"
         (case-insensitive);
      3. otherwise there is nothing to classify and "other" is returned.

    Args:
        text: Raw invoice text. Empty or ``None`` input yields "other".

    Returns:
        One of "Landscaping", "Cleaning", "Repair", "Staging" or "other".
    """
    if not text:
        return "other"

    service_line = re.search(r"Service - \s*(.+)", text)
    if service_line:
        description = service_line.group(1).strip()
    elif re.search(r"cleaning", text, re.IGNORECASE):
        description = text.strip()
    else:
        # No usable description: classify as "other" instead of calling
        # .lower() on None, which used to raise AttributeError.
        return "other"

    lowered = description.lower()

    # Checked in order; the first category with a matching keyword wins.
    categories = (
        ("Landscaping", ("landscaping", "lawn", "yard", "mow", "garden")),
        ("Cleaning", ("clean", "janitor", "maid", "housekeep")),
        ("Repair", ("repair & handyman",)),
        ("Staging", ("staging", "decor", "design")),
    )
    for label, keywords in categories:
        if any(keyword in lowered for keyword in keywords):
            return label
    return "other"

def safe_parse_date(s: str):
    """Best-effort conversion of a date-like string to a ``datetime.date``.

    Parsing is fuzzy, so text surrounding the date is tolerated. Any failure
    is swallowed and reported as ``None`` rather than raised.
    """
    try:
        parsed = parse_date(s, fuzzy=True)
        return parsed.date()
    except Exception:
        return None

def extract_total(text):
    """Extract the invoice total from raw text as a normalized number string.

    Looks first for an explicit ``Total $<amount>`` entry. If none is found
    (or the captured amount is malformed), falls back to the first number in
    the text that has exactly two decimals. Both paths return the same
    normalized format produced by ``clean_number``.

    Args:
        text: Raw invoice text. Empty or ``None`` input yields ``None``.

    Returns:
        The amount as a string without thousands separators or trailing
        zeros (e.g. "1234.5", "55"), or ``None`` when no amount is found.
    """
    if not text:
        return None

    match = re.search(r"Total\s*\$([\d,\.]+)", text)
    if match:
        # The character class also swallows sentence punctuation
        # ("Total $55.00."), so trim it before converting.
        raw = match.group(1).rstrip(".,")
        try:
            return clean_number(raw)
        except ValueError:
            # Malformed capture such as "1.2.3": try the generic search below.
            pass

    # fallback: first amount with two decimals, normalized like the primary path
    money = re.search(r"\$?\s*([0-9,]+\.\d{2})", text)
    if money:
        return clean_number(money.group(1))

    return None

def clean_number(x):
    """Normalize a numeric string: drop commas, round to 2 decimals, trim zeros.

    Examples: "1,234.50" -> "1234.5", "55.00" -> "55", "493.48" -> "493.48".

    Raises:
        ValueError: if ``x`` is not a valid number once commas are removed.
    """
    x = float(x.replace(",", ""))
    # Format with two decimals, then strip trailing zeros and a dangling dot.
    s = ('%.2f' % x).rstrip('0').rstrip('.')
    return s


# ========= PDF PARSER =========

def parse_valta_pdf(path: Path):
    """
    Parse one Valta-style PDF invoice and build its standardized file name.

    Expected inputs:
      - File name like 'Invoice 20260202_E809_CL.pdf': the segment after the
        first underscore ('E809') is looked up in Mapping["InvoiceAbbr"] to
        obtain the listing name from Mapping["Listing"].
      - 'Service Representative: <provider>': the first word is used as the
        provider; the file stem is used when the line is missing or blank.
      - Dates written as mm/dd/yyyy: the THIRD such date in the text is used
        as the date of service.
      - Total line like 'Total $55.00'.

    Args:
        path: Location of the PDF invoice.

    Returns:
        dict with keys "total_amount", "listing", "date_of_service",
        "invoicecontent" and "filename"; or None when the PDF has no
        extractable text or when a value required to build the file name
        cannot be determined (a message naming the reason is printed).
    """
    print(f"Parsing PDF: {path.name}")

    with pdfplumber.open(path) as pdf:
        pages_text = [page.extract_text() or "" for page in pdf.pages]

    text = "\n".join(pages_text)

    if not text.strip():
        return None

    # ----- SERVICE PROPERTY (from file name) -----
    name_parts = path.name.split("_")
    if len(name_parts) < 2:
        print(f"  Skipping {path.name}: file name has no '_<abbr>' segment")
        return None
    invoice_abbr = name_parts[1].split(".")[0]

    listing_matches = Mapping.loc[Mapping["InvoiceAbbr"].astype(str) == invoice_abbr, "Listing"]
    if listing_matches.empty:
        print(f"  Skipping {path.name}: no Mapping entry for '{invoice_abbr}'")
        return None
    # str() so a numeric listing value cannot break the string join below.
    listing = str(listing_matches.iloc[0])

    # ----- SERVICE PROVIDER (from 'Service Representative:') -----
    # Capture text until end of line and keep its first word.
    rep_match = re.search(r"Service Representative:\s*(.+)", text)
    rep_words = rep_match.group(1).split() if rep_match else []
    # fallback to file name if not found (or if the captured line is blank)
    service_provider = rep_words[0] if rep_words else path.stem

    # ----- DATE OF SERVICE -----
    # The third mm/dd/yyyy in the text is used.
    # NOTE(review): presumably the first two are header dates (invoice/due) and
    # the third is the service date in the table - confirm against the layout.
    date_matches = re.findall(r"\b(\d{1,2}/\d{1,2}/\d{4})\b", text)
    service_date = safe_parse_date(date_matches[2]) if len(date_matches) > 2 else None
    if service_date is None:
        print(f"  Skipping {path.name}: could not determine the date of service")
        return None
    date_of_service = service_date.strftime("%m.%d")

    # ----- TOTAL AMOUNT -----
    total_amount = extract_total(text)
    if total_amount is None:
        print(f"  Skipping {path.name}: could not find a total amount")
        return None

    # ----- SERVICE TYPE -----
    # Use the body text and provider to infer type
    service_type = infer_service_type(text + " " + service_provider)

    invoicecontent = service_provider + "_" + service_type

    # <today>_<listing>_Valta Homes_<mm.dd>_<provider>_<type>_<total>
    new_filename = "_".join([
        date.today().strftime("%Y%m%d"),
        listing,
        "Valta Homes",
        date_of_service,
        invoicecontent,
        total_amount,
    ])

    return {
        "total_amount": total_amount,
        "listing": listing,
        "date_of_service": date_of_service,
        "invoicecontent": invoicecontent,
        "filename": new_filename,
    }

In [None]:
# Manual single-invoice check, disabled by wrapping it in a string literal: as written,
# this cell only evaluates to that string. Remove the triple quotes to run it on one PDF.
"""# ======= TESTING =======
path = Path("/Users/ylin/Google Drive/My Drive/Company Transactions/2026/Valta Homes Unprocessed Invoices/Invoice 20260114_4027U2_MI.pdf")#Invoice 20260126_250_HM.pdf")
with pdfplumber.open(path) as pdf:
    pages_text = [page.extract_text() or "" for page in pdf.pages]

text = "\n".join(pages_text)
rep_match = re.search(r"Service Representative:\s*(.+)", text)
service_provider = rep_match.group(1).strip().split()[0] if rep_match else None
infer_service_type(text + " " + service_provider)
"""


'Cleaning'

In [41]:
# Collect one parsed row per unprocessed invoice PDF found under INPUT_DIR.
rows = []

for path in INPUT_DIR.rglob("*"):
    if not path.is_file():
        continue

    if path.suffix.lower() != ".pdf":
        # Skip non-PDFs
        continue

    # Only files still carrying the original "Invoice..." name are processed.
    if not path.name.startswith("Invoice"):
        continue

    row = parse_valta_pdf(path)
    if row:
        # rglob() also descends into sub-folders, so record the path relative to
        # INPUT_DIR: INPUT_DIR / source_file then resolves for nested files too.
        # For files directly inside INPUT_DIR this is just the file name.
        row["source_file"] = str(path.relative_to(INPUT_DIR))
        rows.append(row)
        

Parsing PDF: Invoice 20260202_E809_CL.pdf
Parsing PDF: Invoice 20260126_26060_HM.pdf
Parsing PDF: Invoice 20260202_E1203_HM.pdf
Parsing PDF: Invoice 20260203_11321_HM.pdf
Parsing PDF: Invoice 20260126_250_HM.pdf
Parsing PDF: Invoice 20260206_OSBR_HM.pdf
Parsing PDF: Invoice 20260114_4027U2_MI.pdf
Parsing PDF: Invoice 20260219_10057_ST.pdf
Parsing PDF: Invoice 20260202_E703_HM.pdf
Parsing PDF: Invoice 20260202_E1115_HM.pdf


In [42]:
# Tabulate the parsed rows with a fixed column order and export them to Excel.
df = pd.DataFrame(
        rows,
        columns=[
            "total_amount", 
            "listing",
            "date_of_service",
            "invoicecontent",
            "source_file",
            "filename"
        ],
    )
# to_excel() does not create missing folders, so make sure the target exists first.
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
df.to_excel(OUTPUT_FILE, index=False)


In [43]:
# Rename every parsed invoice, in its current folder, to its standardized file name.
for _, row in df.iterrows():
    old_path = INPUT_DIR / row["source_file"]
    new_name = f"{row['filename']}.pdf"
    new_path = old_path.with_name(new_name)

    if not old_path.exists():
        print(f"Skipping (not found): {old_path}")
        continue

    # Path.rename() silently replaces an existing file on Unix, so two invoices that
    # resolve to the same name would lose one PDF. Leave such files for manual review.
    if new_path.exists():
        print(f"Skipping (target already exists): {new_path}")
        continue

    print(f"Renaming: {old_path.name} → {new_name}")
    old_path.rename(new_path)

Renaming: Invoice 20260202_E809_CL.pdf → 20260223_Elektra 809_Valta Homes_02.02_Invoice 20260202_E809_CL_Cleaning_185.pdf
Renaming: Invoice 20260126_26060_HM.pdf → 20260223_Hoodsport 26060_Valta Homes_01.26_David_Repair_95.pdf
Renaming: Invoice 20260202_E1203_HM.pdf → 20260223_Elektra 1203_Valta Homes_02.02_Jason_Repair_55.pdf
Renaming: Invoice 20260203_11321_HM.pdf → 20260223_Kirkland 11321_Valta Homes_01.13_Jason_Repair_493.48.pdf
Renaming: Invoice 20260126_250_HM.pdf → 20260223_Shelton 250_Valta Homes_01.26_David_Repair_150.pdf
Renaming: Invoice 20260206_OSBR_HM.pdf → 20260223_OSBR_Valta Homes_02.03_David_Repair_434.19.pdf
Renaming: Invoice 20260114_4027U2_MI.pdf → 20260223_Beachwood 2_Valta Homes_01.14_Maria_Cleaning_184.5.pdf
Renaming: Invoice 20260219_10057_ST.pdf → 20260223_Seattle 10057_Valta Homes_10.30_Staging_Staging_240.pdf
Renaming: Invoice 20260202_E703_HM.pdf → 20260223_Elektra 703_Valta Homes_02.02_Flavio_Repair_104.98.pdf
Renaming: Invoice 20260202_E1115_HM.pdf → 20260