In [5]:
import re
import os
from pathlib import Path
from dateutil.parser import parse as parse_date
import pandas as pd
import pdfplumber
from datetime import date

# ========= CONFIG =========
# Change this to your *local* folder that contains all the invoices (PDFs only)
INPUT_DIR = Path("/Users/ylin/My Drive/Cohost/Company Transaction 2026/Valta Homes Unprocessed Invoices/")  # <-- EDIT THIS
# Excel workbook that the parsed invoice summary DataFrame is written to.
OUTPUT_FILE = "/Users/ylin/My Drive/Cohost/Data and Reporting/04-Accounting/InvoicePayment/service_summary.xlsx"
# Lookup table read from the 'Mapping' sheet. parse_valta_pdf matches the
# abbreviation taken from an invoice file name against its "InvoiceAbbr"
# column and reads the listing name from its "Listing" column.
Mapping = pd.read_excel("/Users/ylin/My Drive/Cohost/Data and Reporting/04-Accounting/InvoicePayment/InvoiceTracking.xlsx",sheet_name='Mapping')

# ========= HELPER FUNCTIONS =========

def infer_service_type(text: str) -> str:
    """Classify free text into a service type by case-insensitive keyword lookup.

    Categories are tried in a fixed order (Landscaping, Cleaning, Repair) and
    the first category with any keyword contained in the text wins. Text with
    no matching keyword, including empty or None input, yields "other".
    """
    keyword_table = (
        ("Landscaping", ("landscaping", "lawn", "yard", "mow", "garden")),
        ("Cleaning", ("clean", "janitor", "maid", "housekeep")),
        ("Repair", ("repair & handyman",)),
    )

    haystack = (text or "").lower()
    for label, keywords in keyword_table:
        for keyword in keywords:
            # Plain substring test, so "clean" also matches "cleaning".
            if keyword in haystack:
                return label
    return "other"


def safe_parse_date(s: str):
    """Best-effort date parsing: return a `date` for `s`, or None on any failure.

    The string is handed to `parse_date` in fuzzy mode; whatever exception the
    parse raises is swallowed and reported as None.
    """
    try:
        parsed = parse_date(s, fuzzy=True)
        result = parsed.date()
    except Exception:
        result = None
    return result

def extract_total(text):
    """Return the invoice total found in `text` as a normalized string, or None.

    Lookup order:
      1. The amount following a 'Total $' label (e.g. 'Total $1,234.50').
      2. Fallback: the first number in the text that has exactly two decimals.

    Both paths go through `clean_number`, so the result never contains
    thousands separators or trailing zeros ('1,234.50' -> '1234.5').
    Empty or None input yields None.
    """
    if not text:
        return None

    # The capture has to be a well-formed number so that sentence punctuation
    # after the amount ('Total $55.00.') is not passed on to float().
    match = re.search(r"Total\s*\$\s*(\d[\d,]*(?:\.\d+)?|\.\d+)", text)
    if match:
        return clean_number(match.group(1))

    # fallback
    money = re.search(r"\$?\s*([0-9,]+\.\d{2})", text)
    if money:
        # Normalize here as well so both paths return the same format.
        return clean_number(money.group(1))

    return None

def clean_number(x):
    """Normalize a numeric string: drop commas, round to 2 decimals, trim zeros.

    Examples: '1,234.50' -> '1234.5', '150.00' -> '150'.
    Raises ValueError if `x` is not a number once the commas are removed.
    """
    x = float(x.replace(",", ""))
    # '%.2f' always emits a decimal point, so stripping trailing zeros can
    # never eat into the integer part ('100.00' -> '100.' -> '100').
    s = ('%.2f' % x).rstrip('0').rstrip('.')
    return s


# ========= PDF PARSER =========

def _extract_service_provider(text: str, fallback: str) -> str:
    """Return the first word after 'Service Representative:', else `fallback`."""
    rep_match = re.search(r"Service Representative:\s*(.+)", text)
    if rep_match:
        # split() with no arguments ignores surrounding whitespace and gives []
        # for a whitespace-only capture, so the [0] lookup cannot fail.
        words = rep_match.group(1).split()
        if words:
            return words[0]
    return fallback


def _extract_service_date(text: str, position: int = 2):
    """Return the mm/dd/yyyy match at index `position` formatted as 'MM.DD'.

    Returns None when the text holds too few dates or the selected match does
    not parse as a date.
    """
    date_matches = re.findall(r"\b(\d{1,2}/\d{1,2}/\d{4})\b", text)
    # NOTE(review): index 2 (the third date) is carried over from the original
    # code; presumably the earlier matches are header dates - confirm against
    # a sample invoice.
    if len(date_matches) <= position:
        return None
    parsed = safe_parse_date(date_matches[position])
    return parsed.strftime("%m.%d") if parsed else None


def parse_valta_pdf(path: Path):
    """
    Parse a Valta-style PDF invoice and build its summary row.

    Fields are taken from:
      - the file name: the second '_'-separated part of the stem is matched
        against Mapping["InvoiceAbbr"] to obtain the listing
      - 'Service Representative: <provider>' (first word only; the file stem
        is used when the label is missing)
      - the third mm/dd/yyyy in the text, e.g. '11/21/2025', as service date
      - a total line like 'Total $55.00'

    Returns a dict with the keys total_amount, listing, date_of_service,
    invoicecontent and filename. Returns None, after printing the reason,
    when the PDF has no extractable text or when the listing, service date
    or total cannot be determined.
    """
    print(f"Parsing PDF: {path.name}")

    with pdfplumber.open(path) as pdf:
        pages_text = [page.extract_text() or "" for page in pdf.pages]

    text = "\n".join(pages_text)

    if not text.strip():
        return None

    # ----- SERVICE PROPERTY (from file name) -----
    # Use the stem so a two-part name like 'Invoice 2025_C19.pdf' still yields 'C19'.
    name_parts = path.stem.split("_")
    if len(name_parts) < 2:
        print(f"  Skipped {path.name}: no property abbreviation in file name")
        return None
    invoice_abbr = name_parts[1]
    listing_matches = Mapping.loc[
        Mapping["InvoiceAbbr"].astype(str) == invoice_abbr, "Listing"
    ].values
    if len(listing_matches) == 0:
        print(f"  Skipped {path.name}: '{invoice_abbr}' not found in Mapping")
        return None
    listing = listing_matches[0]

    # ----- SERVICE PROVIDER (from 'Service Representative:') -----
    service_provider = _extract_service_provider(text, fallback=path.stem)

    # ----- DATE OF SERVICE -----
    date_of_service = _extract_service_date(text)
    if date_of_service is None:
        print(f"  Skipped {path.name}: service date not found")
        return None

    # ----- TOTAL AMOUNT -----
    total_amount = extract_total(text)
    if total_amount is None:
        print(f"  Skipped {path.name}: total amount not found")
        return None

    # ----- SERVICE TYPE -----
    # Use the body text and provider to infer type
    service_type = infer_service_type(text + " " + service_provider)

    invoicecontent = service_provider + "_" + service_type

    # <today>_<listing>_Valta Homes_<MM.DD>_<provider>_<type>_<total>
    new_filename = "_".join([
        date.today().strftime("%Y%m%d"),
        listing,
        "Valta Homes",
        date_of_service,
        invoicecontent,
        total_amount,
    ])

    return {
        "total_amount": total_amount,
        "listing": listing,
        "date_of_service": date_of_service,
        "invoicecontent": invoicecontent,
        "filename": new_filename,
    }

In [6]:
"""# ======= TESTING =======
path = Path("/Users/ylin/My Drive/Cohost/Company Transaction 2025/Valta Homes Unprocessed Invoices/20251129_OSBR_Valta Homes_11.10_21_David_Repair_704.00.pdf")
with pdfplumber.open(path) as pdf:
    pages_text = [page.extract_text() or "" for page in pdf.pages]

text = "\n".join(pages_text)
match = re.search(r"Total\s*\$([\d,\.]+)", text)
match.group(1)
extract_total(text)
"""




In [7]:
rows = []

# Walk INPUT_DIR recursively, in sorted order so the summary rows come out in
# a stable order, keeping only files named 'Invoice*' with a .pdf suffix
# (suffix compared case-insensitively).
for path in sorted(INPUT_DIR.rglob("*")):
    if not path.is_file():
        continue

    if path.suffix.lower() != ".pdf":
        # Skip non-PDFs
        continue

    if not path.name.startswith("Invoice"):
        continue

    row = parse_valta_pdf(path)
    if row:
        # Keep the path relative to INPUT_DIR rather than the bare name:
        # rglob also yields files in sub-folders, and those can only be
        # located again from INPUT_DIR if the sub-folder is preserved.
        row["source_file"] = str(path.relative_to(INPUT_DIR))
        rows.append(row)

df = pd.DataFrame(
    rows,
    columns=[
        "total_amount",
        "listing",
        "date_of_service",
        "invoicecontent",
        "source_file",
        "filename",
    ],
)

df.to_excel(OUTPUT_FILE, index=False)

Parsing PDF: Invoice 20251222_C19_HM.pdf
Parsing PDF: Invoice 20251229_1502_HM.pdf


In [8]:
# Display the parsed invoice summary.
df

Unnamed: 0,total_amount,listing,date_of_service,invoicecontent,source_file,filename
0,682.5,Microsoft 14645-C19,12.22,Invoice 20251222_C19_HM_Repair,Invoice 20251222_C19_HM.pdf,20260104_Microsoft 14645-C19_Valta Homes_12.22...
1,129.09,Seattle 1502,12.29,Jason_Repair,Invoice 20251229_1502_HM.pdf,20260104_Seattle 1502_Valta Homes_12.29_Jason_...


In [5]:
# Rename every parsed invoice in place (same folder) to its generated name.
for _, row in df.iterrows():
    old_path = INPUT_DIR / row["source_file"]
    new_name = f"{row['filename']}.pdf"
    new_path = old_path.with_name(new_name)

    if not old_path.exists():
        print(f"Skipping (not found): {old_path}")
        continue

    # Path.rename silently replaces an existing file on POSIX. Two invoices
    # that resolve to the same generated name would otherwise overwrite each
    # other, so never rename onto an existing target.
    if new_path.exists():
        print(f"Skipping (target already exists): {new_path.name}")
        continue

    print(f"Renaming: {old_path.name} → {new_name}")
    old_path.rename(new_path)

Renaming: Invoice 20251217_6821_LS.pdf → 20251226_Longbranch 6821_Valta Homes_12.17_Jason_Landscaping_150.pdf
Renaming: Invoice 20251210_1621_HM.pdf → 20251226_Bellevue 1621_Valta Homes_12.10_Jason_Repair_146.34.pdf
Renaming: Invoice 20251212_14707_HM.pdf → 20251226_Redmond 14707_Valta Homes_12.12_Jason_Repair_110.03.pdf
Renaming: Invoice 20251130_11641_HM.pdf → 20251226_Redmond 11641_Valta Homes_11.24_Jason_Repair_770.pdf
Renaming: Invoice 20251118_8017_HM.pdf → 20251226_Kirkland 8017_Valta Homes_11.17_Jason_Repair_201.pdf
Renaming: Invoice 20251219_3956_LS.pdf → 20251226_Poulsbo 3956_Valta Homes_12.19_Manuel_Landscaping_130.pdf
Renaming: Invoice 20251211_12834_HM.pdf → 20251226_Seatac 12834_Valta Homes_12.11_Invoice 20251211_12834_HM_Repair_82.5.pdf
Renaming: Invoice 20251211_310_HM.pdf → 20251226_Shelton 310_Valta Homes_12.11_David_Cleaning_82.5.pdf
Renaming: Invoice 20251219_14507_HM.pdf → 20251226_Bellevue 14507_Valta Homes_12.19_Jason_Repair_110.pdf
Renaming: Invoice 20251205_563