# Invoice By Month Workflow
This notebook processes monthly invoice files, extracts property information, maps properties, and copies files to target locations.

In [2]:
# Import Required Libraries
import os
import pandas as pd
import shutil
import glob
import re
from openpyxl import load_workbook

## Define Helper Functions
All helper functions for file and data operations are defined below.

In [188]:
def get_listings():
    file_path = "/Users/ylin/Google Drive/My Drive/Cohost/Cohost Cleaner Compensation/Working/Data/Property_Cohost.xlsx"
    cohost = pd.read_excel(file_path)
    return cohost.Listing.unique().tolist()
  
def collect_paths(oldloc):
    loc1 = os.listdir(oldloc)
    loc1.remove(".DS_Store")
    pathes = {'loc1':loc1,'fullpath':[oldloc+"/"+x for x in loc1]}
    for k in loc1:
        loc2 = [d for d in os.listdir(oldloc+"/"+k) if os.path.isdir(oldloc+"/"+k+"/"+d) and not d.startswith('.DS_Store')]
        for d in loc2:
            pathes.append({'loc1': k, 'fullpath': d})
    return pd.DataFrame(pathes)

def collect_files(pathes):
    files = []
    for _, row in pathes.iterrows():
        tmp = os.listdir(row['fullpath'])
        for f in tmp:
            files.append({'loc1': row['loc1'], 'fullpath': row['fullpath'], 'file': f})
    return pd.DataFrame(files)

def extract_property1(files):
    def get_property1(x):
        parts = x.split('_')
        if len(parts)>1:
            txts = parts[1]
        else: 
            txts = None
        return txts
    files['property1'] = files['file'].apply(get_property1)
    return files

def assign_property(files, listings):
    files['property'] = None
    for idx, row in files.iterrows():
        tmp = False
        for x in listings:
            y = ' '.join(x.split(' ')[::-1])
            if bool(re.search(x, row['file']) or re.search(y, row['file'])):
                files.at[idx, 'property'] = x
                break 
        if bool(re.search('Booking.com|booking.com',row['file'])):         
            files.at[idx, 'property'] = 'BookingCommission'
        if bool(re.search('marketing|Marketing',row['file'])):         
            files.at[idx, 'property'] = 'Valta Realty'   
    return files

def save_tracking_csv(files, drv_loc, month):
    out_path = os.path.join(drv_loc, "Data and Reporting/04-Accounting/MonthlyInvoiceMigration/Filestracking", f"transactions_{month}.csv")
    files.to_csv(out_path, index=False, na_rep='')

def read_folder_paths(drv_loc):
    xlsx_path = os.path.join(drv_loc, "Data and Reporting/04-Accounting/MonthlyInvoiceMigration/Data/FolderPaths.xlsx")
    df = pd.read_excel(xlsx_path)
    df['loc'] = df['loc'].str.replace('2025', '2024')
    return df

def merge_files_loc(files, fileloc):
    files_loc = pd.merge(files, fileloc, left_on='property', right_on='listing', how='left')
    files_loc = files_loc[['property', 'loc', 'file', 'fullpath']]
    return files_loc

def copy_files(files_loc):
    for idx, row in files_loc.iterrows():
        if pd.notna(row['loc']) and row['property'] not in ["Valta Realty", "BookingCommission"]:
            src = os.path.join(row['fullpath'], row['file'])
            dst = os.path.join(row['loc'], "Invoice", row['file'])
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copy2(src, dst)
        elif pd.notna(row['loc']) and row['property'] in ["Valta Realty", "BookingCommission"]:
            src = os.path.join(row['fullpath'], row['file'])
            dst = os.path.join(row['loc'], row['file'])
            os.makedirs(row['loc'], exist_ok=True)
            shutil.copy2(src, dst)

def check_new_files(files_loc):
    newfiles = []
    for k in files_loc['loc'].dropna().unique():
        prop = files_loc.loc[files_loc['loc'] == k, 'property']
        if not any(prop.isin(["Valta Realty", "BookingCommission"])):
            tmp = os.listdir(os.path.join(k, "Invoice"))
            newfiles.extend([{'newpath': os.path.join(k, "Invoice"), 'file': f} for f in tmp])
        else:
            tmp = os.listdir(k)
            newfiles.extend([{'newpath': k, 'file': f} for f in tmp])
    newfiles_df = pd.DataFrame(newfiles)
    missing = set(files_loc.loc[files_loc['loc'].notna(), 'file']) - set(newfiles_df['file'])
    return missing

## Set Parameters and Paths
Set variables for the month, drive location, and old location.

In [None]:
Month = "2025-06"
drv_loc = "/Users/ylin/Google Drive/My Drive/Cohost/"
oldloc = drv_loc+"Accounting/Company Transactions/2025/"+Month
oldloc = f"{drv_loc}Accounting/Company Transactions/2025/{Month}"
listings = get_listings()

## Collect Directory Paths
Use collect_paths to get subdirectory paths for the selected month.

In [183]:
pathes = collect_paths(oldloc)


## Collect Files from Directories
Use collect_files to list files in each directory path.

In [123]:
files = collect_files(pathes)
files.tail()

Unnamed: 0,loc1,fullpath,file
194,Credit 5565,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250613 legal certified mail to gray harbor d...
195,Credit 5565,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250613 office supplies label maker 61.74.pdf
196,Credit 5565,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250607 meal team with client Feifeili 495.44...
197,Credit 5565,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250602_Beachwood_Legal Cintas Fire Inspectio...
198,Credit 5565,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250602_Beachwood_Legal Cintas Fire Inspectio...


## Extract Property Information from Filenames
Use extract_property1 to parse filenames and extract property information.

In [143]:
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
files = extract_property1(files)
#files[["property1", "file"]]
#[d.split('_')[1] if len(d.split("_"))>1 else None for d in files.file ]


## Assign Properties to Files
Use assign_property to match files to property names using listings.

In [None]:
files = assign_property(files, listings)

In [None]:
txts = ["Longbranch","Hoodsports","Keaau","Lilliwaup"] + [f"OSBR {i}" for i in range(1,13)] + \
["Ocean Spray 8","Ocean Spray","Microsoft D303","OSBR","Seatac","Osbr","OSBR 11","Burien 14407 Middle",
 "Burien 14407 Top","Beachwood 6","Mercer Island 2449","13020","Microsoft 14620 E205","Microsoft 14645 C19",
 "Microsoft E205","Jing Properties","Seatac 12934","Kirkkland 10219"]
chngs = ["Longbranch 6821","Hoodsport 26060","Keaau 15-1542","Lilliwaup 28610"] +\
 [f"Cottage {i}" for i in range(1,13)] + ["Cottage 8","OSBR","Microsoft 14615-D303","OSBR","Seatac 12834",
  "OSBR","Cottage 11 (tiny)","Burien 14407 middle","Burien 14407 top","Beachwood 6","Mercer 2449",
  "Bellevue 13020","Microsoft 14620-E205", "Microsoft 14645-C19","Microsoft 14620-E205","Seatac 12834",
  "Seatac 12834","Kirkland 10219"]


Unnamed: 0,loc1,fullpath,file,property,property1
12,Credit 3104,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250604_Hoodsports_Amazon_Bed Frame_114.32.pdf,,Hoodsports


## Map Properties to Standard Names
Use map_properties to standardize property names according to mapping rules.

In [213]:
def manual_map_properties(files):
    txts = ["Longbranch","Hoodsports","Keaau","Lilliwaup"] + [f"OSBR {i}" for i in range(1,13)] + \
            ["Ocean Spray 8","Ocean Spray","Microsoft D303","OSBR","Seatac","Osbr","OSBR 11","Burien 14407 Middle",
        "Burien 14407 Top","Beachwood 6","Mercer Island 2449","13020","Microsoft 14620 E205","Microsoft 14645 C19",
        "Microsoft E205","Jing Properties","Seatac 12934","Kirkkland 10219","Shelton310"]
    chngs = ["Longbranch 6821","Hoodsport 26060","Keaau 15-1542","Lilliwaup 28610"] +\
        [f"Cottage {i}" for i in range(1,13)] + ["Cottage 8","OSBR","Microsoft 14615-D303","OSBR","Seatac 12834",
        "OSBR","Cottage 11 (tiny)","Burien 14407 middle","Burien 14407 top","Beachwood 6","Mercer 2449",
        "Bellevue 13020","Microsoft 14620-E205", "Microsoft 14645-C19","Microsoft 14620-E205","Seatac 12834",
        "Seatac 12834","Kirkland 10219","Shelton 310"]
    for t, c in zip(txts, chngs):
        idx = files['property1'] == t
        files.loc[idx, 'property'] = c
    return files

files = manual_map_properties(files)
#files[files['file'].str.contains('Elektra', case=False, regex=True)]
files[files['property'].isna()]

Unnamed: 0,loc1,fullpath,file,property,property1
0,3038,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250630 Legal James Justason case Frank as co...,,1.PNG
1,3038,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250630 Legal James Justason case Frank as co...,,2.PNG
2,3038,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250630 Legal kyle freeman case Frank as cons...,,1.PNG
3,3038,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250630 Legal kyle freeman case Frank as cons...,,2.PNG
4,7197,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20260623 Tax WA exercise 122.27.pdf,,
5,7197,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250609 Valta Cohost Wages for trash collecti...,,
8,Credit 3104,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250606_Costco Issaquah_Supplies_$272.62.jpeg,,Costco Issaquah
13,Credit 3104,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250630_OSBR laundry_Amazon_32.66.pdf,,OSBR laundry
15,Credit 3104,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250624_Costco Issaquah_Supplies_317.00.jpeg,,Costco Issaquah
22,Credit 3104,/Users/ylin/Google Drive/My Drive/Cohost/Accou...,20250603_Target_All unit supply_14.49.jpg,,Target


## Save Tracking CSV
Use save_tracking_csv to save the processed file data to a CSV file.

In [None]:
save_tracking_csv(files, drv_loc, Month)

## Read Folder Paths from Excel
Use read_folder_paths to load folder location data from an Excel file.

In [None]:
fileloc = read_folder_paths(drv_loc)
fileloc.head()

## Merge File and Location Data
Use merge_files_loc to merge file data with folder location data.

In [None]:
files_loc = merge_files_loc(files, fileloc)
files_loc.head()

## Copy Files to Target Locations
Use copy_files to copy files to their respective target folders.

In [None]:
copy_files(files_loc)

## Check for Missing Files
Use check_new_files to identify files that were not copied successfully.

In [None]:
missing = check_new_files(files_loc)
print("Missing files:", missing)

## Run Main Workflow
Combine all steps above to execute the full workflow and print missing files.

In [None]:
def main():
    Month = "2024-06"
    drv_loc = "/Users/ylin/Google Drive/My Drive/Cohost/"
    oldloc = os.path.join(drv_loc, "Accounting/Company Transactions/2024", Month)
    listings = get_listings(os.path.join(drv_loc, "Data and Reporting/04-Accounting/MonthlyInvoiceMigration/Codes/Functions.R"))
    pathes = collect_paths(oldloc)
    files = collect_files(pathes)
    files = extract_property1(files, listings)
    files = assign_property(files, listings)
    files = map_properties(files)
    save_tracking_csv(files, drv_loc, Month)
    fileloc = read_folder_paths(drv_loc)
    files_loc = merge_files_loc(files, fileloc)
    copy_files(files_loc)
    missing = check_new_files(files_loc)
    print("Missing files:", missing)

main()