In [1]:
from pathlib import Path
import pandas as pd

# Data locations. These are relative paths, so they resolve against the
# directory the notebook kernel is running in.
DATA_DIR = Path('..', 'data')
# Input read with pd.read_csv further down.
RAW_PATH = DATA_DIR.joinpath('boston', 'raw', 'Boston Raw 6.25.csv')
# Input read with pd.read_excel further down.
PROCESSED_PATH = DATA_DIR.joinpath('boston', 'processed', '2025_06 Boston Globe.xlsx')

In [2]:
def dtype_name(dtype: object) -> str:
    """Map a column dtype to a coarse type label.

    Args:
        dtype: A dtype object, such as one value yielded by
            ``DataFrame.dtypes.items()``. This parameter used to be annotated
            as ``pd.Series``, which did not match what the caller in this
            notebook passes.

    Returns:
        ``'int'``, ``'float'``, ``'bool'`` or ``'datetime64[ns]'`` for the
        first ``pd.api.types`` check that succeeds, tried in that order;
        ``'str'`` when none of them match.
    """
    types = pd.api.types
    # Ordered (predicate, label) pairs; the first matching predicate wins.
    checks = (
        (types.is_integer_dtype, 'int'),
        (types.is_float_dtype, 'float'),
        (types.is_bool_dtype, 'bool'),
        # Every dtype accepted by is_datetime64_any_dtype gets this one label.
        (types.is_datetime64_any_dtype, 'datetime64[ns]'),
    )
    for matches, label in checks:
        if matches(dtype):
            return label
    # Fallback for anything not matched above.
    return 'str'

def infer_column_types(df: pd.DataFrame):
    """Describe each column of ``df`` by its coarse type label.

    Returns a list with one single-entry dict per column, in the order of
    ``df.dtypes``, mapping the column label to the string that
    ``dtype_name`` returns for that column's dtype.
    """
    column_types = []
    for column, column_dtype in df.dtypes.items():
        column_types.append({column: dtype_name(column_dtype)})
    return column_types

In [3]:
# Load the raw CSV. low_memory=False is passed so pandas infers each column's
# dtype from the whole file (see the pandas read_csv docs for details).
raw_df = pd.read_csv(filepath_or_buffer=RAW_PATH, low_memory=False)
# One {column: type label} dict per column, in column order.
raw_column_types = infer_column_types(df=raw_df)
# Bare last expression so the notebook displays the result.
raw_column_types

[{'OrderURN': 'str'},
 {'CustomerURN': 'str'},
 {'Customer_Number': 'float'},
 {'Customer_Name': 'str'},
 {'Agency_URN': 'str'},
 {'Agency_Number': 'float'},
 {'Agency_Name': 'str'},
 {'TitleType1': 'str'},
 {'Title': 'str'},
 {'PageGroup': 'str'},
 {'Class': 'str'},
 {'Position': 'str'},
 {'Style': 'str'},
 {'Border': 'str'},
 {'FT_Campaign_ID': 'float'},
 {'lineitem_id': 'float'},
 {'Insert_Date': 'str'},
 {'Stop_Date': 'str'},
 {'Number_Dates': 'float'},
 {'Size': 'str'},
 {'HJ_Columns': 'int'},
 {'HJ_Depth': 'float'},
 {'HJ_Width': 'float'},
 {'HJ_Lines': 'float'},
 {'Insert_Net_Price': 'float'},
 {'Insert_Gross_Price': 'float'},
 {'Insert_Tax': 'float'},
 {'Insert_Tax_Rate': 'float'},
 {'Row_Net_Price': 'float'},
 {'Row_Gross_Price': 'float'},
 {'Reason_Code': 'str'},
 {'Reason_Description': 'str'},
 {'Ad_Color': 'str'},
 {'PONumber': 'str'},
 {'StyleType': 'str'},
 {'First_Date': 'str'},
 {'Last_Date': 'str'},
 {'Edzone': 'str'},
 {'Invoice_Text': 'str'},
 {'Physical_Inserts': 'f

In [4]:
# Load the processed workbook with pandas' default read_excel settings.
processed_df = pd.read_excel(io=PROCESSED_PATH)
# Column labels as a plain Python list, in workbook order.
processed_columns = processed_df.columns.to_list()
# Bare last expression so the notebook displays the result.
processed_columns

['OrderURN',
 'CustomerURN',
 'Customer_Number',
 'Customer_Name',
 'Agency_URN',
 'Agency_Number',
 'Agency_Name',
 'TitleType1',
 'Title',
 'PageGroup',
 'Class',
 'Position',
 'Style',
 'Border',
 'FT_Campaign_ID',
 'Insert_Date',
 'Stop_Date',
 'Number_Dates',
 'Size',
 'HJ_Columns',
 'HJ_Depth',
 'HJ_Width',
 'HJ_Lines',
 'Insert_Net_Price',
 'Insert_Gross_Price',
 'Insert_Tax',
 'Insert_Tax_Rate',
 'Row_Net_Price',
 'Row_Gross_Price',
 'Reason_Code',
 'Reason_Description',
 'Ad_Color',
 'PONumber',
 'StyleType',
 'First_Date',
 'Last_Date',
 'Edzone',
 'Invoice_Text',
 'Physical_Inserts',
 'Number_Of_Pages',
 'Advertiser_Type',
 'Create_Time',
 'UpdateTime',
 'Booking_Notes',
 'PackageName',
 'Payment',
 'AdSource',
 'Scrutiny',
 'ImmigrationAD',
 'SummaryClass',
 'External_AD_ID',
 'OrderKeyer',
 'Team_Keyer',
 'OperatorName',
 'Scrutiny_Release_Operator',
 'Team_Name',
 'OrderTaker',
 'Sales_Rep',
 'SRWork_Responsibility',
 'HouseAD',
 'Insert_Text_Version',
 'Contract_ID',
 'R