# Macro Monthly Features → SBA Loans Merge

This notebook:
1. Loads `macro_monthly_features.csv` (pre-aggregated monthly macro indicators)
2. Loads the SBA loan dataset
3. Merges on `approval_year` and `approval_month`
4. Validates macro coverage (null counts after the merge) and spot-checks merged values against the raw series files
5. Saves the final merged dataset

In [None]:
import pandas as pd
import numpy as np

# Input locations; both are relative paths, resolved against the current working directory.
# Monthly macro indicators, keyed by approval_year / approval_month.
MACRO_PATH = 'macro/macro_monthly_features.csv'
# SBA loan dataset. The final cell writes the merged result back to this same path.
SBA_PATH   = '../sba_loan_with_unemployment.csv'

In [None]:
# ── 1. Load macro monthly features ──
# The macro table carries the merge keys approval_year / approval_month plus
# one column per macro feature.
macro = pd.read_csv(MACRO_PATH)
print(f"Macro shape: {macro.shape}")

# Report the range from the chronologically first and last (year, month) rows.
# Taking min()/max() of the year and month columns independently would pair the
# earliest year with the smallest month number seen anywhere in the table,
# which misreports the range unless the series starts in January and ends in
# December.
_month_ordinal = macro['approval_year'] * 12 + macro['approval_month']
if _month_ordinal.notna().any():
    _first_idx = _month_ordinal.idxmin()
    _last_idx = _month_ordinal.idxmax()
    print(f"Date range: {macro.at[_first_idx, 'approval_year']}/"
          f"{macro.at[_first_idx, 'approval_month']} → "
          f"{macro.at[_last_idx, 'approval_year']}/"
          f"{macro.at[_last_idx, 'approval_month']}")
else:
    # Empty table, or every key is missing: there is no range to report.
    print("Date range: (no dated rows)")
print(f"\nColumns: {list(macro.columns)}")
# Last expression of the cell: the notebook renders the first rows.
macro.head()

In [None]:
# ── 2. Load SBA loan dataset ──
sba = pd.read_csv(SBA_PATH, low_memory=False)
print(f"SBA shape: {sba.shape}")
# NOTE(review): min()/max() run on the column as loaded; if ApprovalDate is
# read as text this is a lexicographic range, not a chronological one — confirm.
_approval_dates = sba['ApprovalDate']
print(f"Approval date range: {_approval_dates.min()} → {_approval_dates.max()}")

# Macro feature columns are every macro column except the two merge keys.
_merge_keys = ('approval_year', 'approval_month')
macro_feature_cols = [col for col in macro.columns if col not in _merge_keys]

# Any of those features already sitting in the SBA frame are removed first, so
# the merge below brings in a single fresh copy of each one.
already_present = [col for col in macro_feature_cols if col in sba.columns]
if already_present:
    print("\n⚠ These macro columns already exist in SBA and will be dropped before re-merging:")
    print(f"  {already_present}")
    sba = sba.drop(already_present, axis='columns')

print(f"\nSBA shape after cleanup: {sba.shape}")

In [None]:
# ── 3. Merge on approval_year & approval_month ──
# A left join keeps every loan row. validate='many_to_one' makes pandas raise
# MergeError if the macro table holds more than one row for a (year, month)
# key; without it such duplicates would silently multiply the matching loan
# rows in the output.
merged = sba.merge(
    macro,
    on=['approval_year', 'approval_month'],
    how='left',
    validate='many_to_one',
)

print(f"Merged shape: {merged.shape}")

# Coverage report. A null macro feature means either the loan's approval month
# has no row in the macro table or the macro value itself is missing.
_macro_nulls = merged[macro_feature_cols].isnull()
print(f"\nNull counts in macro features:")
print(_macro_nulls.sum())
print(f"\nRows with any null macro feature: "
      f"{_macro_nulls.any(axis=1).sum()}")

In [None]:
# ── 4. Spot check merged values against raw CSV files ──
# Maps each merged feature column to the raw series file (under macro/) that
# it is compared against.
SERIES_FILES = {
    'ten_year_treasury': '10_Year_Treasury.csv',
    'one_year_treasury': '1_Year_Treasury.csv',
    'mortgage_30y':      '30_Year_Mortgage.csv',
    'crude_oil':         'Crude_Oil.csv',
    'dow_jones':         'Dow_Jones.csv',
    'fed_fund_rate':     'Fed_Fund_Rate.csv',
    'housing_starts':    'Housing_start.csv',
    'sp500':             'SP500.csv',
}

# Load each raw file and compute monthly averages
raw_monthly = {}
for col_name, filename in SERIES_FILES.items():
    # utf-8-sig drops a leading byte-order mark, if the file has one, so the
    # first header is read as plain 'Date'.
    df = pd.read_csv(f'macro/{filename}', encoding='utf-8-sig')
    df['Date'] = pd.to_datetime(df['Date'])
    # Non-numeric entries become NaN, which mean() skips.
    df['Value'] = pd.to_numeric(df['Value'], errors='coerce')
    df['year'] = df['Date'].dt.year
    df['month'] = df['Date'].dt.month
    raw_monthly[col_name] = df.groupby(['year', 'month'])['Value'].mean()

# Pick a few (year, month) combos to spot check
spot_checks = [(1990, 1), (1995, 6), (2000, 12), (2008, 9), (2013, 5)]

results = []
for year, month in spot_checks:
    in_month = merged[(merged['approval_year'] == year) & (merged['approval_month'] == month)]
    if in_month.empty:
        # No loan was approved in this month, so there is no merged row to
        # compare; report it instead of failing on .iloc[0].
        print(f"⚠ No loans approved in {year}/{month}; skipping spot check")
        continue
    # Every loan in the same month carries the same macro values, so the
    # first row is representative.
    row = in_month.iloc[0]
    for col_name in SERIES_FILES:
        merged_val = row[col_name]
        raw_val = raw_monthly[col_name].get((year, month), np.nan)
        # A missing raw average counts as a mismatch; np.isclose is already
        # False when merged_val is NaN.
        match = "✓" if pd.notna(raw_val) and np.isclose(merged_val, raw_val, rtol=1e-6) else "✗"
        results.append({
            'year': year, 'month': month, 'feature': col_name,
            'merged': round(merged_val, 4), 'raw_avg': round(raw_val, 4) if pd.notna(raw_val) else None,
            'match': match
        })

# Explicit columns keep the frame well-formed (and 'match' addressable) even
# when every spot-check month was skipped.
check_df = pd.DataFrame(
    results,
    columns=['year', 'month', 'feature', 'merged', 'raw_avg', 'match'],
)
n_pass = (check_df['match'] == '✓').sum()
n_total = len(check_df)
print(f"Spot check: {n_pass}/{n_total} values match raw data\n")
# Last expression of the cell: the notebook renders the comparison table.
check_df

In [None]:
# ── 5. Save final merged dataset ──
# NOTE(review): this is the same path as SBA_PATH, so the input file is
# overwritten in place — presumably intentional, since the load step drops
# previously merged macro columns; confirm.
OUTPUT_PATH = '../sba_loan_with_unemployment.csv'
merged.to_csv(OUTPUT_PATH, index=False)

# One print call, one summary item per line.
print(
    f"Saved merged dataset to {OUTPUT_PATH}",
    f"Final shape: {merged.shape}",
    f"Columns: {list(merged.columns)}",
    sep='\n',
)