
# Time-Series Train/Validation/Test Split

This notebook splits `sba_loan_final_v2.csv` chronologically by approval month (derived from `approval_year` and `approval_month`):

- Train: before `2007-01`
- Validation: `2007-01` through `2009-12`
- Test: `2010-01` and later

This ensures the model uses past data to predict future data and isolates the 2007-2009 period for validation.


In [1]:

import os
from pathlib import Path

import pandas as pd

pd.set_option('display.max_columns', 200)

# Project root. Set the SBA_PROJECT_ROOT environment variable to run this notebook
# on another machine; when it is unset, the original location is used unchanged.
PROJECT_ROOT = Path(os.environ.get('SBA_PROJECT_ROOT', '/Users/yangmar/Desktop/MS&E 246/Project'))
DATA_PATH = PROJECT_ROOT / 'sba_loan_final_v2.csv'

# Split boundaries (train is everything strictly before VAL_START)
VAL_START = pd.Timestamp('2007-01-01')
VAL_END = pd.Timestamp('2009-12-31')
TEST_START = pd.Timestamp('2010-01-01')

# Optional save
SAVE_SPLITS = True
OUT_DIR = PROJECT_ROOT / 'modeling' / 'splits'
# Only touch the filesystem when the split files will actually be written
if SAVE_SPLITS:
    OUT_DIR.mkdir(parents=True, exist_ok=True)

print('Data path exists:', DATA_PATH.exists())


Data path exists: True


In [2]:

# Load data
# low_memory=False makes pandas infer each column's dtype from the whole file at once
# instead of chunk by chunk, which avoids mixed-type columns and the DtypeWarning
# the default chunked parser emits for them.
sba = pd.read_csv(DATA_PATH, low_memory=False)

# Fail fast if the columns needed to build the split key are absent
required = {'approval_year', 'approval_month'}
missing = required - set(sba.columns)
if missing:
    raise ValueError(f'Missing required columns: {missing}')

# Normalise to nullable integers; values that cannot be parsed as numbers become <NA>
sba['approval_year'] = pd.to_numeric(sba['approval_year'], errors='coerce').astype('Int64')
sba['approval_month'] = pd.to_numeric(sba['approval_month'], errors='coerce').astype('Int64')

# Build month-start timestamp for splitting
# (the two columns are already Int64 from the lines above, so no further cast is needed)
sba['approval_ym'] = pd.to_datetime(
    {
        'year': sba['approval_year'],
        'month': sba['approval_month'],
        'day': 1,
    },
    errors='coerce'
)

# Report how many rows could not be turned into a valid year-month timestamp
bad_dates = sba['approval_ym'].isna().sum()
print('Rows:', len(sba))
print('Invalid year-month rows:', int(bad_dates))
print('Approval YM range:', sba['approval_ym'].min(), 'to', sba['approval_ym'].max())


Rows: 127475
Invalid year-month rows: 0
Approval YM range: 1990-01-01 00:00:00 to 2014-01-01 00:00:00


  sba = pd.read_csv(DATA_PATH)


In [3]:

# Time-based split
# One boolean selector per chronological window, all derived from the same key column.
approval_ym = sba['approval_ym']
train_mask = approval_ym.lt(VAL_START)
val_mask = approval_ym.between(VAL_START, VAL_END)  # inclusive on both ends
test_mask = approval_ym.ge(TEST_START)

# Materialise independent copies so later edits to a split never touch `sba`
train_df, val_df, test_df = (
    sba.loc[mask].copy() for mask in (train_mask, val_mask, test_mask)
)

n_train, n_val, n_test = len(train_df), len(val_df), len(test_df)
print('Train rows:', n_train)
print('Validation rows:', n_val)
print('Test rows:', n_test)
print('Total in splits:', n_train + n_val + n_test)


Train rows: 74556
Validation rows: 21757
Test rows: 31162
Total in splits: 127475


In [4]:

# Sanity checks to prevent temporal leakage
# An empty split would make max()/min() return NaT and trip the ordering asserts below
# with a misleading "overlap" message, so check for emptiness explicitly first.
assert len(train_df) > 0 and len(val_df) > 0 and len(test_df) > 0, 'One or more splits are empty'
assert train_df['approval_ym'].max() < val_df['approval_ym'].min(), 'Train/Val overlap detected'
assert val_df['approval_ym'].max() < test_df['approval_ym'].min(), 'Val/Test overlap detected'

# Ensure all rows with valid approval_ym are assigned exactly once
has_valid_date = sba['approval_ym'].notna()
assigned = train_mask | val_mask | test_mask
assert assigned[has_valid_date].all(), 'Some valid rows were not assigned to any split'
# Count how many splits each row belongs to; more than one means the windows overlap
split_membership = train_mask.astype(int) + val_mask.astype(int) + test_mask.astype(int)
assert (split_membership <= 1).all(), 'Some rows were assigned to more than one split'

# Check period ranges
summary = pd.DataFrame({
    'split': ['train', 'validation', 'test'],
    'start': [train_df['approval_ym'].min(), val_df['approval_ym'].min(), test_df['approval_ym'].min()],
    'end': [train_df['approval_ym'].max(), val_df['approval_ym'].max(), test_df['approval_ym'].max()],
    'rows': [len(train_df), len(val_df), len(test_df)],
})
summary


Unnamed: 0,split,start,end,rows
0,train,1990-01-01,2006-12-01,74556
1,validation,2007-01-01,2009-12-01,21757
2,test,2010-01-01,2014-01-01,31162


In [5]:

# Optional: save split files
if SAVE_SPLITS:
    train_path = OUT_DIR / 'sba_train_pre2007.csv'
    val_path = OUT_DIR / 'sba_validation_2007_2009.csv'
    test_path = OUT_DIR / 'sba_test_2010_onward.csv'

    # Pair each split with its destination; the helper approval_ym column is
    # dropped on the way out so the saved files do not contain it.
    split_outputs = [(train_df, train_path), (val_df, val_path), (test_df, test_path)]
    for split_df, csv_path in split_outputs:
        split_df.drop(columns=['approval_ym']).to_csv(csv_path, index=False)

    # Report only after every file has been written
    print('Saved:')
    for _, csv_path in split_outputs:
        print(' ', csv_path)


Saved:
  /Users/yangmar/Desktop/MS&E 246/Project/modeling/splits/sba_train_pre2007.csv
  /Users/yangmar/Desktop/MS&E 246/Project/modeling/splits/sba_validation_2007_2009.csv
  /Users/yangmar/Desktop/MS&E 246/Project/modeling/splits/sba_test_2010_onward.csv


In [7]:
# Preview the first five rows of the training split
train_df.head(n=5)

Unnamed: 0,BorrZip,CDC_Zip,ThirdPartyLender_City,ThirdPartyLender_State,ThirdPartyDollars,GrossApproval,ApprovalDate,ApprovalFiscalYear,DeliveryMethod,subpgmdesc,TermInMonths,ProjectCounty,ProjectState,BusinessType,LoanStatus,ChargeOffDate,GrossChargeOffAmount,HasThirdParty,NaicsSector,BorrZip_clean,approval_year,approval_month,county_fips,CDC_Zip_clean,county_unemployment_rate,ten_year_treasury,one_year_treasury,mortgage_30y,crude_oil,dow_jones,fed_fund_rate,housing_starts,sp500,inflation_annual,cpi_u_seasonality_adjusted,business_loan_chargeoff_rate_pct,commercial_bank_loans_leases_billions,consumer_credit_outstanding_millions,core_cpi_excl._food_energy,all_loan_delinquency_rate_pct,personal_savings_rate_pct,consumer_sentiment_index,approval_ym
0,66106,65109,UNKNOWN,UNKNOWN,0,166000,1990-01-02,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,12,WYANDOTTE,KS,INDIVIDUAL,PIF,,0,0,UNKNOWN,66106,1990,1,20209.0,65109,9.7,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0,1990-01-01
1,92507,92106,UNKNOWN,UNKNOWN,0,117000,1990-01-02,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,RIVERSIDE,CA,INDIVIDUAL,PIF,,0,0,UNKNOWN,92507,1990,1,6065.0,92106,5.4,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0,1990-01-01
2,46628,46601,UNKNOWN,UNKNOWN,0,261000,1990-01-03,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,120,ST JOSEPH,IN,CORPORATION,PIF,,0,0,UNKNOWN,46628,1990,1,18141.0,46601,5.3,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0,1990-01-01
3,62946,62704,UNKNOWN,UNKNOWN,0,262000,1990-01-03,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,SALINE,IL,CORPORATION,CHGOFF,2003-03-28,0,0,UNKNOWN,62946,1990,1,17165.0,62704,10.9,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0,1990-01-01
4,84106,84109,UNKNOWN,UNKNOWN,0,154000,1990-01-03,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,SALT LAKE,UT,CORPORATION,PIF,,0,0,UNKNOWN,84106,1990,1,49035.0,84109,4.2,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0,1990-01-01
