
# Merge All Datasets in `more_macro`

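This notebook loads every CSV in `additional_dataset/more_macro/`, creates
`approval_year` and `approval_month` from each series date, and outer-merges all
series into a single monthly table. The two quarterly series are then
forward-filled to monthly frequency, and the merged table is left-joined onto
the SBA loan data on `approval_year` and `approval_month`.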


In [28]:

import os
from pathlib import Path

import pandas as pd

pd.set_option('display.max_columns', 200)

# The project root defaults to the original author's location. Set the
# MSE246_PROJECT_ROOT environment variable to run the notebook from another
# machine or checkout without editing this cell.
PROJECT_ROOT = Path(
    os.environ.get('MSE246_PROJECT_ROOT', '/Users/yangmar/Desktop/MS&E 246/Project')
)
MACRO_DIR = PROJECT_ROOT / 'additional_dataset' / 'more_macro'
OUT_PATH = MACRO_DIR / 'more_macro_merged_monthly.csv'

# Preferred output column names by FRED series code.
# A code that is missing here falls back to a name derived from the file stem.
FRIENDLY_NAME_MAP = {
    'BUSLOANS': 'commercial_bank_loans_leases_billions',
    'TOTALSL': 'consumer_credit_outstanding_millions',
    'PSAVERT': 'personal_savings_rate_pct',
    'UMCSENT': 'consumer_sentiment_index',
    'CORBLACBS': 'business_loan_chargeoff_rate_pct',
    'DRALACBS': 'all_loan_delinquency_rate_pct',
    'TDSP': 'household_debt_service_ratio_pct',
    'CPIAUCSL': 'cpi_u_seasonality_adjusted',
    'CPILFESL': 'core_cpi_excl._food_energy',
}

print('Macro folder:', MACRO_DIR)
# OUT_PATH lives inside MACRO_DIR, so leave it out of the count: once the
# notebook has been run it would otherwise be reported as a source series.
print('CSV count:', sum(1 for p in MACRO_DIR.glob('*.csv') if p.name != OUT_PATH.name))


Macro folder: /Users/yangmar/Desktop/MS&E 246/Project/additional_dataset/more_macro
CSV count: 8


In [29]:

# Load and normalize each source CSV to: approval_year, approval_month, <series>.
# Every file must contain exactly one date column and one value column.
series_tables = []
coverage = []

for csv_path in sorted(MACRO_DIR.glob('*.csv')):
    # The merged table is saved into MACRO_DIR as well; skip it so that
    # re-running the notebook does not try to parse its own output as a
    # source series (it has no date column and would raise below).
    if csv_path.name == OUT_PATH.name:
        continue

    df = pd.read_csv(csv_path)
    # Strip a BOM and stray whitespace that would break header matching.
    df.columns = [str(c).replace('\ufeff', '').strip() for c in df.columns]

    date_col = next((c for c in df.columns if c.lower() in ('observation_date', 'date')), None)
    if date_col is None:
        raise ValueError(f'{csv_path.name}: no date column found. Columns={list(df.columns)}')

    value_cols = [c for c in df.columns if c != date_col]
    if len(value_cols) != 1:
        raise ValueError(f'{csv_path.name}: expected exactly one value column, got {value_cols}')

    raw_value_col = value_cols[0]
    friendly_col = FRIENDLY_NAME_MAP.get(raw_value_col)

    if not friendly_col:
        # Fallback: use file stem if a code is not in the map
        friendly_col = csv_path.stem.lower().replace('-', '_').replace(' ', '_')

    # Unparseable dates/values are coerced to NaT/NaN and those rows dropped.
    x = pd.DataFrame({
        'date': pd.to_datetime(df[date_col], errors='coerce'),
        friendly_col: pd.to_numeric(df[raw_value_col], errors='coerce')
    }).dropna(subset=['date', friendly_col])

    if x.empty:
        raise ValueError(f'{csv_path.name}: no rows with a valid date and numeric value')

    x['approval_year'] = x['date'].dt.year.astype('int64')
    x['approval_month'] = x['date'].dt.month.astype('int64')

    # If series has >1 point in a month (e.g., daily/weekly), collapse to monthly mean.
    x = (
        x.groupby(['approval_year', 'approval_month'], as_index=False)[friendly_col]
        .mean()
        .sort_values(['approval_year', 'approval_month'])
    )

    series_tables.append(x)

    # Coverage bounds come from the first/last chronological row. Taking
    # min()/max() of the month column on its own would report the smallest and
    # largest month seen in any year, not the series' actual start and end.
    first, last = x.iloc[0], x.iloc[-1]
    coverage.append((
        csv_path.name,
        raw_value_col,
        friendly_col,
        int(first['approval_year']),
        int(first['approval_month']),
        int(last['approval_year']),
        int(last['approval_month']),
        len(x),
    ))

coverage_df = pd.DataFrame(
    coverage,
    columns=['file', 'raw_series_col', 'friendly_col', 'start_year', 'start_month', 'end_year', 'end_month', 'rows']
)
display(coverage_df)


Unnamed: 0,file,raw_series_col,friendly_col,start_year,start_month,end_year,end_month,rows
0,CPI-U_seasonality_adjusted.csv,CPIAUCSL,cpi_u_seasonality_adjusted,1990,1,2014,12,300
1,Charge-off_rate_on_business_loans.csv,CORBLACBS,business_loan_chargeoff_rate_pct,1990,1,2014,10,100
2,Commercial_bank_loans_and_leases.csv,BUSLOANS,commercial_bank_loans_leases_billions,1990,1,2014,12,300
3,Consumer_credit_outstanding.csv,TOTALSL,consumer_credit_outstanding_millions,1990,1,2014,12,300
4,Core_CPI_excl._food_energy.csv,CPILFESL,core_cpi_excl._food_energy,1990,1,2014,12,300
5,Delinquency_rate_on_all_loan.csv,DRALACBS,all_loan_delinquency_rate_pct,1990,1,2014,10,100
6,Personal_savings_rate.csv,PSAVERT,personal_savings_rate_pct,1990,1,2014,12,300
7,UMichigan_Consumer_Sentiment.csv,UMCSENT,consumer_sentiment_index,1990,1,2014,12,300


In [30]:
# Debug peek: plain-text repr of the list of per-series monthly tables.
series_tables

[     approval_year  approval_month  cpi_u_seasonality_adjusted
 0             1990               1                     127.500
 1             1990               2                     128.000
 2             1990               3                     128.600
 3             1990               4                     128.900
 4             1990               5                     129.100
 ..             ...             ...                         ...
 295           2014               8                     237.460
 296           2014               9                     237.477
 297           2014              10                     237.430
 298           2014              11                     236.983
 299           2014              12                     236.252
 
 [300 rows x 3 columns],
     approval_year  approval_month  business_loan_chargeoff_rate_pct
 0            1990               1                              1.41
 1            1990               4                              1.4

In [31]:

# Outer-merge all datasets on approval_year + approval_month
merge_keys = ['approval_year', 'approval_month']

if not series_tables:
    raise ValueError('No series tables to merge; check that MACRO_DIR contains source CSVs.')

# Two files resolving to the same output column would be silently renamed with
# _x/_y suffixes by merge, so fail loudly instead.
value_cols = [c for t in series_tables for c in t.columns if c not in merge_keys]
duplicate_cols = sorted({c for c in value_cols if value_cols.count(c) > 1})
if duplicate_cols:
    raise ValueError(f'Duplicate series column names across files: {duplicate_cols}')

merged = series_tables[0].copy()
for t in series_tables[1:]:
    # Each table holds one row per (year, month); validate guards that invariant
    # so the outer merge can never fan out rows.
    merged = merged.merge(t, on=merge_keys, how='outer', validate='one_to_one')

merged = merged.sort_values(merge_keys).reset_index(drop=True)

print('Merged shape:', merged.shape)
print('Columns:', merged.columns.tolist())
merged.tail(12)


Merged shape: (300, 10)
Columns: ['approval_year', 'approval_month', 'cpi_u_seasonality_adjusted', 'business_loan_chargeoff_rate_pct', 'commercial_bank_loans_leases_billions', 'consumer_credit_outstanding_millions', 'core_cpi_excl._food_energy', 'all_loan_delinquency_rate_pct', 'personal_savings_rate_pct', 'consumer_sentiment_index']


Unnamed: 0,approval_year,approval_month,cpi_u_seasonality_adjusted,business_loan_chargeoff_rate_pct,commercial_bank_loans_leases_billions,consumer_credit_outstanding_millions,core_cpi_excl._food_energy,all_loan_delinquency_rate_pct,personal_savings_rate_pct,consumer_sentiment_index
288,2014,1,235.288,0.23,1586.7954,3113002.96,235.961,3.31,5.2,81.2
289,2014,2,235.547,,1619.7096,3133891.8,236.185,,5.3,81.6
290,2014,3,236.028,,1633.3176,3150697.02,236.625,,5.2,80.0
291,2014,4,236.468,0.21,1651.1911,3173234.63,237.072,3.08,5.4,84.1
292,2014,5,236.918,,1663.6926,3192546.34,237.529,,5.6,81.9
293,2014,6,237.231,,1676.8043,3209899.68,237.837,,5.6,82.5
294,2014,7,237.498,0.2,1694.9147,3231535.62,238.195,2.9,5.5,81.8
295,2014,8,237.46,,1711.7561,3248721.35,238.405,,5.3,82.5
296,2014,9,237.477,,1725.7351,3265101.43,238.786,,5.5,84.6
297,2014,10,237.43,0.22,1730.8096,3275438.44,239.191,2.7,5.4,86.9


In [32]:
# These two series are quarterly, so after the monthly merge they only have a
# value in the first month of each quarter. Carry that value forward into the
# following months. This relies on `merged` being sorted chronologically.
quarterly_cols = ['business_loan_chargeoff_rate_pct', 'all_loan_delinquency_rate_pct']
merged[quarterly_cols] = merged[quarterly_cols].ffill()

In [33]:
# Re-inspect the last rows to confirm the forward fill closed the gaps in the
# charge-off and delinquency columns.
merged.tail(12)

Unnamed: 0,approval_year,approval_month,cpi_u_seasonality_adjusted,business_loan_chargeoff_rate_pct,commercial_bank_loans_leases_billions,consumer_credit_outstanding_millions,core_cpi_excl._food_energy,all_loan_delinquency_rate_pct,personal_savings_rate_pct,consumer_sentiment_index
288,2014,1,235.288,0.23,1586.7954,3113002.96,235.961,3.31,5.2,81.2
289,2014,2,235.547,0.23,1619.7096,3133891.8,236.185,3.31,5.3,81.6
290,2014,3,236.028,0.23,1633.3176,3150697.02,236.625,3.31,5.2,80.0
291,2014,4,236.468,0.21,1651.1911,3173234.63,237.072,3.08,5.4,84.1
292,2014,5,236.918,0.21,1663.6926,3192546.34,237.529,3.08,5.6,81.9
293,2014,6,237.231,0.21,1676.8043,3209899.68,237.837,3.08,5.6,82.5
294,2014,7,237.498,0.2,1694.9147,3231535.62,238.195,2.9,5.5,81.8
295,2014,8,237.46,0.2,1711.7561,3248721.35,238.405,2.9,5.3,82.5
296,2014,9,237.477,0.2,1725.7351,3265101.43,238.786,2.9,5.5,84.6
297,2014,10,237.43,0.22,1730.8096,3275438.44,239.191,2.7,5.4,86.9


In [34]:

# Save merged monthly macro table (without the positional index).
# NOTE: OUT_PATH is inside MACRO_DIR, the same folder that is globbed for
# source CSVs, so this file must not be treated as an input series on re-runs.
merged.to_csv(OUT_PATH, index=False)
print('Saved:', OUT_PATH)


Saved: /Users/yangmar/Desktop/MS&E 246/Project/additional_dataset/more_macro/more_macro_merged_monthly.csv


In [35]:
# Attach the monthly macro features to every SBA loan by approval year/month.
# NOTE(review): the saved output shows a warning pointing at this read_csv
# call - presumably a DtypeWarning for mixed-type columns. Consider passing an
# explicit dtype= once the loan schema is confirmed.
sba = pd.read_csv(PROJECT_ROOT / 'sba_loan_final_v1.csv')

# `merged` must hold at most one row per (approval_year, approval_month).
# validate='many_to_one' raises if that is violated, so the left join can never
# silently duplicate loan rows.
res = sba.merge(
    merged,
    how='left',
    on=['approval_year', 'approval_month'],
    validate='many_to_one',
)

  sba = pd.read_csv("/Users/yangmar/Desktop/MS&E 246/Project/sba_loan_final_v1.csv")


In [36]:
# Spot-check the joined frame: the macro columns should appear after the
# existing loan columns.
res.head(10)

Unnamed: 0,BorrZip,CDC_Zip,ThirdPartyLender_City,ThirdPartyLender_State,ThirdPartyDollars,GrossApproval,ApprovalDate,ApprovalFiscalYear,DeliveryMethod,subpgmdesc,TermInMonths,ProjectCounty,ProjectState,BusinessType,LoanStatus,ChargeOffDate,GrossChargeOffAmount,HasThirdParty,NaicsSector,BorrZip_clean,approval_year,approval_month,county_fips,CDC_Zip_clean,county_unemployment_rate,ten_year_treasury,one_year_treasury,mortgage_30y,crude_oil,dow_jones,fed_fund_rate,housing_starts,sp500,inflation_annual,cpi_u_seasonality_adjusted,business_loan_chargeoff_rate_pct,commercial_bank_loans_leases_billions,consumer_credit_outstanding_millions,core_cpi_excl._food_energy,all_loan_delinquency_rate_pct,personal_savings_rate_pct,consumer_sentiment_index
0,66106,65109,UNKNOWN,UNKNOWN,0,166000,1990-01-02,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,12,WYANDOTTE,KS,INDIVIDUAL,PIF,,0,0,UNKNOWN,66106,1990,1,20209.0,65109,9.7,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0
1,92507,92106,UNKNOWN,UNKNOWN,0,117000,1990-01-02,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,RIVERSIDE,CA,INDIVIDUAL,PIF,,0,0,UNKNOWN,92507,1990,1,6065.0,92106,5.4,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0
2,46628,46601,UNKNOWN,UNKNOWN,0,261000,1990-01-03,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,120,ST JOSEPH,IN,CORPORATION,PIF,,0,0,UNKNOWN,46628,1990,1,18141.0,46601,5.3,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0
3,62946,62704,UNKNOWN,UNKNOWN,0,262000,1990-01-03,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,SALINE,IL,CORPORATION,CHGOFF,2003-03-28,0,0,UNKNOWN,62946,1990,1,17165.0,62704,10.9,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0
4,84106,84109,UNKNOWN,UNKNOWN,0,154000,1990-01-03,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,SALT LAKE,UT,CORPORATION,PIF,,0,0,UNKNOWN,84106,1990,1,49035.0,84109,4.2,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0
5,84104,84109,UNKNOWN,UNKNOWN,0,135000,1990-01-03,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,SALT LAKE,UT,CORPORATION,PIF,,0,0,51,84104,1990,1,49035.0,84109,4.2,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0
6,43215,43215,UNKNOWN,UNKNOWN,0,300000,1990-01-03,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,FRANKLIN,OH,CORPORATION,PIF,,0,0,UNKNOWN,43215,1990,1,39049.0,43215,4.0,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0
7,45427,45402,UNKNOWN,UNKNOWN,0,284000,1990-01-04,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,MONTGOMERY,OH,CORPORATION,PIF,,0,0,UNKNOWN,45427,1990,1,39113.0,45402,6.0,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0
8,23860,23805,UNKNOWN,UNKNOWN,0,479000,1990-01-05,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,HOPEWELL CITY,VA,CORPORATION,PIF,,0,0,23,23860,1990,1,51670.0,23805,6.0,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0
9,54701,53716,UNKNOWN,UNKNOWN,0,750000,1990-01-05,1990,504,Sec. 504 - Loan Guarantees - Private Sector Fi...,240,EAU CLAIRE,WI,CORPORATION,PIF,,0,0,UNKNOWN,54701,1990,1,55035.0,53716,4.6,8.206667,7.920952,9.926,22.69,2590.54,8.229032,1551.0,329.08,6.11,127.5,1.41,635.7875,797714.86,132.1,5.03,7.9,93.0


In [None]:
# Null counts for the last ten columns of the joined frame.
res.isnull().sum().tail(10)

sp500                                    0
inflation_annual                         0
cpi_u_seasonality_adjusted               0
business_loan_chargeoff_rate_pct         0
commercial_bank_loans_leases_billions    0
consumer_credit_outstanding_millions     0
core_cpi_excl._food_energy               0
all_loan_delinquency_rate_pct            0
personal_savings_rate_pct                0
consumer_sentiment_index                 0
dtype: int64

In [39]:
# Save the loan table enriched with macro features, built from PROJECT_ROOT
# like the other project paths instead of repeating the absolute path.
sba_v2_path = PROJECT_ROOT / 'sba_loan_final_v2.csv'
res.to_csv(sba_v2_path, index=False)
print('Saved:', sba_v2_path)