In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import linearmodels as lm
import statsmodels.api as sm
from linearmodels import PanelOLS
from linearmodels import RandomEffects
from linearmodels import PooledOLS
from linearmodels import FirstDifferenceOLS
from linearmodels import BetweenOLS
from linearmodels import FamaMacBeth
import sqlite3
from tqdm import tqdm
import dask.dataframe as dd
from dask import delayed, compute
from dask.diagnostics import ProgressBar

# Merge FM and HRCN to get smaller set (Only states with HRCN risk data)

In [7]:
db_path = "../Database/thesis_database.db"
conn = sqlite3.connect(db_path, check_same_thread=False)
hrcn_risk_agg = pd.read_sql("SELECT * FROM hrcn_risk_agg", conn)

In [8]:
hrcn_risk_agg = pd.read_sql("SELECT * FROM hrcn_risk_agg", conn)

In [9]:
hrcn_risk_agg.head()

Unnamed: 0,MSA,STATE,HRCN_HLRB,HRCN_AFREQ,HRCN_EALS
0,10180,Texas,0.000614,0.009247,29.528751
1,10220,Oklahoma,0.000661,0.005861,23.389537
2,10300,Michigan,0.00039,0.00862,38.95374
3,10420,Ohio,0.000545,0.01293,55.923044
4,10460,New Mexico,0.000254,0.009413,27.280588


In [12]:
# read'../../Data/mainland_usa_gdf_msa_aggregated.pkl'
mortgage_hrcn = {}
#Merge Mortgage data and Hurricane Data on MSA ||| This reduces the datasets by removing all unessential states data
pb = tqdm(range(1999, 2023))
for year in pb:
    pb.set_description(f"Merging {year} hrcn and mortgage data")
    query = f"""
    SELECT *
    FROM fm_{year}
    """
    df_year = pd.read_sql(query, conn) 
    merged_df = df_year.merge(hrcn_risk_agg, on='MSA', how='inner')
    # fm_combined = pd.concat([fm_combined, merged_df])
    key_name = f"fm_{year}"
    mortgage_hrcn[key_name] = merged_df
    del merged_df, df_year
#Runtime: 4:30

Merging 2022 hrcn and mortgage data: 100%|██████████| 24/24 [04:27<00:00, 11.14s/it]


In [13]:
pb = tqdm(mortgage_hrcn.items())
for key, dataset in pb:
    key_hrcn = f"{key}_hrcn_new"
    pb.set_description(f"writing {key_hrcn} to database")
    # Write the dataset to the database
    dataset.to_sql(key_hrcn, conn, if_exists="replace", index=False)
    del dataset

writing fm_2022_hrcn_new to database: 100%|██████████| 24/24 [06:58<00:00, 17.43s/it]


In [12]:
query = """
SELECT *
FROM fm_2000_hrcn
"""
fm_1999_hrcn_new = pd.read_sql(query, conn)
fm_1999_hrcn_new.head()

Unnamed: 0,Date,MSA,LSN,CLDS,AGE,CIR,ELTV,DDD,CS,FPD,...,HRCN_EALPE,HRCN_EALA,HRCN_EALT,HRCN_EALS,HRCN_ALRB,HRCN_ALRP,HRCN_ALRA,HRCN_ALR_N,HRCN_RISKV,HRCN_RISKS
0,2001-09-01 00:00:00,39300,F00Q10000054,0,0,7.125,,0,762,200110,...,279546.490849,55272.025502,14809840.0,86.979392,0.00033,1.050044e-07,0.003032,78.202911,15621230.0,87.34111
1,2001-10-01 00:00:00,39300,F00Q10000054,0,1,7.125,,0,762,200110,...,279546.490849,55272.025502,14809840.0,86.979392,0.00033,1.050044e-07,0.003032,78.202911,15621230.0,87.34111
2,2001-11-01 00:00:00,39300,F00Q10000054,0,2,7.125,,0,762,200110,...,279546.490849,55272.025502,14809840.0,86.979392,0.00033,1.050044e-07,0.003032,78.202911,15621230.0,87.34111
3,2001-12-01 00:00:00,39300,F00Q10000054,0,3,7.125,,0,762,200110,...,279546.490849,55272.025502,14809840.0,86.979392,0.00033,1.050044e-07,0.003032,78.202911,15621230.0,87.34111
4,2000-03-01 00:00:00,39300,F00Q10004108,0,0,8.75,,0,772,200004,...,279546.490849,55272.025502,14809840.0,86.979392,0.00033,1.050044e-07,0.003032,78.202911,15621230.0,87.34111


# Combine Files

In [15]:
# (if not, you'll need to set that up)
def fetch_and_merge(year):
    query = f"""
    SELECT *
    FROM fm_{year}_hrcn
    """
    merged_df = dd.from_pandas(pd.read_sql(query, conn), npartitions=10)
    return merged_df

results = [delayed(fetch_and_merge)(year) for year in range(1999, 2023)]

# Compute the results in parallel
with ProgressBar():
    merged_dataframes = compute(*results, scheduler='single-threaded')
# Ensure merged_dataframes is a list
merged_dataframes_list = list(merged_dataframes)
fm_combined = dd.concat(merged_dataframes_list)


[########################################] | 100% Completed | 246.77 s


In [16]:
fm_combined.to_parquet('../Data/fm_combined2.parquet', engine='pyarrow')

# Aggregate Dataset

In [2]:
fm_combined = dd.read_parquet('../Data/fm_combined.parquet')

In [5]:
fm_combined.columns

Index(['Date', 'MSA', 'LSN', 'CLDS', 'AGE', 'CIR', 'ELTV', 'DDD', 'CS', 'FPD',
       'FIRST_F', 'MD', 'MIP', 'CLTV', 'DTI', 'LTV', 'OIR', 'P_TYPE', 'POSTAL',
       'OLT', 'HRCN_EVNTS', 'HRCN_AFREQ', 'HRCN_EXP_A', 'HRCN_EXPB',
       'HRCN_EXPP', 'HRCN_EXPPE', 'HRCN_EXPA', 'HRCN_EXPT', 'HRCN_HLRB',
       'HRCN_HLRP', 'HRCN_HLRA', 'HRCN_EALB', 'HRCN_EALP', 'HRCN_EALPE',
       'HRCN_EALA', 'HRCN_EALT', 'HRCN_EALS', 'HRCN_ALRB', 'HRCN_ALRP',
       'HRCN_ALRA', 'HRCN_ALR_N', 'HRCN_RISKV', 'HRCN_RISKS', 'HRCN_EALS_Norm',
       'HRCN_RISK_CATEGORY_QUANTILE'],
      dtype='object')

In [3]:
db_path = "../Database/thesis_database.db"
conn = sqlite3.connect(db_path, check_same_thread=False)
hrcn_risk_agg = pd.read_sql("SELECT * FROM hrcn_risk_agg", conn)

### Add state

In [4]:
#Merge hrcn_risk_agg on fm_combined on MSA only keeping STATE
fm_combined = fm_combined.merge(hrcn_risk_agg[['MSA','STATE']], on='MSA', how='inner')
fm_combined.head()


Unnamed: 0,Date,MSA,LSN,CLDS,AGE,CIR,ELTV,DDD,CS,FPD,...,HRCN_EALS,HRCN_ALRB,HRCN_ALRP,HRCN_ALRA,HRCN_ALR_N,HRCN_RISKV,HRCN_RISKS,HRCN_EALS_Norm,HRCN_RISK_CATEGORY_QUANTILE,STATE
0,2002-09-01 00:00:00,10420,F99Q10000029,0,0,6.375,,0,618,200210,...,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308,Relatively Moderate,Ohio
1,2002-10-01 00:00:00,10420,F99Q10000029,0,1,6.375,,0,618,200210,...,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308,Relatively Moderate,Ohio
2,2002-11-01 00:00:00,10420,F99Q10000029,0,2,6.375,,0,618,200210,...,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308,Relatively Moderate,Ohio
3,2002-12-01 00:00:00,10420,F99Q10000029,0,3,6.375,,0,618,200210,...,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308,Relatively Moderate,Ohio
4,2003-01-01 00:00:00,10420,F99Q10000029,0,4,6.375,,0,618,200210,...,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308,Relatively Moderate,Ohio


### Aggregate on MSA and Date

In [5]:
# Step 1: Drop unnecessary columns
fm_combined = fm_combined.drop(columns=['ELTV', 'FPD', 'MD'], errors='ignore')
# Step 2: Create CLDS90 and CLDS180 columns
fm_combined['CLDS90'] = (fm_combined['CLDS'] == 3).astype(int)
fm_combined['CLDS30'] = (fm_combined['CLDS'] == 1).astype(int)
fm_combined['CLDS120'] = (fm_combined['CLDS'] == 4).astype(int)
fm_combined['D90_month'] = 1
fm_combined['D30_month'] = 1
fm_combined['D120_month'] = 1


In [6]:
#fm_combined to parquet
fm_combined.to_parquet('../Data/fm_combined_robust.parquet', engine='pyarrow')

In [7]:
# Define your function as before
def create_indicators(group):
    d90_date = group[group['CLDS90'] == 1]['Date'].min()
    d30_date = group[group['CLDS30'] == 1]['Date'].min()
    d120_date = group[group['CLDS120'] == 1]['Date'].min()
    if pd.notnull(d90_date):
        group['D90_month'] = (group['Date'] <= d90_date).astype(int)
    if pd.notnull(d30_date):
        group['D30_month'] = (group['Date'] <= d30_date).astype(int)
    if pd.notnull(d120_date):
        group['D120_month'] = (group['Date'] <= d120_date).astype(int)
    return group

# Use Dask's groupby and apply methods
with ProgressBar():
    fm_combined = fm_combined.groupby('LSN').apply(create_indicators, meta=fm_combined).compute(scheduler='threads')
#22 minutes

[########################################] | 100% Completed | 28m 8ss


In [16]:
fm_combined['STATE'].unique()

NameError: name 'fm_combined' is not defined

In [8]:
#remove index fm_combined
fm_combined.reset_index(drop=True).to_parquet('../Data/fm_combined_ind_with30.parquet', engine='pyarrow')

In [9]:
del fm_combined

In [10]:
# fm_combined_ind = dd.read_parquet('../Data/fm_combined_ind.parquet')
fm_combined_ind = dd.read_parquet('../Data/fm_combined_ind_with30.parquet')


In [11]:
fm_combined_ind.compute().groupby(['MSA', 'Date'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x16420c4d0>

In [12]:
# Define a custom aggregation function
def custom_aggregation(group):
    agg_data = {
        'UNQ_LSN': group['LSN'].nunique(),
        'P_TYPE_MOST_FREQ': group['P_TYPE'].mode()[0],  # Using mode for most frequent in Pandas
        'D90': group[group['D90_month'] == 1]['CLDS90'].sum() / group[group['D90_month'] == 1]['LSN'].nunique(),
        'D30': group[group['D30_month'] == 1]['CLDS30'].sum() / group[group['D30_month'] == 1]['LSN'].nunique(),
        'D120': group[group['D120_month'] == 1]['CLDS120'].sum() / group[group['D120_month'] == 1]['LSN'].nunique(),
        'STATE': group['STATE'].mode()[0]
    }
    
    # Add mean for all other columns
    for col in group.columns:
        if col not in ['MSA', 'Date', 'LSN', 'P_TYPE', 'CLDS90', 'CLDS30', 'D90_month', 'D30_month', 'STATE']:
            try:
                agg_data[col] = group[col].mean()
            except:
                pass

    return pd.Series(agg_data)

# Wrap the groupby object with tqdm for progress bar
tqdm.pandas(desc="Aggregating Data")
aggregated = fm_combined_ind.compute().groupby(['MSA', 'Date']).progress_apply(custom_aggregation).reset_index()
aggregated['Date'] = pd.to_datetime(aggregated['Date'])
# Runtime: 

  'D30': group[group['D30_month'] == 1]['CLDS30'].sum() / group[group['D30_month'] == 1]['LSN'].nunique(),
Aggregating Data: 100%|██████████| 77428/77428 [03:45<00:00, 343.29it/s]


In [17]:
aggregated.head()
aggregated['STATE'].unique()

array(['Texas', 'Ohio', 'Georgia', 'New York', 'Louisiana', 'New Jersey',
       'Pennsylvania', 'Iowa', 'Michigan', 'Alabama', 'Wisconsin',
       'North Carolina', 'Maryland', 'Maine', 'Massachusetts',
       'West Virginia', 'Virginia', 'Illinois', 'Indiana', 'Kentucky',
       'Vermont', 'Florida', 'South Carolina', 'Tennessee', 'Missouri',
       'Delaware', 'California', 'Oklahoma', 'Arkansas', 'Mississippi',
       'Kansas', 'Minnesota', 'New Mexico', 'New Hampshire', 'Arizona'],
      dtype=object)

In [14]:
del fm_combined_ind

In [18]:
#save aggregated to parquet
aggregated.to_parquet('../Data/aggregated_with30.parquet', engine='pyarrow')

### Aggreagate on Loan

In [None]:
# read fm_combined.to_parquet('../Data/fm_combined_2.parquet', engine='pyarrow')
fm_combined = dd.read_parquet('../Data/fm_combined.parquet')

In [None]:
# Create D90 and D180 indicator variables
fm_combined['D90'] = (fm_combined['CLDS'] == 3).astype(int)
fm_combined['D180'] = (fm_combined['CLDS'] == 6).astype(int)
 
agg_loan = fm_combined
del fm_combined


In [None]:
# Adjusting to capture only the first occurrence of D90 and D180 for each LSN
aggregation = {
    'MSA': 'first',  # Assuming it's constant for a given LSN
    'FPD': 'first',
    'FIRST_F': 'first',
    'MD': 'first',
    'POSTAL': 'first',
    'P_TYPE': 'first',
    'D90': 'max',
    'D180': 'max',
    'HRCN_RISK_CATEGORY_QUANTILE': 'first',
    'Date' : 'last'
}
# Exclude specific columns from aggregation
exclude_cols = ['LSN', 'Date', 'MSA', 'FPD', 'FIRST_F', 'MD', 'POSTAL', 'P_TYPE', 'D90', 'D180', 'HRCN_RISK_CATEGORY_QUANTILE']
# Add the remaining columns to the aggregation dictionary with mean function
for col in agg_loan.columns:
    if col not in exclude_cols:
        aggregation[col] = 'mean'
# Perform aggregation
agg_loan = agg_loan.groupby('LSN').agg(aggregation).reset_index()

In [None]:
agg_loan = agg_loan.compute()

In [None]:
# Convert MD and FPD to date format with %Y%m
agg_loan['MD'] = pd.to_datetime(agg_loan['MD'].astype(str), format='%Y%m')
agg_loan['FPD'] = pd.to_datetime(agg_loan['FPD'].astype(str), format='%Y%m')
agg_loan['Date'] = pd.to_datetime(agg_loan['Date'])


In [None]:
#average number of D90 = 1 loans
display(agg_loan['D90'].mean())
#unique MSA
display(agg_loan['MSA'].nunique())

0.05431396688674988

270

In [None]:
#to parquet
agg_loan.to_parquet('../Data/agg_loan.parquet', engine='pyarrow')

# Modelling on MSA and Date

In [18]:
aggregated = dd.read_parquet('../Data/aggregated_with30.parquet')

In [19]:
aggregated.head()

Unnamed: 0,MSA,Date,UNQ_LSN,P_TYPE_MOST_FREQ,D90,D30,D120,STATE,CLDS,AGE,...,HRCN_EALS,HRCN_ALRB,HRCN_ALRP,HRCN_ALRA,HRCN_ALR_N,HRCN_RISKV,HRCN_RISKS,HRCN_EALS_Norm,CLDS120,D120_month
0,10180,1999-03-01,1,SF,0.0,0.0,0.0,Texas,0.0,1.0,...,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395,0.0,1.0
1,10180,1999-04-01,1,SF,0.0,0.0,0.0,Texas,0.0,2.0,...,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395,0.0,1.0
2,10180,1999-05-01,1,SF,0.0,0.0,0.0,Texas,0.0,3.0,...,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395,0.0,1.0
3,10180,1999-06-01,2,PU,0.0,0.0,0.0,Texas,0.0,2.5,...,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395,0.0,1.0
4,10180,1999-07-01,3,SF,0.0,0.0,0.0,Texas,0.0,2.333333,...,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395,0.0,1.0


In [20]:
db_path = "../Database/thesis_database.db"
conn = sqlite3.connect(db_path, check_same_thread=False)

In [25]:
aggregated_df = aggregated.compute()
aggregated_df['Date'] = pd.to_datetime(aggregated_df['Date'])

### HPI

In [26]:
#loag hpi master
hpi_master = pd.read_excel('../Data/HPI_master.xls', dtype={'place_id': str, 'yr': int, 'period': int, 'index_nsa': float, 'quarter': str})
#keep place_id, yr, period and index_nsa
hpi_master = hpi_master[['place_id', 'yr', 'period', 'index_nsa', 'quarter']]
#add period column to aggregated_df for each quarter
aggregated_df['quarter'] = aggregated_df['Date'].dt.year.astype(str) + "Q" + aggregated_df['Date'].dt.quarter.astype(str)
#merge aggregated_df and hpi_master on place_id and period only keeping index_nsa
aggregated_df = aggregated_df.merge(hpi_master, left_on=['MSA','quarter'], right_on=['place_id', 'quarter'], how='left')

In [27]:
aggregated_df

Unnamed: 0,MSA,Date,UNQ_LSN,P_TYPE_MOST_FREQ,D90,D30,D120,STATE,CLDS,AGE,...,HRCN_RISKV,HRCN_RISKS,HRCN_EALS_Norm,CLDS120,D120_month,quarter,place_id,yr,period,index_nsa
0,10180,1999-03-01,1,SF,0.000000,0.000000,0.000000,Texas,0.000000,1.000000,...,82619.358078,31.493943,-0.872395,0.000000,1.000000,1999Q1,10180,1999.0,1.0,112.00
1,10180,1999-04-01,1,SF,0.000000,0.000000,0.000000,Texas,0.000000,2.000000,...,82619.358078,31.493943,-0.872395,0.000000,1.000000,1999Q2,10180,1999.0,2.0,113.30
2,10180,1999-05-01,1,SF,0.000000,0.000000,0.000000,Texas,0.000000,3.000000,...,82619.358078,31.493943,-0.872395,0.000000,1.000000,1999Q2,10180,1999.0,2.0,113.30
3,10180,1999-06-01,2,PU,0.000000,0.000000,0.000000,Texas,0.000000,2.500000,...,82619.358078,31.493943,-0.872395,0.000000,1.000000,1999Q2,10180,1999.0,2.0,113.30
4,10180,1999-07-01,3,SF,0.000000,0.000000,0.000000,Texas,0.000000,2.333333,...,82619.358078,31.493943,-0.872395,0.000000,1.000000,1999Q3,10180,1999.0,3.0,114.39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100322,49740,2022-11-01,105,SF,0.000000,0.000000,0.000000,Arizona,0.019048,46.171429,...,19715.930121,17.317183,,0.000000,0.942857,2022Q4,49740,2022.0,4.0,313.18
100323,49740,2022-12-01,105,SF,0.000000,0.011494,0.000000,Arizona,0.038095,47.171429,...,19715.930121,17.317183,,0.000000,0.942857,2022Q4,49740,2022.0,4.0,313.18
100324,49740,2023-01-01,108,SF,0.009804,0.000000,0.000000,Arizona,0.037037,46.833333,...,19715.930121,17.317183,,0.000000,0.944444,2023Q1,49740,2023.0,1.0,298.41
100325,49740,2023-02-01,107,SF,0.000000,0.000000,0.009901,Arizona,0.046729,47.850467,...,19715.930121,17.317183,,0.009346,0.943925,2023Q1,49740,2023.0,1.0,298.41


### GDP

In [28]:
#Read GDP.xlsx
gdp = pd.read_excel('../Data/GDP.xlsx', dtype={'quarter': str, 'GDP': float, 'GDP_change': float})

In [29]:
gdp.head()

Unnamed: 0,observation_date,GDP,GDP_change,year,period,quarter
0,1947-01-01,243.164,0.0,1947,1,1947Q1
1,1947-04-01,245.968,0.011531,1947,2,1947Q2
2,1947-07-01,249.585,0.014705,1947,3,1947Q3
3,1947-10-01,259.745,0.040708,1947,4,1947Q4
4,1948-01-01,265.742,0.023088,1948,1,1948Q1


In [30]:
#merge gdp with aggregated_df on quarter
aggregated_df = aggregated_df.merge(gdp[['GDP', 'GDP_change','quarter']], on='quarter', how='left')

In [31]:
aggregated_df.head()

Unnamed: 0,MSA,Date,UNQ_LSN,P_TYPE_MOST_FREQ,D90,D30,D120,STATE,CLDS,AGE,...,HRCN_EALS_Norm,CLDS120,D120_month,quarter,place_id,yr,period,index_nsa,GDP,GDP_change
0,10180,1999-03-01,1,SF,0.0,0.0,0.0,Texas,0.0,1.0,...,-0.872395,0.0,1.0,1999Q1,10180,1999.0,1.0,112.0,9411.682,0.012663
1,10180,1999-04-01,1,SF,0.0,0.0,0.0,Texas,0.0,2.0,...,-0.872395,0.0,1.0,1999Q2,10180,1999.0,2.0,113.3,9526.21,0.012169
2,10180,1999-05-01,1,SF,0.0,0.0,0.0,Texas,0.0,3.0,...,-0.872395,0.0,1.0,1999Q2,10180,1999.0,2.0,113.3,9526.21,0.012169
3,10180,1999-06-01,2,PU,0.0,0.0,0.0,Texas,0.0,2.5,...,-0.872395,0.0,1.0,1999Q2,10180,1999.0,2.0,113.3,9526.21,0.012169
4,10180,1999-07-01,3,SF,0.0,0.0,0.0,Texas,0.0,2.333333,...,-0.872395,0.0,1.0,1999Q3,10180,1999.0,3.0,114.39,9686.626,0.016839


### CPI Urban

In [32]:
cpi = pd.read_excel('../Data/CPI_Urban.xlsx', dtype={'Date': str, 'CPI_SA': float, 'CPI_SA_change': float})

In [33]:
cpi['Date'] = pd.to_datetime(cpi['Date'])  

In [34]:
#merge cpi with aggregated_df on Date
aggregated_df = aggregated_df.merge(cpi[['CPI_SA', 'CPI_SA_change','Date']], on='Date', how='left')

In [35]:
aggregated_df.head()

Unnamed: 0,MSA,Date,UNQ_LSN,P_TYPE_MOST_FREQ,D90,D30,D120,STATE,CLDS,AGE,...,D120_month,quarter,place_id,yr,period,index_nsa,GDP,GDP_change,CPI_SA,CPI_SA_change
0,10180,1999-03-01,1,SF,0.0,0.0,0.0,Texas,0.0,1.0,...,1.0,1999Q1,10180,1999.0,1.0,112.0,9411.682,0.012663,164.8,0.000607
1,10180,1999-04-01,1,SF,0.0,0.0,0.0,Texas,0.0,2.0,...,1.0,1999Q2,10180,1999.0,2.0,113.3,9526.21,0.012169,165.9,0.006675
2,10180,1999-05-01,1,SF,0.0,0.0,0.0,Texas,0.0,3.0,...,1.0,1999Q2,10180,1999.0,2.0,113.3,9526.21,0.012169,166.0,0.000603
3,10180,1999-06-01,2,PU,0.0,0.0,0.0,Texas,0.0,2.5,...,1.0,1999Q2,10180,1999.0,2.0,113.3,9526.21,0.012169,166.0,0.0
4,10180,1999-07-01,3,SF,0.0,0.0,0.0,Texas,0.0,2.333333,...,1.0,1999Q3,10180,1999.0,3.0,114.39,9686.626,0.016839,166.7,0.004217


### Load and merge MEI

In [36]:
#Get enso_mei
query = """
SELECT
    *
FROM enso_mei;
"""
enso_mei = pd.read_sql_query(query, conn)
enso_mei['Date'] = pd.to_datetime(enso_mei['Date'])
#merge enso_mei with aggregated_df
aggregated_df = aggregated_df.merge(enso_mei, on='Date', how='left')

### Load and merge Unemployment

In [37]:
#load UNRATE.csv by ; and merge with aggregated_df on Date
unrate = pd.read_csv('../Data/UNRATE.csv', sep=';')
#Convert Date to datetime format 01/01/1948
unrate['Date'] = pd.to_datetime(unrate['DATE'], format= '%d/%m/%Y')
#Only merge UNRATE column with aggregated_df
unrate = unrate[['Date', 'UNRATE']]
aggregated_df = aggregated_df.merge(unrate, how='left', on='Date')

In [38]:
aggregated_df.CS

0         750.000000
1         750.000000
2         750.000000
3         754.500000
4         752.333333
             ...    
100322    738.685714
100323    738.685714
100324    738.629630
100325    738.224299
100326    738.224299
Name: CS, Length: 100327, dtype: float64

## Regression Models

In [39]:
from linearmodels.panel import PanelOLS, RandomEffects
# Convert the dataset into a panel structure
fm_agg_model = aggregated_df.set_index(['MSA', 'Date'])

# Define dependent variable and independent variables
dependent_var90 = fm_agg_model['D90'] * 100
dependent_var30 = fm_agg_model['D30'] * 100
dependent_var120 = fm_agg_model['D120'] * 100
fm_agg_model['DHRI'] = fm_agg_model['MEI'] * ((fm_agg_model['HRCN_HLRB']- fm_agg_model['HRCN_HLRB'].mean()) / fm_agg_model['HRCN_HLRB'].std())
fm_agg_model['DHRI2'] = fm_agg_model['DHRI'] * fm_agg_model['DHRI']
fm_agg_model['MEI2'] = fm_agg_model['MEI']**2

### VIF

In [51]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
#remove NA rows fm_agg_model
fm_agg_model_na = fm_agg_model.dropna()
exog_vars = ['DHRI','DHRI2','LTV', 'UNRATE', 'OIR','index_nsa', 'CPI_SA_change', 'GDP_change', 'DTI', 'AGE', 'HRCN_SSN']
X = add_constant(fm_agg_model_na[exog_vars])
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
for i, v in enumerate(vif):
    print(f"VIF for {X.columns[i]}: {v}")

VIF for const: 646.8859222490137
VIF for DHRI: 1.2008153333560123
VIF for DHRI2: 1.2132106198559585
VIF for LTV: 1.209353591938664
VIF for UNRATE: 1.3435604744725287
VIF for OIR: 5.886048862572715
VIF for index_nsa: 1.867410920539089
VIF for CPI_SA_change: 1.1327006249902152
VIF for GDP_change: 1.1824445660980865
VIF for DTI: 1.7647015143802602
VIF for AGE: 5.041582369503043
VIF for HRCN_SSN: 1.009475100294676


: 

In [40]:
fm_agg_model.head()
#create HRCN_SSN indicator for MOnth_nums 6,7,8,9,10

#format Month_num to datetime with 01, 02, 03, ..., 12 format
fm_agg_model['Month_num'] = pd.to_datetime(fm_agg_model['Month_num'], format = '%m').dt.month
fm_agg_model['HRCN_SSN'] = ((fm_agg_model['Month_num'] >= 6) & (fm_agg_model['Month_num'] <= 10)).astype(int)
fm_agg_model['Nina'] = (fm_agg_model['MEI'] < 0).astype(int)
fm_agg_model['DHRNina'] = ((fm_agg_model['HRCN_HLRB']- fm_agg_model['HRCN_HLRB'].mean()) / fm_agg_model['HRCN_HLRB'].std()) * fm_agg_model['Nina']

In [41]:
fm_agg_model['HRCN_SSN']

MSA    Date      
10180  1999-03-01    0
       1999-04-01    0
       1999-05-01    0
       1999-06-01    1
       1999-07-01    1
                    ..
49740  2022-11-01    0
       2022-12-01    0
       2023-01-01    0
       2023-02-01    0
       2023-03-01    0
Name: HRCN_SSN, Length: 100327, dtype: int64

In [42]:
fm_agg_model['HRCN_HLRB'].describe()


count    100327.000000
mean          0.002487
std           0.003941
min           0.000005
25%           0.000252
50%           0.000924
75%           0.002130
max           0.018967
Name: HRCN_HLRB, dtype: float64

### Fixed Effects vs Random Effects

DHRI with MEI, LTV, OIR and DTI with all the other control variables

In [50]:
exog_vars = ['DHRI','LTV', 'OIR', 'DTI', 'index_nsa', 'UNRATE','CPI_SA_change', 'GDP_change', 'HRCN_SSN']
# exog_vars = ['HRCN_AFREQ', 'HRCN_EALS', 'LTV', 'OIR', 'DTI', 'index_nsa', 'UNRATE','CPI_SA_change', 'GDP_change', 'HRCN_SSN']

exog = fm_agg_model[exog_vars]
mod_fe = PanelOLS(dependent_var90,exog, entity_effects=True, drop_absorbed=True)
fe_res = mod_fe.fit(cov_type = 'robust')

mod_re = RandomEffects(dependent_var90, exog)
re_res = mod_re.fit(cov_type='robust')
fe_res, re_res

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


(                          PanelOLS Estimation Summary                           
 Dep. Variable:                    D90   R-squared:                        0.0166
 Estimator:                   PanelOLS   R-squared (Between):              0.1568
 No. Observations:               99729   R-squared (Within):               0.0166
 Date:                Mon, Dec 04 2023   R-squared (Overall):              0.0253
 Time:                        21:36:34   Log-likelihood                -7.358e+04
 Cov. Estimator:                Robust                                           
                                         F-statistic:                      186.83
 Entities:                         268   P-value                           0.0000
 Avg Obs:                       372.12   Distribution:                 F(9,99452)
 Min Obs:                       222.00                                           
 Max Obs:                       1160.0   F-statistic (robust):             305.98
                

-0.208    0.045 -4.662 0.000 -0.295 -0.120
DHRI           -0.013    0.002 -7.351 0.000 -0.017 -0.010
LTV             0.001    0.000  2.502 0.012  0.000  0.002
OIR             0.003    0.002  1.168 0.243 -0.002  0.008
DTI            -0.000    0.000 -5.550 0.000 -0.000 -0.000
index_nsa       0.000    0.000  2.973 0.003  0.000  0.000
UNRATE          0.033    0.001 36.668 0.000  0.031  0.035
CPI_SA_change   0.774    0.564  1.373 0.170 -0.331  1.878
GDP_change     -0.212    0.121 -1.745 0.081 -0.449  0.026
HRCN_SSN        0.013    0.003  4.111 0.000  0.007  0.020
Group Var       0.001    0.000           

### Hausman

In [42]:
import numpy as np
import scipy.stats

def hausman_test(fe_res, re_res):
    """
    Conduct Hausman test to compare fixed effects and random effects models.

    Args:
    - fe_res: Fitted results from fixed effects model.
    - re_res: Fitted results from random effects model.

    Returns:
    - test_stat: Hausman test statistic.
    - p_value: P-value of the test statistic.
    """
    # Calculate difference in coefficients
    b = fe_res.params
    B = re_res.params
    diff = b - B

    # Calculate variance difference
    v_b = fe_res.cov
    v_B = re_res.cov
    diff_se = v_b - v_B  # Note the change here

    # Hausman test statistic
    test_stat = diff @ np.linalg.inv(diff_se) @ diff  # Corrected calculation

    # Degrees of freedom
    df = len(fe_res.params)

    # P-value
    p_value = 1 - scipy.stats.chi2.cdf(test_stat, df)

    return test_stat, p_value

# Run Hausman test
hausman_stat, hausman_p_value = hausman_test(fe_res, re_res)
print(f"Hausman Test Statistic: {hausman_stat}")
print(f"P-value: {hausman_p_value}")


Hausman Test Statistic: 27.140616808927046
P-value: 0.0013257733884837197


### Including HRCN frequency variable has to be REM since 

In [145]:
# Run a fixed effects regression
exog_vars = ['HRCN_AFREQ', 'HRCN_EALS','LTV', 'DTI', 'UNRATE', 'OIR', 'index_nsa', 'CPI_SA_change', 'GDP_change']
exog = fm_agg_model[exog_vars]
# mod_fe = PanelOLS(dependent_var, exog, entity_effects=True)
# fe_res = mod_fe.fit()

mod_re = RandomEffects(dependent_var, exog)
re_res = mod_re.fit()
re_res

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


0,1,2,3
Dep. Variable:,D90,R-squared:,0.0318
Estimator:,RandomEffects,R-squared (Between):,0.8067
No. Observations:,99729,R-squared (Within):,0.0160
Date:,"Tue, Nov 21 2023",R-squared (Overall):,0.0513
Time:,14:26:43,Log-likelihood,-7.38e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,363.80
Entities:,268,P-value,0.0000
Avg Obs:,372.12,Distribution:,"F(9,99720)"
Min Obs:,222.00,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
HRCN_AFREQ,0.1919,0.0391,4.9022,0.0000,0.1151,0.2686
HRCN_EALS,-0.0001,0.0001,-0.7957,0.4262,-0.0004,0.0002
LTV,-0.0006,0.0003,-2.4953,0.0126,-0.0011,-0.0001
DTI,-0.0002,3.272e-05,-7.5928,0.0000,-0.0003,-0.0002
UNRATE,0.0313,0.0008,37.766,0.0000,0.0297,0.0329
OIR,-0.0032,0.0021,-1.5468,0.1219,-0.0073,0.0009
index_nsa,-1.128e-05,3.64e-05,-0.3099,0.7566,-8.262e-05,6.006e-05
CPI_SA_change,0.9192,0.5633,1.6317,0.1028,-0.1850,2.0234
GDP_change,-0.2197,0.1204,-1.8250,0.0680,-0.4556,0.0162


### MEI Squared

In [43]:
# Run a random effects regression
exog_vars = ['MEI', 'MEI2','LTV', 'UNRATE', 'OIR', 'index_nsa', 'CPI_SA_change', 'GDP_change', 'DTI']
exog = fm_agg_model[exog_vars]
mod_fe = PanelOLS(dependent_var, exog, entity_effects=True)
fe_res = mod_fe.fit()
mod_re = RandomEffects(dependent_var, exog)
re_res = mod_re.fit()
fe_res, re_res

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


(                          PanelOLS Estimation Summary                           
 Dep. Variable:                    D30   R-squared:                        0.0197
 Estimator:                   PanelOLS   R-squared (Between):             -8.7478
 No. Observations:               99715   R-squared (Within):               0.0197
 Date:                Sun, Dec 03 2023   R-squared (Overall):             -0.8224
 Time:                        11:57:24   Log-likelihood                -1.512e+05
 Cov. Estimator:            Unadjusted                                           
                                         F-statistic:                      221.87
 Entities:                         268   P-value                           0.0000
 Avg Obs:                       372.07   Distribution:                 F(9,99438)
 Min Obs:                       222.00                                           
 Max Obs:                       1160.0   F-statistic (robust):             221.87
                

### Including Squared DHRI term

In [46]:
# Run a random effects regression
fm_agg_model['DHRI2'] = fm_agg_model['DHRI'] * fm_agg_model['DHRI']
exog_vars = ['DHRI','DHRI2','LTV', 'UNRATE', 'OIR', 'index_nsa', 'CPI_SA_change', 'GDP_change', 'DTI']
exog = fm_agg_model[exog_vars]

mod_fe = PanelOLS(dependent_var, exog, entity_effects=True)
fe_res = mod_fe.fit()

mod_re = RandomEffects(dependent_var, exog)
re_res = mod_re.fit()

fe_res, re_res

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


(                          PanelOLS Estimation Summary                           
 Dep. Variable:                    D30   R-squared:                        0.0196
 Estimator:                   PanelOLS   R-squared (Between):             -9.2136
 No. Observations:               99715   R-squared (Within):               0.0196
 Date:                Sun, Dec 03 2023   R-squared (Overall):             -0.8670
 Time:                        11:57:55   Log-likelihood                -1.512e+05
 Cov. Estimator:            Unadjusted                                           
                                         F-statistic:                      220.77
 Entities:                         268   P-value                           0.0000
 Avg Obs:                       372.07   Distribution:                 F(9,99438)
 Min Obs:                       222.00                                           
 Max Obs:                       1160.0   F-statistic (robust):             220.77
                

HRCN

In [47]:
# Run a random effects regression
exog_vars = ['HRCN_EALS', 'HRCN_AFREQ', 'LTV', 'UNRATE', 'OIR', 'index_nsa', 'DTI', 'GDP_change', 'CPI_SA_change']
exog = fm_agg_model[exog_vars]
mod_re = RandomEffects(dependent_var, exog)
re_res = mod_re.fit()
re_res

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


0,1,2,3
Dep. Variable:,D30,R-squared:,0.0515
Estimator:,RandomEffects,R-squared (Between):,0.9131
No. Observations:,99715,R-squared (Within):,0.0182
Date:,"Sun, Dec 03 2023",R-squared (Overall):,0.1058
Time:,11:57:58,Log-likelihood,-1.514e+05
Cov. Estimator:,Unadjusted,,
,,F-statistic:,601.17
Entities:,268,P-value,0.0000
Avg Obs:,372.07,Distribution:,"F(9,99706)"
Min Obs:,222.00,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
HRCN_EALS,-5.769e-07,0.0003,-0.0017,0.9987,-0.0007,0.0007
HRCN_AFREQ,0.2853,0.0946,3.0162,0.0026,0.0999,0.4706
LTV,-0.0013,0.0006,-2.3882,0.0169,-0.0025,-0.0002
UNRATE,0.0162,0.0018,8.9805,0.0000,0.0127,0.0198
OIR,0.0769,0.0046,16.617,0.0000,0.0678,0.0860
index_nsa,0.0002,8.154e-05,2.4550,0.0141,4.036e-05,0.0004
DTI,-0.0008,7.269e-05,-11.521,0.0000,-0.0010,-0.0007
GDP_change,-4.1448,0.2622,-15.808,0.0000,-4.6587,-3.6309
CPI_SA_change,-1.1124,1.2275,-0.9062,0.3648,-3.5184,1.2936


### Mixed Model

In [48]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
aggregated_df_mm = aggregated_df.copy()
#drop rows with index_nsa nan
aggregated_df_mm = aggregated_df_mm.dropna(subset=['index_nsa'])

aggregated_df_mm['Month_num'] = pd.to_datetime(aggregated_df_mm['Month_num'], format = '%m').dt.month
aggregated_df_mm['HRCN_SSN'] = ((aggregated_df_mm['Month_num'] >= 6) & (aggregated_df_mm['Month_num'] <= 10)).astype(int)


exog_vars = ['DHRI','LTV', 'OIR', 'DTI', 'index_nsa', 'UNRATE','CPI_SA_change', 'GDP_change', 'HRCN_SSN']
# Prepare the dataset
aggregated_df_mm['D30_scaled'] = aggregated_df_mm['D30'] * 100
aggregated_df_mm['DHRI'] = aggregated_df_mm['MEI'] * ((aggregated_df_mm['HRCN_HLRB'] - aggregated_df_mm['HRCN_HLRB'].mean()) / aggregated_df_mm['HRCN_HLRB'].std())
# Create a formula for the mixed model with fixed effects for Date
formula = "D30_scaled ~ DHRI + LTV + OIR + DTI + index_nsa + UNRATE + CPI_SA_change + GDP_change + HRCN_SSN"
# Fit the mixed model with random intercepts for each 3ZIP
mixed_model = smf.mixedlm(formula, aggregated_df_mm, groups=aggregated_df_mm['MSA'])
mixed_result = mixed_model.fit()
print(mixed_result.summary())

IndexError: index 99715 is out of bounds for axis 0 with size 99715

In [251]:
hausman_stat, hausman_p_value = hausman_test(fe_res)
print(f"Hausman Test Statistic: {hausman_stat}")
print(f"P-value: {hausman_p_value}")

AttributeError: 'MixedLMResults' object has no attribute 'cov'

In [243]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
aggregated_df_mm = aggregated_df.copy()
#drop rows with index_nsa nan
aggregated_df_mm = aggregated_df_mm.dropna(subset=['index_nsa'])

aggregated_df_mm['Month_num'] = pd.to_datetime(aggregated_df_mm['Month_num'], format = '%m').dt.month
aggregated_df_mm['HRCN_SSN'] = ((aggregated_df_mm['Month_num'] >= 6) & (aggregated_df_mm['Month_num'] <= 10)).astype(int)


exog_vars = ['DHRI', 'DHRI2','LTV', 'OIR', 'DTI', 'index_nsa', 'UNRATE','CPI_SA_change', 'GDP_change', 'HRCN_SSN']
# Prepare the dataset
aggregated_df_mm['D90_scaled'] = aggregated_df_mm['D90'] * 100
aggregated_df_mm['DHRI'] = aggregated_df_mm['MEI'] * ((aggregated_df_mm['HRCN_HLRB'] - aggregated_df_mm['HRCN_HLRB'].mean()) / aggregated_df_mm['HRCN_HLRB'].std())
# Create a formula for the mixed model with fixed effects for Date
formula = "D90_scaled ~ HRCN_AFREQ + HRCN_EALS + LTV + OIR + DTI + index_nsa + UNRATE + CPI_SA_change + GDP_change + HRCN_SSN"
# Fit the mixed model with random intercepts for each 3ZIP
mixed_model = smf.mixedlm(formula, aggregated_df_mm, groups=aggregated_df_mm['MSA'])
mixed_result = mixed_model.fit()
display(mixed_result.summary())



0,1,2,3
Model:,MixedLM,Dependent Variable:,D90_scaled
No. Observations:,99729,Method:,REML
No. Groups:,268,Scale:,0.2569
Min. group size:,222,Log-Likelihood:,-73928.3789
Max. group size:,1160,Converged:,Yes
Mean group size:,372.1,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-0.217,0.044,-4.877,0.000,-0.304,-0.130
HRCN_AFREQ,0.181,0.044,4.109,0.000,0.095,0.267
HRCN_EALS,-0.000,0.000,-0.285,0.775,-0.000,0.000
LTV,0.001,0.000,2.899,0.004,0.000,0.002
OIR,0.001,0.002,0.443,0.658,-0.004,0.006
DTI,-0.000,0.000,-6.221,0.000,-0.000,-0.000
index_nsa,0.000,0.000,2.270,0.023,0.000,0.000
UNRATE,0.033,0.001,36.647,0.000,0.031,0.035
CPI_SA_change,0.789,0.564,1.400,0.162,-0.316,1.894


In [None]:
# Run a random effects regression
exog_vars = ['HRCN_EALS', 'HRCN_AFREQ', 'LTV', 'UNRATE', 'OIR', 'index_nsa']
exog = fm_agg_model[exog_vars]
mod_re = RandomEffects(dependent_var, exog)
re_res = mod_re.fit()
re_res

# Modelling on Loan Level

In [53]:
agg_loan = dd.read_parquet('../Data/agg_loan.parquet')

In [54]:
agg_loan.head()

Unnamed: 0,LSN,MSA,FPD,FIRST_F,MD,POSTAL,P_TYPE,D90,D180,HRCN_RISK_CATEGORY_QUANTILE,...,HRCN_EALA,HRCN_EALT,HRCN_EALS,HRCN_ALRB,HRCN_ALRP,HRCN_ALRA,HRCN_ALR_N,HRCN_RISKV,HRCN_RISKS,HRCN_EALS_Norm
0,F99Q10000029,10420,2002-10-01,0,2029-02-01,44200,SF,0,0,Relatively Moderate,...,3899.622027,359481.079592,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308
1,F99Q10002396,10420,1999-03-01,0,2029-02-01,44200,SF,0,0,Relatively Moderate,...,3899.622027,359481.079592,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308
2,F99Q10043324,10420,1999-03-01,0,2029-02-01,44200,SF,0,0,Relatively Moderate,...,3899.622027,359481.079592,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308
3,F99Q10099411,10420,1999-05-01,0,2029-04-01,44200,CO,0,0,Relatively Moderate,...,3899.622027,359481.079592,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308
4,F99Q10102136,10420,1999-04-01,0,2029-03-01,44200,SF,0,0,Relatively Moderate,...,3899.622027,359481.079592,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308


In [55]:
db_path = "../Database/thesis_database.db"
conn = sqlite3.connect(db_path, check_same_thread=False)

### HPI

In [56]:
# Create the FPD_quarter and MD_quarter variables
agg_loan['FPD_quarter'] = agg_loan['FPD'].dt.year.astype(str) + 'Q' + agg_loan['FPD'].dt.quarter.astype(str)
agg_loan['Last_quarter'] = agg_loan['Date'].dt.year.astype(str) + 'Q' + agg_loan['MD'].dt.quarter.astype(str)


In [57]:
hpi_master = pd.read_excel('../Data/HPI_master.xlsx', sheet_name= "HPI_master",dtype={'place_id': str, 'yr': int, 'period': int, 'index_nsa': float, 'quarter': str})

In [58]:
agg_loan = agg_loan.merge(hpi_master[['place_id', 'quarter', 'index_nsa']], left_on = ['MSA', 'FPD_quarter'], right_on = ['place_id', 'quarter'] ,how='left').rename(columns={'index_nsa': 'HPI_FPD'})
agg_loan = agg_loan.merge(hpi_master[['place_id', 'quarter', 'index_nsa']], left_on = ['MSA', 'Last_quarter'], right_on = ['place_id', 'quarter'] ,how='left').rename(columns={'index_nsa': 'HPI_Last'})
agg_loan['HPI'] = agg_loan['HPI_Last'] - agg_loan['HPI_FPD']

In [59]:
agg_loan[['FPD_quarter', 'HPI_FPD', 'Last_quarter', 'HPI_Last', 'HPI']].head()

Unnamed: 0,FPD_quarter,HPI_FPD,Last_quarter,HPI_Last,HPI
0,2002Q4,139.84,2004Q1,144.52,4.68
1,1999Q1,122.26,2010Q1,135.21,12.95
2,1999Q1,122.26,2002Q1,136.77,14.51
3,1999Q2,123.53,2002Q2,136.99,13.46
4,1999Q2,123.53,2003Q1,140.76,17.23


### MEI

In [60]:
query = """
SELECT * FROM enso_mei;
"""
enso_mei = pd.read_sql_query(query, conn)


In [61]:
enso_mei['Date'] = pd.to_datetime(enso_mei['Date'])

In [62]:
agg_loan_df = agg_loan.compute()

In [63]:
#Create a function that gets the average of the MEI between two dates
tqdm.pandas()
def get_avg_mei(start_date, end_date):
    avg_mei = enso_mei[(enso_mei['Date'] >= start_date) & (enso_mei['Date'] <= end_date)]['MEI'].mean()
    return avg_mei
#apply to agg_loan in new column called MEI 
agg_loan_df['MEI'] = agg_loan_df.progress_apply(lambda x: get_avg_mei(x['FPD'], x['MD']), axis=1)

100%|██████████| 456494/456494 [01:21<00:00, 5584.88it/s]


In [64]:
agg_loan_df['DHRI'] = agg_loan_df['MEI'] * (agg_loan_df['HRCN_HLRA'] - agg_loan_df['HRCN_HLRA'].mean()) / agg_loan_df['HRCN_HLRA'].std()

In [65]:
#save agg_loan_df to parquet
agg_loan_df.to_parquet('../Data/agg_loanlvl_df.parquet', engine='pyarrow')

### UNRATE

In [32]:
type(agg_loan_df)

pandas.core.frame.DataFrame

In [19]:
agg_loan_df.dtypes
#Set MSA as category, FIRST_F as category, P_TYPE as category, Date as category
agg_loan_df['MSA'] = agg_loan_df['MSA'].astype('category')
agg_loan_df['FIRST_F'] = agg_loan_df['FIRST_F'].astype('category')
agg_loan_df['P_TYPE'] = agg_loan_df['P_TYPE'].astype('category')
agg_loan_df['Date'] = agg_loan_df['Date'].astype('category')
#remove string columns
agg_loan_df.dtypes


LSN                            string[pyarrow]
MSA                                   category
FPD                             datetime64[ns]
FIRST_F                               category
MD                              datetime64[ns]
POSTAL                         string[pyarrow]
P_TYPE                                category
D90                                      int64
D180                                     int64
HRCN_RISK_CATEGORY_QUANTILE    string[pyarrow]
Date                                  category
CLDS                                   float64
AGE                                    float64
CIR                                    float64
ELTV                                   float64
DDD                                    float64
CS                                     float64
MIP                                    float64
CLTV                                   float64
DTI                                    float64
LTV                                    float64
OIR          

In [20]:
#remove datatime columns
agg_loan_df = agg_loan_df.drop(columns=['FPD', 'MD'], errors='ignore')

In [21]:
string_cols = agg_loan_df.select_dtypes(include=['string']).columns
#remove string columns
agg_loan_xgb = agg_loan_df.drop(columns=string_cols)

# Add geometry

In [47]:
#load geometry_key.to_pickle('../../Data/geometry_key.pkl')
geometry_key = pd.read_pickle('../Data/geometry_key.pkl')

In [48]:
#merge with aggregated_df on MSA
aggregated_df_geo = aggregated_df_mm.merge(geometry_key, on='MSA', how='left')

# Gekloot

In [52]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Assume 'aggregated' is your DataFrame and 'geometry_data' is your GeoDataFrame with MSA geometries


specific_month = '2022-05'  # Replace with the month you are interested in
# Step 1: Filter the dataset for the specific month
filtered_data = aggregated_df_geo[aggregated_df_geo['Date'] == specific_month]
#select only D90 is not 0
filtered_data = filtered_data[(filtered_data['D90'] != 0)]


In [109]:
#Extract MEI timeseries on unique date
mei_ts = aggregated_df[['Date', 'MEI']].drop_duplicates()

In [129]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.colors import ListedColormap
import matplotlib.gridspec as gridspec

aggregated_df_geo.loc[:,'DHRI'] = aggregated_df_geo['MEI'] * ((aggregated_df_geo['HRCN_EALS']- aggregated_df_geo['HRCN_EALS'].mean()) / aggregated_df_geo['HRCN_EALS'].std())
 
# ... your data loading code here ...
bin_edges = np.linspace(aggregated_df_geo['DHRI'].min(), aggregated_df_geo['DHRI'].max(), 7)
# Function to update the plot for each month
anim_df = aggregated_df_geo[['MSA', 'Date', 'DHRI', 'geometry']].copy()
#merge geometry_key with aggregated_df_geo['MSA', 'D90'] on MSA if MSA not foung in aggreagted_df then set D90 to 0
def update(month):
    ax1.clear()
    ax2.clear()

    #Update MSA plot
    filtered_data = anim_df[anim_df['Date'] == month].copy()
    # Use cut instead of qcut
    filtered_data.loc[:, 'DHRI_cat'] = pd.cut(
        filtered_data['DHRI'], 
        bins=bin_edges, 
        labels=['Very Low', "Relatively Low", 'Relatively Moderate', 'Moderate', 'Relatively High', 'Very High'],
        include_lowest=True
    )
    merged_geo_data = gpd.GeoDataFrame(filtered_data, geometry='geometry')  # Ensure the result is a GeoDataFrame
    
    ax1.set_title(f"DHRI (USA) {month}", fontdict={'fontsize': '25', 'fontweight' : '3'})
    merged_geo_data.plot(column='DHRI_cat', ax=ax1, cmap=custom_cmap, legend=True)
    ax1.set_xticks([])
    ax1.set_yticks([])
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['bottom'].set_visible(False)
    ax1.spines['left'].set_visible(False)

    # Updating MEI timeseries plot (ax2)
    mei_data = mei_ts[mei_ts['Date'] <= month]
    ax2.plot(mei_data['Date'], mei_data['MEI'], '-o')
    ax2.set_title("MEI Timeseries", fontdict={'fontsize': '15', 'fontweight' : '3'})
    ax2.set_xlim([dates.min(), dates.max()])
    ax2.set_xlabel("Date")
    ax2.set_ylabel("MEI")
    ax2.grid(True)

# List of months to animate through
months = pd.date_range(start='2005-01', end='2010-05', freq='M').strftime('%Y-%m')
dates = pd.to_datetime(months)

# Set up the plot
fig = plt.figure(figsize=(10, 15))
gs = gridspec.GridSpec(2, 1, height_ratios=[3, 1])
ax1 = plt.subplot(gs[0])
ax2 = plt.subplot(gs[1])
colors = ['#08306b', '#4292c6', '#fdae6b', '#f16913', '#67000d']
custom_cmap = ListedColormap(colors)


# Create the animation
ani = animation.FuncAnimation(fig, update, frames=months, repeat=False)

# Save the animation to a file
ani.save('delinquency_animation.mp4', writer='ffmpeg', fps=1)

# Optionally, display the animation in the notebook (this might not work in all environments)
plt.close(fig)
from IPython.display import Video
Video('delinquency_animation.mp4')
