In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import linearmodels as lm
import statsmodels.api as sm
from linearmodels import PanelOLS
from linearmodels import RandomEffects
from linearmodels import PooledOLS
from linearmodels import FirstDifferenceOLS
from linearmodels import BetweenOLS
from linearmodels import FamaMacBeth
import sqlite3
from tqdm import tqdm
import dask.dataframe as dd
from dask import delayed, compute
from dask.diagnostics import ProgressBar

# Merge FM and HRCN to get smaller set (Only states with HRCN risk data)

In [3]:
db_path = "../Database/thesis_database.db"
conn = sqlite3.connect(db_path, check_same_thread=False)

In [4]:
hrcn_risk_agg = pd.read_sql("SELECT * FROM hrcn_risk_agg", conn)

In [5]:
# read'../../Data/mainland_usa_gdf_msa_aggregated.pkl'
mortgage_hrcn = {}
#Merge Mortgage data and Hurricane Data on MSA ||| This reduces the datasets by removing all unessential states data
pb = tqdm(range(1999, 2023))
for year in pb:
    pb.set_description(f"Merging {year} hrcn and mortgage data")
    query = f"""
    SELECT *
    FROM fm_{year}
    """
    df_year = pd.read_sql(query, conn) 
    merged_df = df_year.merge(hrcn_risk_agg, on='MSA', how='inner')
    # fm_combined = pd.concat([fm_combined, merged_df])
    key_name = f"fm_{year}"
    mortgage_hrcn[key_name] = merged_df
    del merged_df, df_year
#Runtime: 4:30

Merging 1999 hrcn and mortgage data:   0%|          | 0/24 [00:00<?, ?it/s]

Merging 2019 hrcn and mortgage data:  83%|████████▎ | 20/24 [03:55<00:47, 11.94s/it]

In [None]:
pb = tqdm(mortgage_hrcn.items())
for key, dataset in pb:
    pb.set_description(f"writing {key} to database")
    key_hrcn = f"{key}_hrcn"
    # Write the dataset to the database
    dataset.to_sql(key, conn, if_exists="replace", index=False)
    del dataset

writing fm_2022 to database: 100%|██████████| 24/24 [02:57<00:00,  7.38s/it]


# Combine Files

In [None]:
# Assuming you have already created a Dask SQL connection
# (if not, you'll need to set that up)
def fetch_and_merge(year):
    query = f"""
    SELECT *
    FROM fm_{year}_hrcn
    """
    merged_df = dd.from_pandas(pd.read_sql(query, conn), npartitions=10)  # Adjust npartitions based on your available cores and data size
    return merged_df

# Using list comprehension with Dask's delayed
results = [delayed(fetch_and_merge)(year) for year in range(1999, 2023)]

# Compute the results in parallel
with ProgressBar():
    merged_dataframes = compute(*results, scheduler='single-threaded')
# Ensure merged_dataframes is a list
merged_dataframes_list = list(merged_dataframes)
fm_combined = dd.concat(merged_dataframes_list)


[########################################] | 100% Completed | 237.19 s


In [None]:
fm_combined['FIRST_F'] = fm_combined['FIRST_F'].astype(str)
fm_combined.to_parquet('../Data/fm_combined.parquet', engine='pyarrow')

# Aggregate Dataset

In [None]:
fm_combined = dd.read_parquet('../Data/fm_combined.parquet')

### Aggregate on MSA and Date

In [None]:
# Step 1: Drop unnecessary columns
fm_combined = fm_combined.drop(columns=['ELTV', 'FPD', 'MD'], errors='ignore')
# Step 2: Create CLDS90 and CLDS180 columns
fm_combined['CLDS90'] = (fm_combined['CLDS'] == 3).astype(int)
fm_combined['CLDS180'] = (fm_combined['CLDS'] == 6).astype(int)
fm_combined['D90_month'] = 1
fm_combined['D180_month'] = 1


In [None]:
#fm_combined to parquet
fm_combined.to_parquet('../Data/fm_combined_2.parquet', engine='pyarrow')

In [None]:
# Define your function as before
def create_indicators(group):
    d90_date = group[group['CLDS90'] == 1]['Date'].min()
    d180_date = group[group['CLDS180'] == 1]['Date'].min()
    if pd.notnull(d90_date):
        group['D90_month'] = (group['Date'] <= d90_date).astype(int)
    if pd.notnull(d180_date):
        group['D180_month'] = (group['Date'] <= d180_date).astype(int)
    return group

# Use Dask's groupby and apply methods
with ProgressBar():
    fm_combined = fm_combined.groupby('LSN').apply(create_indicators, meta=fm_combined).compute(scheduler='threads')
#22 minutes

[########################################] | 100% Completed | 21m 27s


In [None]:
# Define a custom aggregation function
def custom_aggregation(group):
    agg_data = {
        'UNQ_LSN': group['LSN'].nunique(),
        'P_TYPE_MOST_FREQ': group['P_TYPE'].mode()[0],  # Using mode for most frequent in Pandas
        'D90': group[group['D90_month'] == 1]['CLDS90'].sum() / group[group['D90_month'] == 1]['LSN'].nunique(),
        'D180': group[group['D180_month'] == 1]['CLDS180'].sum() / group[group['D180_month'] == 1]['LSN'].nunique()
    }
    
    # Add mean for all other columns
    for col in group.columns:
        if col not in ['MSA', 'Date', 'LSN', 'P_TYPE', 'CLDS90', 'CLDS180', 'D90_month', 'D180_month']:
            try:
                agg_data[col] = group[col].mean()
            except:
                pass

    return pd.Series(agg_data)

# Wrap the groupby object with tqdm for progress bar
tqdm.pandas(desc="Aggregating Data")
aggregated = fm_combined.groupby(['MSA', 'Date']).progress_apply(custom_aggregation).reset_index()
aggregated['Date'] = pd.to_datetime(aggregated['Date'])
# Runtime: 

Aggregating Data: 100%|██████████| 77428/77428 [03:41<00:00, 349.91it/s]


In [None]:
del fm_combined

In [None]:
aggregated.head()

Unnamed: 0,MSA,Date,UNQ_LSN,P_TYPE_MOST_FREQ,D90,D180,CLDS,AGE,CIR,DDD,...,HRCN_EALA,HRCN_EALT,HRCN_EALS,HRCN_ALRB,HRCN_ALRP,HRCN_ALRA,HRCN_ALR_N,HRCN_RISKV,HRCN_RISKS,HRCN_EALS_Norm
0,10180,1999-03-01,1,SF,0.0,0.0,0.0,1.0,6.625,0.0,...,3450.990886,67062.792715,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395
1,10180,1999-04-01,1,SF,0.0,0.0,0.0,2.0,6.625,0.0,...,3450.990886,67062.792715,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395
2,10180,1999-05-01,1,SF,0.0,0.0,0.0,3.0,6.625,0.0,...,3450.990886,67062.792715,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395
3,10180,1999-06-01,2,SF,0.0,0.0,0.0,2.5,7.0625,0.0,...,3450.990886,67062.792715,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395
4,10180,1999-07-01,3,SF,0.0,0.0,0.0,2.333333,7.291667,0.0,...,3450.990886,67062.792715,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395


In [None]:
#save aggregated to parquet
aggregated.to_parquet('../Data/aggregated.parquet', engine='pyarrow')

### Aggreagate on Loan

In [None]:
# read fm_combined.to_parquet('../Data/fm_combined_2.parquet', engine='pyarrow')
fm_combined = dd.read_parquet('../Data/fm_combined.parquet')

In [None]:
# Create D90 and D180 indicator variables
fm_combined['D90'] = (fm_combined['CLDS'] == 3).astype(int)
fm_combined['D180'] = (fm_combined['CLDS'] == 6).astype(int)
 
agg_loan = fm_combined
del fm_combined


In [None]:
# Adjusting to capture only the first occurrence of D90 and D180 for each LSN
aggregation = {
    'MSA': 'first',  # Assuming it's constant for a given LSN
    'FPD': 'first',
    'FIRST_F': 'first',
    'MD': 'first',
    'POSTAL': 'first',
    'P_TYPE': 'first',
    'D90': 'max',
    'D180': 'max',
    'HRCN_RISK_CATEGORY_QUANTILE': 'first',
    'Date' : 'last'
}
# Exclude specific columns from aggregation
exclude_cols = ['LSN', 'Date', 'MSA', 'FPD', 'FIRST_F', 'MD', 'POSTAL', 'P_TYPE', 'D90', 'D180', 'HRCN_RISK_CATEGORY_QUANTILE']
# Add the remaining columns to the aggregation dictionary with mean function
for col in agg_loan.columns:
    if col not in exclude_cols:
        aggregation[col] = 'mean'
# Perform aggregation
agg_loan = agg_loan.groupby('LSN').agg(aggregation).reset_index()

In [None]:
agg_loan = agg_loan.compute()

In [None]:
# Convert MD and FPD to date format with %Y%m
agg_loan['MD'] = pd.to_datetime(agg_loan['MD'].astype(str), format='%Y%m')
agg_loan['FPD'] = pd.to_datetime(agg_loan['FPD'].astype(str), format='%Y%m')
agg_loan['Date'] = pd.to_datetime(agg_loan['Date'])


In [None]:
#average number of D90 = 1 loans
display(agg_loan['D90'].mean())
#unique MSA
display(agg_loan['MSA'].nunique())

0.05431396688674988

270

In [None]:
#to parquet
agg_loan.to_parquet('../Data/agg_loan.parquet', engine='pyarrow')

# Modelling on MSA and Date

In [None]:
aggregated = dd.read_parquet('../Data/aggregated.parquet')

In [None]:
aggregated.head()

Unnamed: 0,MSA,Date,UNQ_LSN,P_TYPE_MOST_FREQ,D90,D180,CLDS,AGE,CIR,DDD,...,HRCN_EALA,HRCN_EALT,HRCN_EALS,HRCN_ALRB,HRCN_ALRP,HRCN_ALRA,HRCN_ALR_N,HRCN_RISKV,HRCN_RISKS,HRCN_EALS_Norm
0,10180,1999-03-01,1,SF,0.0,0.0,0.0,1.0,6.625,0.0,...,3450.990886,67062.792715,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395
1,10180,1999-04-01,1,SF,0.0,0.0,0.0,2.0,6.625,0.0,...,3450.990886,67062.792715,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395
2,10180,1999-05-01,1,SF,0.0,0.0,0.0,3.0,6.625,0.0,...,3450.990886,67062.792715,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395
3,10180,1999-06-01,2,SF,0.0,0.0,0.0,2.5,7.0625,0.0,...,3450.990886,67062.792715,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395
4,10180,1999-07-01,3,SF,0.0,0.0,0.0,2.333333,7.291667,0.0,...,3450.990886,67062.792715,29.528751,6e-06,1.386015e-09,8.7e-05,31.546332,82619.358078,31.493943,-0.872395


In [None]:
db_path = "../Database/thesis_database.db"
conn = sqlite3.connect(db_path, check_same_thread=False)

In [None]:
aggregated_df = aggregated.compute()
aggregated_df['Date'] = pd.to_datetime(aggregated_df['Date'])

### HPI

In [None]:
#loag hpi master
hpi_master = pd.read_excel('../Data/HPI_master.xls', dtype={'place_id': str, 'yr': int, 'period': int, 'index_nsa': float, 'quarter': str})
#keep place_id, yr, period and index_nsa
hpi_master = hpi_master[['place_id', 'yr', 'period', 'index_nsa', 'quarter']]
#add period column to aggregated_df for each quarter
aggregated_df['quarter'] = aggregated_df['Date'].dt.year.astype(str) + "Q" + aggregated_df['Date'].dt.quarter.astype(str)
#merge aggregated_df and hpi_master on place_id and period only keeping index_nsa
aggregated_df = aggregated_df.merge(hpi_master, left_on=['MSA','quarter'], right_on=['place_id', 'quarter'], how='left')

####Missing 2023q1 of 48060 and 13220. Moreover missing data from 19260 and 41780. Thats all!

### Load and merge MEI

In [None]:
#Get enso_mei
query = """
SELECT
    *
FROM enso_mei;
"""
enso_mei = pd.read_sql_query(query, conn)
enso_mei['Date'] = pd.to_datetime(enso_mei['Date'])
#merge enso_mei with aggregated_df
aggregated_df = aggregated_df.merge(enso_mei, on='Date', how='left')

### Load and merge Unemployment

In [None]:
#load UNRATE.csv by ; and merge with aggregated_df on Date
unrate = pd.read_csv('../Data/UNRATE.csv', sep=';')
#Convert Date to datetime format 01/01/1948
unrate['Date'] = pd.to_datetime(unrate['DATE'], format= '%d/%m/%Y')
#Only merge UNRATE column with aggregated_df
unrate = unrate[['Date', 'UNRATE']]
aggregated_df = aggregated_df.merge(unrate, how='left', on='Date')


## Regression Models

In [None]:
from linearmodels.panel import PanelOLS, RandomEffects
# Convert the dataset into a panel structure
fm_agg_model = aggregated_df.set_index(['MSA', 'Date'])

# Define dependent variable and independent variables
dependent_var = fm_agg_model['D90']*100
fm_agg_model['DHRI'] = fm_agg_model['MEI'] * ((fm_agg_model['HRCN_HLRA']- fm_agg_model['HRCN_HLRA'].mean()) / fm_agg_model['HRCN_HLRA'].std())
fm_agg_model['DHRI2'] = fm_agg_model['MEI'] * fm_agg_model['HRCN_EALS']
fm_agg_model['MEI2'] = fm_agg_model['MEI']**2

In [None]:
from linearmodels.panel.results import compare
from scipy import stats
# Assuming df is your dataframe and 'MSA' and 'Date' are your indices
exog_vars = ['LTV', 'MEI','DHRI','UNRATE', 'OIR', 'index_nsa']
exog = fm_agg_model[exog_vars]
mod_fe = PanelOLS(fm_agg_model.D90, exog, entity_effects=True)
fe_res = mod_fe.fit(cov_type='clustered', cluster_entity=True)

mod_re = RandomEffects(fm_agg_model.D90, exog)
re_res = mod_re.fit()

print(compare({'FE': fe_res, 'RE': re_res}))


# Extract coefficients and variance-covariance matrices
b_fe = fe_res.params
b_re = re_res.params
var_fe = fe_res.cov
var_re = re_res.cov

# Hausman test statistic
diff = b_fe - b_re
diff_var = var_fe - var_re
m = np.dot(diff.T, np.linalg.solve(diff_var, diff))

# Degree of freedom is number of regressors (including intercept if it exists)
df = len(b_fe)

# P-value from chi-squared distribution
p_val = 1 - stats.chi2.cdf(m, df)

print(f'Hausman Test Statistic: {m}')
print(f'P-value: {p_val}')

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


                    Model Comparison                    
                                    FE                RE
--------------------------------------------------------
Dep. Variable                      D90               D90
Estimator                     PanelOLS     RandomEffects
No. Observations                 99729             99729
Cov. Est.                    Clustered        Unadjusted
R-squared                       0.0154            0.0298
R-Squared (Within)              0.0154            0.0152
R-Squared (Between)            -2.4882            0.8070
R-Squared (Overall)            -0.0902            0.0505
F-statistic                     259.67            509.68
P-value (F-stat)                0.0000            0.0000
LTV                          8.339e-07        -1.882e-05
                              (0.0952)         (-9.5686)
MEI                         -8.461e-06        -1.491e-05
                             (-0.3927)         (-0.7795)
DHRI                        -6.

### VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
#remove NA rows fm_agg_model
fm_agg_model_na = fm_agg_model.dropna()
exog_vars = ['LTV', 'HRCN_HLRA','MEI','DHRI','UNRATE', 'OIR', 'index_nsa']
X = add_constant(fm_agg_model_na[exog_vars])
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
for i, v in enumerate(vif):
    print(f"VIF for {X.columns[i]}: {v}")


VIF for const: 524.9177312569249
VIF for LTV: 1.1479399588502743
VIF for HRCN_HLRA: 1.1727537261021963
VIF for MEI: 1.031426719909225
VIF for DHRI: 1.1611125332632917
VIF for UNRATE: 1.1090298678216541
VIF for OIR: 1.6583117109759138
VIF for index_nsa: 1.7452613455149235


### Robuts SE test

### Autocorrelation

### Fixed Effects

In [None]:
# Run a fixed effects regression
exog_vars = ['MEI','DHRI','LTV', 'UNRATE', 'OIR','index_nsa']
exog = fm_agg_model[exog_vars]
mod_fe = PanelOLS(dependent_var, exog, entity_effects=True)
fe_res = mod_fe.fit()
fe_res

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


0,1,2,3
Dep. Variable:,D90,R-squared:,0.0154
Estimator:,PanelOLS,R-squared (Between):,-2.4926
No. Observations:,99729,R-squared (Within):,0.0154
Date:,"Wed, Nov 01 2023",R-squared (Overall):,-0.0907
Time:,14:37:24,Log-likelihood,-7.364e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,259.44
Entities:,268,P-value,0.0000
Avg Obs:,372.12,Distribution:,"F(6,99455)"
Min Obs:,222.00,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
MEI,-0.0009,0.0019,-0.4511,0.6519,-0.0046,0.0029
DHRI,-0.0062,0.0019,-3.3687,0.0008,-0.0099,-0.0026
LTV,9.917e-05,0.0005,0.1860,0.8525,-0.0009,0.0011
UNRATE,0.0329,0.0009,37.428,0.0000,0.0311,0.0346
OIR,0.0106,0.0019,5.6762,0.0000,0.0070,0.0143
index_nsa,0.0002,4.433e-05,4.6601,0.0000,0.0001,0.0003


In [None]:
# Run a fixed effects regression
exog_vars = ['HRCN_EALS', 'HRCN_AFREQ', 'LTV', 'UNRATE', 'OIR', 'index_nsa']
exog = fm_agg_model[exog_vars]
mod_fe = PanelOLS(dependent_var, exog, entity_effects=True)
fe_res = mod_fe.fit()
fe_res

In [None]:
# Run a fixed effects regression
exog_vars = ['DHRI', 'LTV', 'UNRATE', 'OIR', 'index_nsa']
exog = fm_agg_model[exog_vars]
mod_fe = PanelOLS(dependent_var, exog, entity_effects=True)
fe_res = mod_fe.fit()
fe_res

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


0,1,2,3
Dep. Variable:,D90,R-squared:,0.0154
Estimator:,PanelOLS,R-squared (Between):,-2.5202
No. Observations:,99729,R-squared (Within):,0.0154
Date:,"Mon, Oct 30 2023",R-squared (Overall):,-0.0920
Time:,13:13:50,Log-likelihood,-7.364e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,311.28
Entities:,268,P-value,0.0000
Avg Obs:,372.12,Distribution:,"F(5,99456)"
Min Obs:,222.00,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
DHRI,-0.0062,0.0019,-3.3651,0.0008,-0.0099,-0.0026
LTV,8.826e-05,0.0005,0.1657,0.8684,-0.0010,0.0011
UNRATE,0.0329,0.0009,37.663,0.0000,0.0312,0.0346
OIR,0.0108,0.0018,5.8394,0.0000,0.0072,0.0144
index_nsa,0.0002,4.285e-05,4.9412,0.0000,0.0001,0.0003


In [None]:
# Run a fixed effects regression
exog_vars = ['HRCN_EALS', 'HRCN_AFREQ', 'LTV', 'UNRATE', 'OIR', 'index_nsa']
exog = fm_agg_model[exog_vars]
mod_fe = PanelOLS(dependent_var, exog, entity_effects=True)
fe_res = mod_fe.fit()
fe_res

### Random Effects

DHRI

In [None]:
# Run a random effects regression
exog_vars = ['MEI', 'MEI2','LTV', 'UNRATE', 'OIR', 'index_nsa']
exog = fm_agg_model[exog_vars]
mod_re = RandomEffects(dependent_var, exog)
re_res = mod_re.fit()
re_res

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


0,1,2,3
Dep. Variable:,D90,R-squared:,0.0293
Estimator:,RandomEffects,R-squared (Between):,0.8022
No. Observations:,99729,R-squared (Within):,0.0152
Date:,"Fri, Nov 03 2023",R-squared (Overall):,0.0502
Time:,10:48:50,Log-likelihood,-7.383e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,500.83
Entities:,268,P-value,0.0000
Avg Obs:,372.12,Distribution:,"F(6,99723)"
Min Obs:,222.00,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
MEI,-0.0030,0.0020,-1.4710,0.1413,-0.0070,0.0010
MEI2,-0.0036,0.0015,-2.3850,0.0171,-0.0066,-0.0006
LTV,-0.0019,0.0002,-9.6578,0.0000,-0.0023,-0.0015
UNRATE,0.0312,0.0008,38.638,0.0000,0.0296,0.0328
OIR,0.0075,0.0017,4.5125,0.0000,0.0042,0.0108
index_nsa,0.0001,3.524e-05,3.2628,0.0011,4.591e-05,0.0002


In [None]:
# Run a random effects regression
exog_vars = ['MEI','HRCN_HLRA','DHRI', 'LTV', 'UNRATE', 'OIR', 'index_nsa']
exog = fm_agg_model[exog_vars]
mod_re = RandomEffects(dependent_var, exog)
re_res = mod_re.fit()
re_res

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


0,1,2,3
Dep. Variable:,D90,R-squared:,0.0299
Estimator:,RandomEffects,R-squared (Between):,0.8109
No. Observations:,99729,R-squared (Within):,0.0152
Date:,"Fri, Nov 03 2023",R-squared (Overall):,0.0506
Time:,10:49:03,Log-likelihood,-7.382e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,438.56
Entities:,268,P-value,0.0000
Avg Obs:,372.12,Distribution:,"F(7,99722)"
Min Obs:,222.00,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
MEI,-0.0017,0.0019,-0.8634,0.3879,-0.0054,0.0021
HRCN_HLRA,0.3932,0.1669,2.3557,0.0185,0.0661,0.7204
DHRI,-0.0069,0.0019,-3.7144,0.0002,-0.0105,-0.0033
LTV,-0.0019,0.0002,-9.6338,0.0000,-0.0023,-0.0015
UNRATE,0.0308,0.0008,38.500,0.0000,0.0292,0.0324
OIR,0.0070,0.0017,4.1974,0.0000,0.0037,0.0103
index_nsa,8.425e-05,3.553e-05,2.3708,0.0177,1.46e-05,0.0002


HRCN

In [None]:
fm_agg_model.columns

Index(['UNQ_LSN', 'P_TYPE_MOST_FREQ', 'D90', 'D180', 'CLDS', 'AGE', 'CIR',
       'DDD', 'CS', 'MIP', 'CLTV', 'DTI', 'LTV', 'OIR', 'OLT', 'HRCN_EVNTS',
       'HRCN_AFREQ', 'HRCN_EXP_A', 'HRCN_EXPB', 'HRCN_EXPP', 'HRCN_EXPPE',
       'HRCN_EXPA', 'HRCN_EXPT', 'HRCN_HLRB', 'HRCN_HLRP', 'HRCN_HLRA',
       'HRCN_EALB', 'HRCN_EALP', 'HRCN_EALPE', 'HRCN_EALA', 'HRCN_EALT',
       'HRCN_EALS', 'HRCN_ALRB', 'HRCN_ALRP', 'HRCN_ALRA', 'HRCN_ALR_N',
       'HRCN_RISKV', 'HRCN_RISKS', 'HRCN_EALS_Norm', 'quarter', 'place_id',
       'yr', 'period', 'index_nsa', 'Year', 'Month', 'MEI', 'Month_num',
       'UNRATE', 'DHRI', 'DHRI2', 'MEI2'],
      dtype='object')

In [None]:
# Run a random effects regression
exog_vars = ['HRCN_EALS', 'HRCN_AFREQ', 'LTV', 'UNRATE', 'OIR', 'index_nsa']
exog = fm_agg_model[exog_vars]
mod_re = RandomEffects(dependent_var, exog)
re_res = mod_re.fit()
re_res

KeyError: "['HRCN_LRB'] not in index"

### Mixed Model

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
aggregated_df_mm = aggregated_df.copy()
#drop rows with index_nsa nan
aggregated_df_mm = aggregated_df_mm.dropna(subset=['index_nsa'])
# Prepare the dataset
aggregated_df_mm['D90_scaled'] = aggregated_df_mm['D90'] * 100
aggregated_df_mm['DHRI'] = aggregated_df_mm['MEI'] * ((aggregated_df_mm['HRCN_EALS'] - aggregated_df_mm['HRCN_EALS'].mean()) / aggregated_df_mm['HRCN_EALS'].std())
# Create a formula for the mixed model with fixed effects for Date
formula = "D90_scaled ~ DHRI + LTV + UNRATE + OIR + C(Year) + index_nsa"
# Fit the mixed model with random intercepts for each 3ZIP
mixed_model = smf.mixedlm(formula, aggregated_df_mm, groups=aggregated_df_mm['MSA'])
mixed_result = mixed_model.fit()
print(mixed_result.summary())



          Mixed Linear Model Regression Results
Model:             MixedLM Dependent Variable: D90_scaled 
No. Observations:  99729   Method:             REML       
No. Groups:        268     Scale:              0.2557     
Min. group size:   222     Log-Likelihood:     -73752.9047
Max. group size:   1160    Converged:          Yes        
Mean group size:   372.1                                  
----------------------------------------------------------
                Coef.  Std.Err.   z    P>|z| [0.025 0.975]
----------------------------------------------------------
Intercept       -0.372    0.073 -5.124 0.000 -0.514 -0.230
C(Year)[T.2000] -0.022    0.013 -1.656 0.098 -0.048  0.004
C(Year)[T.2001]  0.010    0.013  0.782 0.434 -0.016  0.036
C(Year)[T.2002]  0.019    0.013  1.514 0.130 -0.006  0.044
C(Year)[T.2003]  0.055    0.013  4.332 0.000  0.030  0.079
C(Year)[T.2004]  0.067    0.014  4.838 0.000  0.040  0.094
C(Year)[T.2005]  0.104    0.015  6.865 0.000  0.074  0.134
C(Year)[

In [None]:
# Run a random effects regression
exog_vars = ['HRCN_EALS', 'HRCN_AFREQ', 'LTV', 'UNRATE', 'OIR', 'index_nsa']
exog = fm_agg_model[exog_vars]
mod_re = RandomEffects(dependent_var, exog)
re_res = mod_re.fit()
re_res

# Modelling on Loan Level

In [2]:
agg_loan = dd.read_parquet('../Data/agg_loan.parquet')

In [3]:
agg_loan.head()

Unnamed: 0,LSN,MSA,FPD,FIRST_F,MD,POSTAL,P_TYPE,D90,D180,HRCN_RISK_CATEGORY_QUANTILE,...,HRCN_EALA,HRCN_EALT,HRCN_EALS,HRCN_ALRB,HRCN_ALRP,HRCN_ALRA,HRCN_ALR_N,HRCN_RISKV,HRCN_RISKS,HRCN_EALS_Norm
0,F99Q10000029,10420,2002-10-01,0,2029-02-01,44200,SF,0,0,Relatively Moderate,...,3899.622027,359481.079592,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308
1,F99Q10002396,10420,1999-03-01,0,2029-02-01,44200,SF,0,0,Relatively Moderate,...,3899.622027,359481.079592,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308
2,F99Q10043324,10420,1999-03-01,0,2029-02-01,44200,SF,0,0,Relatively Moderate,...,3899.622027,359481.079592,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308
3,F99Q10099411,10420,1999-05-01,0,2029-04-01,44200,CO,0,0,Relatively Moderate,...,3899.622027,359481.079592,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308
4,F99Q10102136,10420,1999-04-01,0,2029-03-01,44200,SF,0,0,Relatively Moderate,...,3899.622027,359481.079592,55.923044,6e-06,1.453149e-09,0.000121,29.853005,358143.40386,55.720054,0.114308


In [4]:
db_path = "../Database/thesis_database.db"
conn = sqlite3.connect(db_path, check_same_thread=False)

### HPI

In [5]:
# Create the FPD_quarter and MD_quarter variables
agg_loan['FPD_quarter'] = agg_loan['FPD'].dt.year.astype(str) + 'Q' + agg_loan['FPD'].dt.quarter.astype(str)
agg_loan['Last_quarter'] = agg_loan['Date'].dt.year.astype(str) + 'Q' + agg_loan['MD'].dt.quarter.astype(str)


In [6]:
hpi_master = pd.read_excel('../Data/HPI_master.xlsx', sheet_name= "HPI_master",dtype={'place_id': str, 'yr': int, 'period': int, 'index_nsa': float, 'quarter': str})

In [8]:
agg_loan = agg_loan.merge(hpi_master[['place_id', 'quarter', 'index_nsa']], left_on = ['MSA', 'FPD_quarter'], right_on = ['place_id', 'quarter'] ,how='left').rename(columns={'index_nsa': 'HPI_FPD'})
agg_loan = agg_loan.merge(hpi_master[['place_id', 'quarter', 'index_nsa']], left_on = ['MSA', 'Last_quarter'], right_on = ['place_id', 'quarter'] ,how='left').rename(columns={'index_nsa': 'HPI_Last'})
agg_loan['HPI'] = agg_loan['HPI_Last'] - agg_loan['HPI_FPD']

In [9]:
agg_loan[['FPD_quarter', 'HPI_FPD', 'Last_quarter', 'HPI_Last', 'HPI']].head()

Unnamed: 0,FPD_quarter,HPI_FPD,Last_quarter,HPI_Last,HPI
0,2002Q4,139.84,2004Q1,144.52,4.68
1,1999Q1,122.26,2010Q1,135.21,12.95
2,1999Q1,122.26,2002Q1,136.77,14.51
3,1999Q2,123.53,2002Q2,136.99,13.46
4,1999Q2,123.53,2003Q1,140.76,17.23


### MEI

In [11]:
query = """
SELECT * FROM enso_mei;
"""
enso_mei = pd.read_sql_query(query, conn)


In [12]:
enso_mei['Date'] = pd.to_datetime(enso_mei['Date'])

In [13]:
agg_loan_df = agg_loan.compute()

In [15]:
#Create a function that gets the average of the MEI between two dates
tqdm.pandas()
def get_avg_mei(start_date, end_date):
    avg_mei = enso_mei[(enso_mei['Date'] >= start_date) & (enso_mei['Date'] <= end_date)]['MEI'].mean()
    return avg_mei
#apply to agg_loan in new column called MEI 
agg_loan_df['MEI'] = agg_loan_df.progress_apply(lambda x: get_avg_mei(x['FPD'], x['MD']), axis=1)

100%|██████████| 456494/456494 [01:22<00:00, 5546.67it/s]


In [16]:
agg_loan_df['DHRI'] = agg_loan_df['MEI'] * (agg_loan_df['HRCN_HLRA'] - agg_loan_df['HRCN_HLRA'].mean()) / agg_loan_df['HRCN_HLRA'].std()

### UNRATE

### Logit Models

In [17]:
del agg_loan
dependent_var = agg_loan_df['D90']

In [18]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Assuming fm_agg_model is your DataFrame and dependent_var is your dependent variable
formula = 'dependent_var ~ DHRI + LTV + HPI + OIR + HRCN_ALRA'
mod_pl = smf.logit(formula, data=agg_loan_df)
pl_res = mod_pl.fit()
print(pl_res.summary())

Optimization terminated successfully.
         Current function value: 0.205654
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:          dependent_var   No. Observations:               389920
Model:                          Logit   Df Residuals:                   389914
Method:                           MLE   Df Model:                            5
Date:                Sun, 05 Nov 2023   Pseudo R-squ.:                 0.05026
Time:                        17:54:10   Log-Likelihood:                -80188.
converged:                       True   LL-Null:                       -84432.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -6.3273      0.046   -138.649      0.000      -6.417      -6.238
DHRI           0.0708      0.

In [32]:
type(agg_loan_df)

pandas.core.frame.DataFrame

In [19]:
agg_loan_df.dtypes
#Set MSA as category, FIRST_F as category, P_TYPE as category, Date as category
agg_loan_df['MSA'] = agg_loan_df['MSA'].astype('category')
agg_loan_df['FIRST_F'] = agg_loan_df['FIRST_F'].astype('category')
agg_loan_df['P_TYPE'] = agg_loan_df['P_TYPE'].astype('category')
agg_loan_df['Date'] = agg_loan_df['Date'].astype('category')
#remove string columns
agg_loan_df.dtypes


LSN                            string[pyarrow]
MSA                                   category
FPD                             datetime64[ns]
FIRST_F                               category
MD                              datetime64[ns]
POSTAL                         string[pyarrow]
P_TYPE                                category
D90                                      int64
D180                                     int64
HRCN_RISK_CATEGORY_QUANTILE    string[pyarrow]
Date                                  category
CLDS                                   float64
AGE                                    float64
CIR                                    float64
ELTV                                   float64
DDD                                    float64
CS                                     float64
MIP                                    float64
CLTV                                   float64
DTI                                    float64
LTV                                    float64
OIR          

In [20]:
#remove datatime columns
agg_loan_df = agg_loan_df.drop(columns=['FPD', 'MD'], errors='ignore')

In [21]:
string_cols = agg_loan_df.select_dtypes(include=['string']).columns
#remove string columns
agg_loan_xgb = agg_loan_df.drop(columns=string_cols)

### XGBoost

In [28]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import numpy as np
import warnings
# Suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
# Assuming agg_loan_xgb is your dataframe and it's already loaded
# Let's say the 'D90' column is your binary target and the rest are features

# Splitting the data into features and target
X = agg_loan_xgb.drop('D90', axis=1)
y = agg_loan_xgb['D90']

# Splitting the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Create the XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', enable_categorical=True)

# Hyperparameter tuning with cross-validation
param_grid = {
    'max_depth': [3, 4, 5, 10, 20],
}

# Including "DHRI" in the model; assume that "DHRI" is continuous.
X_train['DHRI'] = X_train['DHRI'].astype(float)


# GridSearchCV with the specified parameter grid
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(
    X_train, 
    y_train  # Call the callback function every 50 rounds
)

# Best hyperparameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Train the final model with the best parameters
final_model = xgb.XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')
final_model.fit(X_train, y_train)

# Predictions
y_pred = final_model.predict(X_test)

# Performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')


Fitting 5 folds for each of 5 candidates, totalling 25 fits


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

[CV] END ........................................max_depth=3; total time=   4.1s
[CV] END ........................................max_depth=3; total time=   4.2s
[CV] END ........................................max_depth=3; total time=   4.2s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

[CV] END ........................................max_depth=3; total time=   4.4s
[CV] END ........................................max_depth=3; total time=   4.4s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

[CV] END ........................................max_depth=4; total time=   5.2s
[CV] END ........................................max_depth=4; total time=   5.2s
[CV] END ........................................max_depth=4; total time=   5.4s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

[CV] END ........................................max_depth=4; total time=   4.8s
[CV] END ........................................max_depth=4; total time=   4.9s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

[CV] END ........................................max_depth=5; total time=   5.5s
[CV] END ........................................max_depth=5; total time=   5.3s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


[CV] END ........................................max_depth=5; total time=   5.6s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

[CV] END ........................................max_depth=5; total time=   5.6s
[CV] END ........................................max_depth=5; total time=   5.6s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


[CV] END .......................................max_depth=10; total time=   6.6s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

[CV] END .......................................max_depth=10; total time=   7.0s
[CV] END .......................................max_depth=10; total time=   7.1s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

[CV] END .......................................max_depth=10; total time=   7.3s
[CV] END .......................................max_depth=10; total time=   7.3s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


[CV] END .......................................max_depth=20; total time=   7.2s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


[CV] END .......................................max_depth=20; total time=   6.9s
[CV] END .......................................max_depth=20; total time=   7.2s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

[CV] END .......................................max_depth=20; total time=   6.7s


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


[CV] END .......................................max_depth=20; total time=   5.2s
Best parameters found:  {'max_depth': 20}


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:MSA: category, FIRST_F: category, P_TYPE: category, Date: category

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import numpy as np

# Assuming agg_loan is your dataframe and it's already loaded
# Let's say the 'D90' column is your binary target and the rest are features

# Splitting the data into features and target
X = agg_loan.drop('D90', axis=1)
y = agg_loan['D90']

# Splitting the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# AdaBoost does not inherently have a feature selection mechanism like XGBoost,
# so to "force" a feature like DHRI, you might consider feature engineering techniques
# or include it in every set of hyperparameters if it was categorical. Since it's not,
# AdaBoost will naturally give it a weight in the model training process.

# Create the AdaBoost model
model = AdaBoostClassifier()

# Hyperparameter tuning with cross-validation
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0]
}

# GridSearchCV with the specified parameter grid
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Train the final model with the best parameters
final_model = AdaBoostClassifier(**best_params)
final_model.fit(X_train, y_train)

# Predictions
y_pred = final_model.predict(X_test)

# Performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')


# Add geometry

In [47]:
#load geometry_key.to_pickle('../../Data/geometry_key.pkl')
geometry_key = pd.read_pickle('../Data/geometry_key.pkl')

In [48]:
#merge with aggregated_df on MSA
aggregated_df_geo = aggregated_df_mm.merge(geometry_key, on='MSA', how='left')

# Gekloot

In [52]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Assume 'aggregated' is your DataFrame and 'geometry_data' is your GeoDataFrame with MSA geometries


specific_month = '2022-05'  # Replace with the month you are interested in
# Step 1: Filter the dataset for the specific month
filtered_data = aggregated_df_geo[aggregated_df_geo['Date'] == specific_month]
#select only D90 is not 0
filtered_data = filtered_data[(filtered_data['D90'] != 0)]


In [109]:
#Extract MEI timeseries on unique date
mei_ts = aggregated_df[['Date', 'MEI']].drop_duplicates()

In [129]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.colors import ListedColormap
import matplotlib.gridspec as gridspec

aggregated_df_geo.loc[:,'DHRI'] = aggregated_df_geo['MEI'] * ((aggregated_df_geo['HRCN_EALS']- aggregated_df_geo['HRCN_EALS'].mean()) / aggregated_df_geo['HRCN_EALS'].std())
 
# ... your data loading code here ...
bin_edges = np.linspace(aggregated_df_geo['DHRI'].min(), aggregated_df_geo['DHRI'].max(), 7)
# Function to update the plot for each month
anim_df = aggregated_df_geo[['MSA', 'Date', 'DHRI', 'geometry']].copy()
#merge geometry_key with aggregated_df_geo['MSA', 'D90'] on MSA if MSA not foung in aggreagted_df then set D90 to 0
def update(month):
    ax1.clear()
    ax2.clear()

    #Update MSA plot
    filtered_data = anim_df[anim_df['Date'] == month].copy()
    # Use cut instead of qcut
    filtered_data.loc[:, 'DHRI_cat'] = pd.cut(
        filtered_data['DHRI'], 
        bins=bin_edges, 
        labels=['Very Low', "Relatively Low", 'Relatively Moderate', 'Moderate', 'Relatively High', 'Very High'],
        include_lowest=True
    )
    merged_geo_data = gpd.GeoDataFrame(filtered_data, geometry='geometry')  # Ensure the result is a GeoDataFrame
    
    ax1.set_title(f"DHRI (USA) {month}", fontdict={'fontsize': '25', 'fontweight' : '3'})
    merged_geo_data.plot(column='DHRI_cat', ax=ax1, cmap=custom_cmap, legend=True)
    ax1.set_xticks([])
    ax1.set_yticks([])
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['bottom'].set_visible(False)
    ax1.spines['left'].set_visible(False)

    # Updating MEI timeseries plot (ax2)
    mei_data = mei_ts[mei_ts['Date'] <= month]
    ax2.plot(mei_data['Date'], mei_data['MEI'], '-o')
    ax2.set_title("MEI Timeseries", fontdict={'fontsize': '15', 'fontweight' : '3'})
    ax2.set_xlim([dates.min(), dates.max()])
    ax2.set_xlabel("Date")
    ax2.set_ylabel("MEI")
    ax2.grid(True)

# List of months to animate through
months = pd.date_range(start='2005-01', end='2010-05', freq='M').strftime('%Y-%m')
dates = pd.to_datetime(months)

# Set up the plot
fig = plt.figure(figsize=(10, 15))
gs = gridspec.GridSpec(2, 1, height_ratios=[3, 1])
ax1 = plt.subplot(gs[0])
ax2 = plt.subplot(gs[1])
colors = ['#08306b', '#4292c6', '#fdae6b', '#f16913', '#67000d']
custom_cmap = ListedColormap(colors)


# Create the animation
ani = animation.FuncAnimation(fig, update, frames=months, repeat=False)

# Save the animation to a file
ani.save('delinquency_animation.mp4', writer='ffmpeg', fps=1)

# Optionally, display the animation in the notebook (this might not work in all environments)
plt.close(fig)
from IPython.display import Video
Video('delinquency_animation.mp4')
