# Probability of Default Model

Here we will assess the dynamic hurricane risk index and its effect it has on the probability of default. The goal is to measure the effect of the DHRI on the probability of default. Moreover, we will compare this model to a model using actual HRCN frequency data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import linearmodels as lm
import statsmodels.api as sm
from linearmodels import PanelOLS
from linearmodels import RandomEffects
from linearmodels import PooledOLS
from linearmodels import FirstDifferenceOLS
from linearmodels import BetweenOLS
from linearmodels import FamaMacBeth
import sqlite3

Set Parameters:

In [2]:
db_path = "../Database/thesis_database.db"
conn = sqlite3.connect(db_path)

In [52]:
#query that binds fm_2022 and fm_2021
query = "SELECT * FROM fm_2022 UNION ALL SELECT * FROM fm_2021 UNION ALL SELECT * FROM fm_2020 UNION ALL SELECT * FROM fm_2019 UNION ALL SELECT * FROM fm_2018"

#load mainland_usa_gdf_HRCN.csv
hrcn_data = pd.read_csv('mainland_usa_gdf_HRCN.csv', dtype={'3ZIP':str})

fm_test = pd.read_sql_query(query, conn)
fm_test.head()



Unnamed: 0,MRP,LSN,CLDS,AGE,MONTS_REM,CIR,ELTV,DDD,CS,FPD,...,DTI,LTV,OIR,P_TYPE,POSTAL,OLT,Date,Year,Month,MEI
0,202202,F22Q10000012,0,0,180,2.625,57,,768,202203,...,28,57,2.625,SF,12500,180,202202,2022,2,-1.28
1,202203,F22Q10000012,0,1,179,2.625,48,,768,202203,...,28,57,2.625,SF,12500,180,202203,2022,3,-1.76
2,202204,F22Q10000012,0,2,178,2.625,52,,768,202203,...,28,57,2.625,SF,12500,180,202204,2022,4,-1.88
3,202205,F22Q10000012,0,3,177,2.625,40,,768,202203,...,28,57,2.625,SF,12500,180,202205,2022,5,-2.07
4,202206,F22Q10000012,0,4,176,2.625,39,,768,202203,...,28,57,2.625,SF,12500,180,202206,2022,6,-2.1


## Create 3ZIP and remove unnecessary data

In [53]:
fm_test['3ZIP'] = fm_test['POSTAL'].astype(str).str[:3]
fm_test.head()

Unnamed: 0,MRP,LSN,CLDS,AGE,MONTS_REM,CIR,ELTV,DDD,CS,FPD,...,LTV,OIR,P_TYPE,POSTAL,OLT,Date,Year,Month,MEI,3ZIP
0,202202,F22Q10000012,0,0,180,2.625,57,,768,202203,...,57,2.625,SF,12500,180,202202,2022,2,-1.28,125
1,202203,F22Q10000012,0,1,179,2.625,48,,768,202203,...,57,2.625,SF,12500,180,202203,2022,3,-1.76,125
2,202204,F22Q10000012,0,2,178,2.625,52,,768,202203,...,57,2.625,SF,12500,180,202204,2022,4,-1.88,125
3,202205,F22Q10000012,0,3,177,2.625,40,,768,202203,...,57,2.625,SF,12500,180,202205,2022,5,-2.07,125
4,202206,F22Q10000012,0,4,176,2.625,39,,768,202203,...,57,2.625,SF,12500,180,202206,2022,6,-2.1,125


In [54]:
#get vector of unique 3ZIP codes from hrcn_data
hrcn_3zip = hrcn_data['3ZIP'].unique()

#subset fm_test by 3ZIP codes in hrcn_3zip
fm_test = fm_test[fm_test['3ZIP'].isin(hrcn_3zip)]
fm_test

Unnamed: 0,MRP,LSN,CLDS,AGE,MONTS_REM,CIR,ELTV,DDD,CS,FPD,...,LTV,OIR,P_TYPE,POSTAL,OLT,Date,Year,Month,MEI,3ZIP
0,202202,F22Q10000012,0,0,180,2.625,57,,768,202203,...,57,2.625,SF,12500,180,202202,2022,2,-1.28,125
1,202203,F22Q10000012,0,1,179,2.625,48,,768,202203,...,57,2.625,SF,12500,180,202203,2022,3,-1.76,125
2,202204,F22Q10000012,0,2,178,2.625,52,,768,202203,...,57,2.625,SF,12500,180,202204,2022,4,-1.88,125
3,202205,F22Q10000012,0,3,177,2.625,40,,768,202203,...,57,2.625,SF,12500,180,202205,2022,5,-2.07,125
4,202206,F22Q10000012,0,4,176,2.625,39,,768,202203,...,57,2.625,SF,12500,180,202206,2022,6,-2.10,125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5737736,202211,F18Q40289813,0,25,321,2.625,60,,756,202011,...,76,2.625,SF,52000,346,202211,2022,11,-1.25,520
5737737,202212,F18Q40289813,0,26,320,2.625,60,,756,202011,...,76,2.625,SF,52000,346,202212,2022,12,-1.06,520
5737738,202301,F18Q40289813,0,27,319,2.625,55,,756,202011,...,76,2.625,SF,52000,346,202301,2023,1,-0.81,520
5737739,202302,F18Q40289813,0,28,318,2.625,54,,756,202011,...,76,2.625,SF,52000,346,202302,2023,2,-0.67,520


In [67]:
#Aggregate fm_test by Date and 3ZIP

#count 1's and 0's in CLDS
fm_test['CLDS'] = pd.to_numeric(fm_test['CLDS'], errors='coerce')

fm_test_agg = fm_test.groupby(['3ZIP','Date']).agg({'CLDS':'sum','ELTV':'mean', 'LTV': 'mean', 'MEI': 'mean'}).reset_index()
fm_test_agg['Date'] = pd.to_datetime(fm_test_agg['Date'], format='%Y%m')


In [68]:
fm_test_agg


Unnamed: 0,3ZIP,Date,CLDS,ELTV,LTV,MEI
0,100,2018-02-01,0.0,999.000000,71.250000,-0.92
1,100,2018-03-01,0.0,418.125000,73.375000,-1.35
2,100,2018-04-01,0.0,205.642857,77.785714,-0.89
3,100,2018-05-01,0.0,244.500000,75.000000,-0.61
4,100,2018-06-01,0.0,275.333333,75.388889,-0.34
...,...,...,...,...,...,...
30015,922,2022-11-01,10.0,119.625541,67.651515,-1.25
30016,922,2022-12-01,13.0,123.191579,67.631579,-1.06
30017,922,2023-01-01,12.0,94.654167,67.685417,-0.81
30018,922,2023-02-01,16.0,97.408333,67.685417,-0.67


In [69]:
fm_test_agg = fm_test_agg.set_index(['3ZIP', 'Date'])

In [70]:
mod = PanelOLS(dependent=fm_test_agg['CLDS'], exog=fm_test_agg[['ELTV', 'LTV', 'MEI']])
res = mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:                   CLDS   R-squared:                        0.1866
Estimator:                   PanelOLS   R-squared (Between):              0.2640
No. Observations:               30020   R-squared (Within):               0.1058
Date:                Mon, Oct 09 2023   R-squared (Overall):              0.1866
Time:                        22:13:14   Log-likelihood                -1.398e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      2295.1
Entities:                         493   P-value                           0.0000
Avg Obs:                       60.892   Distribution:                 F(3,30017)
Min Obs:                       14.000                                           
Max Obs:                       62.000   F-statistic (robust):             2295.1
                            

In [71]:
mod = RandomEffects(dependent=fm_test_agg['CLDS'], exog=fm_test_agg[['ELTV', 'LTV', 'MEI']])
res = mod.fit()
print(res)


                        RandomEffects Estimation Summary                        
Dep. Variable:                   CLDS   R-squared:                        0.1203
Estimator:              RandomEffects   R-squared (Between):              0.2278
No. Observations:               30020   R-squared (Within):               0.1180
Date:                Mon, Oct 09 2023   R-squared (Overall):              0.1743
Time:                        22:13:23   Log-likelihood                -1.311e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      1367.9
Entities:                         493   P-value                           0.0000
Avg Obs:                       60.892   Distribution:                 F(3,30017)
Min Obs:                       14.000                                           
Max Obs:                       62.000   F-statistic (robust):             1367.9
                            

In [72]:
mod = PooledOLS(dependent=fm_test_agg['CLDS'], exog=fm_test_agg[['ELTV', 'LTV', 'MEI']])
res = mod.fit()
print(res)


                          PooledOLS Estimation Summary                          
Dep. Variable:                   CLDS   R-squared:                        0.1866
Estimator:                  PooledOLS   R-squared (Between):              0.2640
No. Observations:               30020   R-squared (Within):               0.1058
Date:                Mon, Oct 09 2023   R-squared (Overall):              0.1866
Time:                        22:13:27   Log-likelihood                -1.398e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      2295.1
Entities:                         493   P-value                           0.0000
Avg Obs:                       60.892   Distribution:                 F(3,30017)
Min Obs:                       14.000                                           
Max Obs:                       62.000   F-statistic (robust):             2295.1
                            

In [51]:
mod = FirstDifferenceOLS(dependent=fm_test_agg['CLDS'], exog=fm_test_agg[['ELTV', 'LTV', 'MEI']])
res = mod.fit()
print(res)


                     FirstDifferenceOLS Estimation Summary                      
Dep. Variable:                   CLDS   R-squared:                        0.0017
Estimator:         FirstDifferenceOLS   R-squared (Between):             -0.1325
No. Observations:               11767   R-squared (Within):               0.0081
Date:                Mon, Oct 09 2023   R-squared (Overall):             -0.0559
Time:                        22:04:38   Log-likelihood                -1.878e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      6.8660
Entities:                         490   P-value                           0.0001
Avg Obs:                       25.014   Distribution:                 F(3,11764)
Min Obs:                       3.0000                                           
Max Obs:                       26.000   F-statistic (robust):             6.8660
                            