# Probability of Default Model

Here we will assess the dynamic hurricane risk index and its effect it has on the probability of default. The goal is to measure the effect of the DHRI on the probability of default. Moreover, we will compare this model to a model using actual HRCN frequency data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import linearmodels as lm
import statsmodels.api as sm
from linearmodels import PanelOLS
from linearmodels import RandomEffects
from linearmodels import PooledOLS
from linearmodels import FirstDifferenceOLS
from linearmodels import BetweenOLS
from linearmodels import FamaMacBeth
import sqlite3

Set Parameters:

In [71]:
db_path = "../Database/thesis_database.db"
conn = sqlite3.connect(db_path)

In [19]:
years = range(1999,2023)
union_query = [f"SELECT * FROM fm_{year}" for year in years]
combined_query = " UNION ALL ".join(union_query)
#query that binds fm_2022 and fm_2021
query = f"""
SELECT 
    combined_fm."3ZIP",
    combined_fm."Date",
    COUNT(DISTINCT LSN) AS UNQ_LSN,
    AVG(CIR) AS AVG_CIR,
    AVG(CLTV) AS AVG_CLTV,
    AVG(OIR) AS AVG_OIR,
    AVG(LTV) AS AVG_LTV,
    AVG(MEI) AS MEI,
    AVG(DTI) AS AVG_DTI,
    SUM(CASE WHEN CLDS >= 3 THEN 1 ELSE 0 END) AS COUNT_D90,
    SUM(CASE WHEN CLDS >= 7 THEN 1 ELSE 0 END) AS COUNT_D180,
    AVG(HRCN_RISKS) AS HRCN_RISKS, 
    AVG(HRCN_RISKV) AS HRCN_RISKV,
    AVG(HRCN_EVNTS) AS HRCN_EVNTS,
    AVG(HRCN_EALS) AS HRCN_EALS
FROM ({combined_query}) AS combined_fm 
LEFT JOIN enso_mei_long
 ON combined_fm.Date = enso_mei_long.Date
INNER JOIN hrcn_data_short
ON combined_fm."3ZIP" = hrcn_data_short."3ZIP"
GROUP BY combined_fm."3ZIP", combined_fm."DATE";
"""

fm_test_2 = pd.read_sql_query(query, conn)
##Takes around 36 mins to run
fm_test_2

Unnamed: 0,3ZIP,Date,UNQ_LSN,AVG_CIR,AVG_CLTV,AVG_OIR,AVG_LTV,MEI,AVG_DTI,COUNT_D90,COUNT_D180,HRCN_RISKS,HRCN_RISKV,HRCN_EVNTS,HRCN_EALS
0,100,199902,3,6.833333,72.000000,6.833333,72.000000,-1.04,26.333333,0,0,96.680126,7.006102e+07,15.0,96.541288
1,100,199903,9,6.750000,71.555556,6.750000,71.555556,-1.20,27.000000,0,0,96.680126,7.006102e+07,15.0,96.541288
2,100,199904,19,6.861842,71.315789,6.861842,71.315789,-1.42,28.315789,0,0,96.680126,7.006102e+07,15.0,96.541288
3,100,199905,31,6.895161,67.387097,6.895161,67.387097,-1.26,30.677419,0,0,96.680126,7.006102e+07,15.0,96.541288
4,100,199906,34,6.926471,66.088235,6.926471,66.088235,-1.21,30.470588,0,0,96.680126,7.006102e+07,15.0,96.541288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142135,922,202211,751,3.958254,70.976032,3.968907,70.388815,-1.25,111.286285,2,0,21.175415,2.655263e+04,1.5,18.244704
142136,922,202212,762,4.001819,70.892388,4.012318,70.313648,-1.06,108.930446,2,0,21.175415,2.655263e+04,1.5,18.244704
142137,922,202301,764,4.016361,70.854712,4.026832,70.277487,-0.81,107.502618,2,0,21.175415,2.655263e+04,1.5,18.244704
142138,922,202302,764,4.016361,70.854712,4.026832,70.277487,-0.67,107.502618,8,0,21.175415,2.655263e+04,1.5,18.244704


In [109]:
query = f"""
SELECT *
FROM fm_2008"""
fm_2008 = pd.read_sql_query(query, conn)


In [110]:

fm_2008['D90'] = 0
fm_2008['D180'] = 0
#Set Date to %Y%m
fm_2008['Date'] = pd.to_datetime(fm_2008['Date'], format='%Y%m')
# Process each loan
for LSN, group in fm_2008.groupby('LSN'):
    
    # If a '3' is encountered in CLDS
    if 3 in group['CLDS'].values:
        # Find the date when 3 first appears
        delinquency_date = group[group['CLDS'] == 3]['Date'].min()
        
        # Trace back 3 months
        back_date = delinquency_date - pd.DateOffset(months=3)
        
        # Set D90+ for that earlier date
        fm_2008.loc[(fm_2008['LSN'] == LSN) & (fm_2008['Date'] == back_date), 'D90'] = 1
    
    # If a '3' is encountered in CLDS
    if 7 in group['CLDS'].values:
        # Find the date when 3 first appears
        delinquency_date = group[group['CLDS'] == 7]['Date'].min()
        
        # Trace back 3 months
        back_date = delinquency_date - pd.DateOffset(months=6)
        
        # Set D90+ for that earlier date
        fm_2008.loc[(fm_2008['LSN'] == LSN) & (fm_2008['Date'] == back_date), 'D180'] = 1
    print(LSN)

fm_2008

F08Q10000071
F08Q10000079
F08Q10000086
F08Q10000220
F08Q10000233
F08Q10000307
F08Q10000318
F08Q10000392
F08Q10000395
F08Q10000402
F08Q10000410
F08Q10000536
F08Q10000646
F08Q10000706
F08Q10000748
F08Q10000794
F08Q10000818
F08Q10000843
F08Q10000876
F08Q10000900
F08Q10000918
F08Q10000973
F08Q10000985
F08Q10000989
F08Q10001008
F08Q10001032
F08Q10001061
F08Q10001215
F08Q10001216
F08Q10001245
F08Q10001263
F08Q10001451
F08Q10001452
F08Q10001474
F08Q10001517
F08Q10001525
F08Q10001549
F08Q10001568
F08Q10001707
F08Q10001722
F08Q10001742
F08Q10001756
F08Q10001768
F08Q10001776
F08Q10001813
F08Q10001860
F08Q10001891
F08Q10001896
F08Q10001903
F08Q10001904
F08Q10001907
F08Q10001919
F08Q10001944
F08Q10001965
F08Q10002084
F08Q10002145
F08Q10002159
F08Q10002274
F08Q10002485
F08Q10002515
F08Q10002581
F08Q10002607
F08Q10002713
F08Q10002747
F08Q10002831
F08Q10002878
F08Q10002925
F08Q10002936
F08Q10002979
F08Q10003094
F08Q10003265
F08Q10003291
F08Q10003298
F08Q10003320
F08Q10003375
F08Q10003415
F08Q10003487

Unnamed: 0,Date,3ZIP,LSN,CLDS,CIR,ELTV,DDD,CS,FPD,FIRST_F,MD,CLTV,DTI,LTV,OIR,P_TYPE,OLT,D90,D180
0,2008-02-01,473,F08Q10000071,0,6.750,,0,746,200803,1,203802,93,42,93,6.750,SF,360,0,0
1,2008-03-01,473,F08Q10000071,0,6.750,,0,746,200803,1,203802,93,42,93,6.750,SF,360,0,0
2,2008-04-01,473,F08Q10000071,0,6.750,,0,746,200803,1,203802,93,42,93,6.750,SF,360,0,0
3,2008-05-01,473,F08Q10000071,0,6.750,,0,746,200803,1,203802,93,42,93,6.750,SF,360,0,0
4,2008-06-01,473,F08Q10000071,0,6.750,,0,746,200803,1,203802,93,42,93,6.750,SF,360,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2414319,2022-11-01,531,F08Q40225716,0,3.745,29.0,0,755,201203,0,203811,50,31,50,3.745,CO,321,0,0
2414320,2022-12-01,531,F08Q40225716,0,3.745,27.0,0,755,201203,0,203811,50,31,50,3.745,CO,321,0,0
2414321,2023-01-01,531,F08Q40225716,0,3.745,25.0,0,755,201203,0,203811,50,31,50,3.745,CO,321,0,0
2414322,2023-02-01,531,F08Q40225716,0,3.745,25.0,0,755,201203,0,203811,50,31,50,3.745,CO,321,0,0


In [112]:
display(fm_2008['CLDS'].value_counts())
display(fm_2008['LSN'].nunique())
180/50000

CLDS
0      2251854
1        55323
2        19613
RA       10476
3         9763
        ...   
146          1
145          1
144          1
143          1
169          1
Name: count, Length: 171, dtype: int64

49992

0.0036

In [106]:
fm_2022['D180'].value_counts()

D180
0    419402
1        38
Name: count, dtype: int64

In [89]:
# Compute the value counts for each LSN and CLDS value
value_counts = fm_2022['CLDS'].groupby(fm_2022['LSN']).value_counts()

# Filter only those LSNs that have a count for CLDS = 3
filtered_loans = value_counts.loc[:, 3]

#show progress of loan F22Q10005371
fm_2022[fm_2022['LSN'] == 'F22Q10005371']


Unnamed: 0,Date,3ZIP,LSN,CLDS,CIR,ELTV,DDD,CS,FPD,FIRST_F,MD,CLTV,DTI,LTV,OIR,P_TYPE,OLT,D90
1627.0,202202,983.0,F22Q10005371,0.0,2.99,58.0,0.0,629.0,202203.0,0.0,205202.0,57.0,35.0,57.0,2.99,SF,360.0,0
1628.0,202203,983.0,F22Q10005371,0.0,2.99,54.0,0.0,629.0,202203.0,0.0,205202.0,57.0,35.0,57.0,2.99,SF,360.0,0
1629.0,202204,983.0,F22Q10005371,0.0,2.99,50.0,0.0,629.0,202203.0,0.0,205202.0,57.0,35.0,57.0,2.99,SF,360.0,0
1630.0,202205,983.0,F22Q10005371,0.0,2.99,48.0,0.0,629.0,202203.0,0.0,205202.0,57.0,35.0,57.0,2.99,SF,360.0,0
1631.0,202206,983.0,F22Q10005371,0.0,2.99,49.0,0.0,629.0,202203.0,0.0,205202.0,57.0,35.0,57.0,2.99,SF,360.0,0
1632.0,202207,983.0,F22Q10005371,0.0,2.99,54.0,0.0,629.0,202203.0,0.0,205202.0,57.0,35.0,57.0,2.99,SF,360.0,0
1633.0,202208,983.0,F22Q10005371,0.0,2.99,57.0,0.0,629.0,202203.0,0.0,205202.0,57.0,35.0,57.0,2.99,SF,360.0,0
1634.0,202209,983.0,F22Q10005371,0.0,2.99,61.0,0.0,629.0,202203.0,0.0,205202.0,57.0,35.0,57.0,2.99,SF,360.0,0
1635.0,202210,983.0,F22Q10005371,0.0,2.99,63.0,0.0,629.0,202203.0,0.0,205202.0,57.0,35.0,57.0,2.99,SF,360.0,0
1636.0,202211,983.0,F22Q10005371,1.0,2.99,61.0,0.0,629.0,202203.0,0.0,205202.0,57.0,35.0,57.0,2.99,SF,360.0,0


In [20]:
#save fm_test_2 to csv
fm_test_2.to_csv("../Data/fm_agg.csv")

In [32]:
conn.close()

In [37]:
fm_test_2['3ZIP'].unique().sum()

217721

In [21]:
#convert Date to %Y%m
fm_test_2['Date'] = pd.to_datetime(fm_test_2['Date'], format='%Y%m')
fm_test_2['DHRI'] = fm_test_2['HRCN_EALS']*fm_test_2['MEI']
fm_test_agg = fm_test_2.set_index(['3ZIP', 'Date'])

### Using HDRI

In [70]:
fm_test_agg['COUNT_D90'] / fm_test_agg['UNQ_LSN']*100

3ZIP  Date      
100   1999-02-01    0.000000
      1999-03-01    0.000000
      1999-04-01    0.000000
      1999-05-01    0.000000
      1999-06-01    0.000000
                      ...   
922   2022-11-01    0.266312
      2022-12-01    0.262467
      2023-01-01    0.261780
      2023-02-01    1.047120
      2023-03-01    0.790514
Length: 142140, dtype: float64

In [69]:
mod = PanelOLS(dependent=fm_test_agg['COUNT_D90'] / fm_test_agg['UNQ_LSN'], exog=fm_test_agg[['DHRI', 'AVG_LTV', 'AVG_DTI']])
res = mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                      0   R-squared:                        0.2228
Estimator:                   PanelOLS   R-squared (Between):              0.5720
No. Observations:              142140   R-squared (Within):               0.0143
Date:                Tue, Oct 10 2023   R-squared (Overall):              0.2228
Time:                        16:04:14   Log-likelihood                 1.755e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   1.358e+04
Entities:                         493   P-value                           0.0000
Avg Obs:                       288.32   Distribution:                F(3,142137)
Min Obs:                       162.00                                           
Max Obs:                       290.00   F-statistic (robust):          1.358e+04
                            

### USING Hurr Freq

In [31]:
mod = PanelOLS(dependent=fm_test_agg['COUNT_D90'] / fm_test_agg['UNQ_LSN'], exog=fm_test_agg[['HRCN_EALS', 'HRCN_EVNTS' ,'AVG_LTV', 'AVG_DTI']])
res = mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:                      0   R-squared:                        0.2283
Estimator:                   PanelOLS   R-squared (Between):              0.5887
No. Observations:              142140   R-squared (Within):               0.0129
Date:                Tue, Oct 10 2023   R-squared (Overall):              0.2283
Time:                        15:36:36   Log-likelihood                  1.76e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   1.051e+04
Entities:                         493   P-value                           0.0000
Avg Obs:                       288.32   Distribution:                F(4,142136)
Min Obs:                       162.00                                           
Max Obs:                       290.00   F-statistic (robust):          1.051e+04
                            

In [65]:
mod = RandomEffects(dependent=fm_test_agg['COUNT_D90'] / fm_test_agg['UNQ_LSN'], exog=fm_test_agg[['AVG_OIR', 'AVG_CLTV', 'AVG_DTI','AVG_LTV', 'MEI']])
res = mod.fit()
print(res)


                        RandomEffects Estimation Summary                        
Dep. Variable:                      0   R-squared:                        0.0372
Estimator:              RandomEffects   R-squared (Between):              0.4992
No. Observations:              142140   R-squared (Within):               0.0335
Date:                Tue, Oct 10 2023   R-squared (Overall):              0.2068
Time:                        15:56:26   Log-likelihood                 1.929e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      1097.4
Entities:                         493   P-value                           0.0000
Avg Obs:                       288.32   Distribution:                F(5,142135)
Min Obs:                       162.00                                           
Max Obs:                       290.00   F-statistic (robust):             1097.4
                            

In [17]:
mod = PooledOLS(dependent=fm_test_agg['COUNT_D90'] / fm_test_agg['UNQ_LSN'], exog=fm_test_agg[['AVG_OIR', 'AVG_CLTV', 'AVG_DTI','AVG_LTV', 'MEI']])
res = mod.fit()
print(res)


                          PooledOLS Estimation Summary                          
Dep. Variable:               AVG_CLDS   R-squared:                        0.3067
Estimator:                  PooledOLS   R-squared (Between):              0.6402
No. Observations:              142140   R-squared (Within):               0.1015
Date:                Tue, Oct 10 2023   R-squared (Overall):              0.3067
Time:                        13:57:05   Log-likelihood                    2412.3
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   1.258e+04
Entities:                         493   P-value                           0.0000
Avg Obs:                       288.32   Distribution:                F(5,142135)
Min Obs:                       162.00                                           
Max Obs:                       290.00   F-statistic (robust):          1.258e+04
                            

In [18]:
mod = FirstDifferenceOLS(dependent=fm_test_agg['COUNT_D90'] / fm_test_agg['UNQ_LSN'], exog=fm_test_agg[['AVG_OIR', 'AVG_CLTV', 'AVG_DTI','AVG_LTV', 'MEI']])
print(res)


                          PooledOLS Estimation Summary                          
Dep. Variable:               AVG_CLDS   R-squared:                        0.3067
Estimator:                  PooledOLS   R-squared (Between):              0.6402
No. Observations:              142140   R-squared (Within):               0.1015
Date:                Tue, Oct 10 2023   R-squared (Overall):              0.3067
Time:                        13:57:05   Log-likelihood                    2412.3
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   1.258e+04
Entities:                         493   P-value                           0.0000
Avg Obs:                       288.32   Distribution:                F(5,142135)
Min Obs:                       162.00                                           
Max Obs:                       290.00   F-statistic (robust):          1.258e+04
                            