In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import linearmodels as lm
import statsmodels.api as sm
from linearmodels import PanelOLS
from linearmodels import RandomEffects
from linearmodels import PooledOLS
from linearmodels import FirstDifferenceOLS
from linearmodels import BetweenOLS
from linearmodels import FamaMacBeth
import sqlite3
from tqdm import tqdm

In [5]:
# Path to the project's SQLite database, relative to the notebook's working directory.
db_path = "../Database/thesis_database.db"
# Shared connection used by every pd.read_sql / pd.read_sql_query call below.
# NOTE(review): no conn.close() appears in the visible cells.
conn = sqlite3.connect(db_path)

In [6]:
# Load the complete hrcn_risk_agg table into a DataFrame.
# It is merged with the yearly mortgage tables on the MSA column in a later cell.
query = """
SELECT *
FROM hrcn_risk_agg
"""
hrcn_risk_agg = pd.read_sql(query, conn)

In [15]:
# Per-year merged frames, keyed by the source table name ("fm_1999" ... "fm_2022").
mortgage_hrcn = {}
# Merge each yearly mortgage table with the hurricane-risk table on MSA.
# The inner join keeps only rows whose MSA appears in both frames (per the
# original note, this is meant to drop data for states that are not needed).
# Each year's table is read fully into memory before merging, and every merged
# frame is retained in the dict.
for year in tqdm(range(1999, 2023), desc = "merging hrcn and mortgage data"):
    query = f"""
    SELECT *
    FROM fm_{year}
    """
    df_year = pd.read_sql(query, conn) 
    merged_df = df_year.merge(hrcn_risk_agg, on='MSA', how='inner')
    key_name = f"fm_{year}"
    mortgage_hrcn[key_name] = merged_df

merging hrcn and mortgage data:  38%|███▊      | 9/24 [02:04<04:05, 16.39s/it]

In [2]:
# Build one "SELECT * FROM fm_<year>" per year (1999-2022) and chain them with
# UNION ALL into a single statement.
years = range(1999,2023)
union_query = [f"SELECT * FROM fm_{year}" for year in years]
combined_query = " UNION ALL ".join(union_query)
# NOTE(review): the query text below is only a "####" placeholder and does not
# interpolate `combined_query`, so as written it does not look like executable
# SQL. The original statement needs to be restored before this cell can run.
query = f"""

####
####

"""

# NOTE(review): the recorded output of this cell is a NameError for `pd`, i.e.
# it was executed before the imports cell; run the imports first.
fm_combined = pd.read_sql_query(query, conn)

NameError: name 'pd' is not defined

In [40]:
# Spot check: pull every fm_combined row whose "3ZIP" equals 331.
# NOTE(review): the original comment mentioned 2020-06-01, but the query has no
# date filter - rows for all dates are returned.
query = """
SELECT * 
FROM fm_combined
WHERE "3ZIP" = 331;
"""
fm_agg_check = pd.read_sql_query(query, conn)

In [15]:
# D90 = CLDS90_COUNT divided by UNQ_LSN, row by row
# (presumably the share of unique loans that are 90 days delinquent - confirm).
# NOTE(review): `fm_agg` is not created in any visible cell; it must already
# exist in the kernel for this line to run.
fm_agg['D90'] = fm_agg['CLDS90_COUNT'] / fm_agg['UNQ_LSN']

In [17]:
# Draw 10 distinct 3ZIP values without replacement (seed fixed at 42 so the
# draw is repeatable) and keep only the rows belonging to those ZIPs.
# NOTE(review): the original comment said 1000 ZIP codes, but the code samples 10 -
# confirm the intended sample size.
np.random.seed(42)
random_zip = np.random.choice(fm_agg['3ZIP'].unique(), 10, replace=False)
fm_agg_rand = fm_agg[fm_agg['3ZIP'].isin(random_zip)]

In [18]:
# Write the random-ZIP subset to CSV without the DataFrame index.
# An existing file at this path is overwritten.
fm_agg_rand.to_csv('../Data/fm_agg_rand.csv', index=False)

In [4]:
# NOTE(review): `years`, `union_query` and `combined_query` are built here but
# the query text below never references them; it reads from the fm_combined table.
years = range(1999,2023)
union_query = [f"SELECT * FROM fm_{year}" for year in years]
combined_query = " UNION ALL ".join(union_query)
# Loan-level aggregation: one output row per LSN.
#   - AVG_* columns: mean of MEI, CIR, CLTV, OIR, LTV and DTI over the loan's rows.
#   - CLDS90:  1 if any row of the loan has CLDS = 3, otherwise 0.
#   - CLDS180: 1 if any row of the loan has CLDS = 6, otherwise 0.
# fm_combined is LEFT JOINed to enso_mei on Date, so loan rows without a
# matching enso_mei date are kept.
query = f"""
SELECT
    fm_combined.LSN,
    AVG(MEI) AS AVG_MEI,
    AVG(CIR) AS AVG_CIR,
    AVG(CLTV) AS AVG_CLTV,
    AVG(OIR) AS AVG_OIR,
    AVG(LTV) AS AVG_LTV,
    AVG(DTI) AS AVG_DTI,
    CASE 
        WHEN SUM(CASE WHEN CLDS = 3 THEN 1 ELSE 0 END) > 0 THEN 1
        ELSE 0
    END AS CLDS90,
    CASE 
        WHEN SUM(CASE WHEN CLDS = 6 THEN 1 ELSE 0 END) > 0 THEN 1
        ELSE 0
    END AS CLDS180
FROM fm_combined
LEFT JOIN enso_mei
ON fm_combined.Date = enso_mei.Date
GROUP BY fm_combined.LSN;
"""

fm_agg_loan = pd.read_sql_query(query, conn)
##Takes around 36 mins to run

In [None]:
# NOTE(review): this query is identical to the one in the preceding cell and
# assigns the same `fm_agg_loan` variable, so a full "Run All" executes the
# long-running aggregation twice.
# Loan-level aggregation: one output row per LSN with the mean of MEI, CIR,
# CLTV, OIR, LTV and DTI, plus 0/1 flags CLDS90 (any row with CLDS = 3) and
# CLDS180 (any row with CLDS = 6). fm_combined is LEFT JOINed to enso_mei on Date.
query = f"""
SELECT
    fm_combined.LSN,
    AVG(MEI) AS AVG_MEI,
    AVG(CIR) AS AVG_CIR,
    AVG(CLTV) AS AVG_CLTV,
    AVG(OIR) AS AVG_OIR,
    AVG(LTV) AS AVG_LTV,
    AVG(DTI) AS AVG_DTI,
    CASE 
        WHEN SUM(CASE WHEN CLDS = 3 THEN 1 ELSE 0 END) > 0 THEN 1
        ELSE 0
    END AS CLDS90,
    CASE 
        WHEN SUM(CASE WHEN CLDS = 6 THEN 1 ELSE 0 END) > 0 THEN 1
        ELSE 0
    END AS CLDS180
FROM fm_combined
LEFT JOIN enso_mei
ON fm_combined.Date = enso_mei.Date
GROUP BY fm_combined.LSN;
"""

fm_agg_loan = pd.read_sql_query(query, conn)
##Takes around 36 mins to run

In [85]:
# Share of loans flagged 90 / 180 days delinquent.
# NOTE: the mean of a 0/1 column is a fraction between 0 and 1, not a percent,
# even though the printed label says "Percentage" (multiply by 100 to read it as %).
print("Percentage of loans that are 90 days delinquent: ", fm_agg_loan["CLDS90"].mean())
print("Percentage of loans that are 180 days delinquent: ", fm_agg_loan["CLDS180"].mean())


Percentage of loans that are 90 days delinquent:  0.05044860052968697
Percentage of loans that are 180 days delinquent:  0.03491133826796694


In [86]:
# Load the complete enso_mei table into a DataFrame.
query = """
SELECT
    *
FROM enso_mei;
"""
enso_mei = pd.read_sql_query(query, conn)


In [87]:
# Load the complete hrcn_data_short table into a DataFrame.
query = """
SELECT
    *
FROM hrcn_data_short;
"""
hrcn_data_short = pd.read_sql_query(query, conn)

In [88]:
# Inner join fm_agg with hrcn_data_short on 3ZIP; ZIPs missing from either
# frame are dropped. (The original comment referred to "fm_loan_level", but the
# frame actually merged here is `fm_agg`.)
# NOTE(review): `fm_agg` is not created in any visible cell; it must already
# exist in the kernel.
fm_agg_hrcn = fm_agg.merge(hrcn_data_short, how='inner', on='3ZIP')
# Parse Date into datetime64 so it can be used as a datetime join key and panel time index.
fm_agg_hrcn['Date'] = pd.to_datetime(fm_agg_hrcn['Date'])

In [89]:
# D90 = CLDS90_COUNT divided by UNQ_LSN, row by row
# (presumably the share of unique loans that are 90 days delinquent - confirm).
fm_agg_hrcn['D90'] = fm_agg_hrcn['CLDS90_COUNT'] / fm_agg_hrcn['UNQ_LSN']

In [90]:
# Unweighted mean of D90 across all rows of the merged frame (a ratio, not a percent).
fm_agg_hrcn['D90'].mean()

0.0017805038504697982

In [102]:
# Load UNRATE.csv (semicolon-separated; presumably the unemployment-rate series)
# and attach its UNRATE column to fm_agg_hrcn by Date.
unrate = pd.read_csv('../Data/UNRATE.csv', sep=';')
# The raw DATE column is day/month/year text (e.g. 01/01/1948); parse it into a
# datetime `Date` column that matches the join key in fm_agg_hrcn.
unrate['Date'] = pd.to_datetime(unrate['DATE'], format='%d/%m/%Y')
# Keep only the join key and the value column so nothing else leaks into the merge.
unrate = unrate[['Date', 'UNRATE']]
# Make the cell idempotent: if it was already run, fm_agg_hrcn already has an
# UNRATE column and a second merge would produce UNRATE_x / UNRATE_y, breaking
# every later reference to 'UNRATE'. Drop any existing copy before merging.
# The left join keeps all fm_agg_hrcn rows; dates absent from the CSV get NaN.
fm_agg_hrcn = (
    fm_agg_hrcn
    .drop(columns='UNRATE', errors='ignore')
    .merge(unrate, how='left', on='Date')
)


In [96]:
from linearmodels.panel import PanelOLS, RandomEffects

# Convert the dataset into a panel structure: a two-level (entity, time) index,
# here (3ZIP, Date). set_index returns a new frame, so fm_agg_hrcn is not modified.
fm_agg_model = fm_agg_hrcn.set_index(['3ZIP', 'Date'])

# Dependent variable: D90 multiplied by 100.
dependent_var = fm_agg_model['D90']*100
# DHRI: AVG_MEI multiplied by the z-score of HRCN_EALS (mean and standard
# deviation taken over all rows of the panel).
fm_agg_model['DHRI'] = fm_agg_model['AVG_MEI'] * ((fm_agg_model['HRCN_EALS']- fm_agg_model['HRCN_EALS'].mean()) / fm_agg_model['HRCN_EALS'].std())
# Regressors. An earlier three-variable list (without AVG_OIR) was immediately
# overwritten by this one, so that dead assignment has been removed.
# Note that no constant term is included in the regressor set.
exog_vars = ['DHRI', 'AVG_LTV', 'UNRATE', 'AVG_OIR']
exog = fm_agg_model[exog_vars]

# Run a fixed effects regression (entity effects, i.e. one effect per 3ZIP).
mod_fe = PanelOLS(dependent_var, exog, entity_effects=True)
fe_res = mod_fe.fit()

# Run a random effects regression on the same variables.
mod_re = RandomEffects(dependent_var, exog)
re_res = mod_re.fit()

# Only the random-effects result is displayed; fe_res is computed but not shown here.
re_res




0,1,2,3
Dep. Variable:,D90,R-squared:,0.0266
Estimator:,RandomEffects,R-squared (Between):,0.7726
No. Observations:,616218,R-squared (Within):,0.0236
Date:,"Wed, Oct 25 2023",R-squared (Overall):,0.1019
Time:,16:16:10,Log-likelihood,-5.465e+05
Cov. Estimator:,Unadjusted,,
,,F-statistic:,4206.8
Entities:,493,P-value,0.0000
Avg Obs:,1249.9,Distribution:,"F(4,616214)"
Min Obs:,206.00,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
DHRI,-0.0040,0.0009,-4.6251,0.0000,-0.0056,-0.0023
AVG_LTV,0.0011,7.343e-05,14.626,0.0000,0.0009,0.0012
UNRATE,0.0446,0.0004,117.92,0.0000,0.0438,0.0453
AVG_OIR,-0.0266,0.0006,-41.587,0.0000,-0.0278,-0.0253


In [103]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Work on a copy so the columns added below do not modify fm_agg_hrcn.
fm_agg_model = fm_agg_hrcn.copy()
# Prepare the dataset
# D90_scaled: D90 multiplied by 100.
fm_agg_model['D90_scaled'] = fm_agg_model['D90'] * 100
# DHRI: AVG_MEI multiplied by the z-score of HRCN_EALS (mean and standard
# deviation taken over all rows).
fm_agg_model['DHRI'] = fm_agg_model['AVG_MEI'] * ((fm_agg_model['HRCN_EALS'] - fm_agg_model['HRCN_EALS'].mean()) / fm_agg_model['HRCN_EALS'].std())

# Create a formula for the mixed model with fixed effects for Date
# (C(Date) treats Date as categorical).
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt.
# Presumably C(Date) expands into one dummy column per distinct date, which
# makes the design matrix very large for this data set - confirm, and consider
# a cheaper specification or a subsample before re-running.
formula = "D90_scaled ~ DHRI + AVG_LTV + UNRATE + AVG_OIR + C(Date)"

# Fit the mixed model with random intercepts for each 3ZIP
mixed_model = smf.mixedlm(formula, fm_agg_model, groups=fm_agg_model['3ZIP'])
mixed_result = mixed_model.fit()

print(mixed_result.summary())




KeyboardInterrupt: 