# Probability of Default Model

Here we will assess the dynamic hurricane risk index and its effect it has on the probability of default. The goal is to measure the effect of the DHRI on the probability of default. Moreover, we will compare this model to a model using actual HRCN frequency data.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import linearmodels as lm
import statsmodels.api as sm

Set Parameters:

In [2]:
import_data = True

## Import Datasets

In [6]:
#read sample_orig_2022.txt and sample_svcg_2022.txt
if import_data == True:
    origination_2022 = pd.read_csv('../Data/mortgage_data/sample_orig_2022.txt', sep="|", header=None)
    performance_2022 = pd.read_csv('../Data/mortgage_data/sample_svcg_2022.txt', sep="|", header=None)

  performance_2022 = pd.read_csv('../Data/mortgage_data/sample_svcg_2022.txt', sep="|", header=None)


In [7]:
#Import File Layout
if import_data == True:
    orig_layout = pd.read_excel('../Data/mortgage_data/file_layout.xlsx', sheet_name = 0)
    svcg_layout = pd.read_excel('../Data/mortgage_data/file_layout.xlsx', sheet_name = 1)

In [8]:
# Extracting column details for both origination and performance datasets
orig_columns = orig_layout[["COLUMN", "ATTRIBUTE NAME", "DATA TYPE & FORMAT"]]
svcg_columns = svcg_layout[["COLUMN", "ATTRIBUTE NAME", "DATA TYPE & FORMAT"]]

orig_columns.head(), svcg_columns.head()

(   COLUMN                                     ATTRIBUTE NAME   
 0       1                                       Credit Score  \
 1       2                                 First Payment Date   
 2       3                          First Time Homebuyer Flag   
 3       4                                      Maturity Date   
 4       5  Metropolitan Statistical Area (MSA) Or Metropo...   
 
   DATA TYPE & FORMAT  
 0            Numeric  
 1               Date  
 2              Alpha  
 3               Date  
 4            Numeric  ,
    COLUMN                   ATTRIBUTE NAME            DATA TYPE & FORMAT
 0       1             Loan Sequence Number  Alpha Numeric - PYYQnXXXXXXX
 1       2         Monthly Reporting Period                          Date
 2       3               Current Actual UPB                Numeric - 12,2
 3       4  Current Loan Delinquency Status                 Alpha Numeric
 4       5                         Loan Age                      Numeric )

In [11]:
orig_column_names = orig_layout['ATTRIBUTE NAME'].tolist()
orig_data_types = orig_layout['DATA TYPE & FORMAT'].tolist()
svcg_column_names = svcg_layout['ATTRIBUTE NAME'].tolist()
svcg_data_types = svcg_layout['DATA TYPE & FORMAT'].tolist()

In [None]:
#2022
#Renaming the columns of the datasets based on ATTRIBUTE NAME col
origination_2022.columns = orig_column_names
performance_2022.columns = svcg_column_names

In [13]:
#Convert "Numeric" and "Date" columns to numeric and date formats
orig_numeric_cols = orig_layout[orig_layout['DATA TYPE & FORMAT'] == 'Numeric']['ATTRIBUTE NAME'].tolist()
orig_date_cols = orig_layout[orig_layout['DATA TYPE & FORMAT'] == 'Date']['ATTRIBUTE NAME'].tolist()
origination_2022[orig_numeric_cols] = origination_2022[orig_numeric_cols].apply(pd.to_numeric, errors='coerce')
origination_2022[orig_date_cols] = origination_2022[orig_date_cols].apply(pd.to_datetime, errors='coerce', format = "%Y%m")

perf_numeric_cols = svcg_layout[svcg_layout['DATA TYPE & FORMAT'] == 'Numeric']['ATTRIBUTE NAME'].tolist()
perf_date_cols = svcg_layout[svcg_layout['DATA TYPE & FORMAT'] == 'Date']['ATTRIBUTE NAME'].tolist()
performance_2022[perf_numeric_cols] = performance_2022[perf_numeric_cols].apply(pd.to_numeric, errors='coerce')
performance_2022[perf_date_cols] = performance_2022[perf_date_cols].apply(pd.to_datetime, errors='coerce', format = "%Y%m")
#Save datasets
origination_2022.to_csv('../Data/mortgage_data/origination_2022.csv')
performance_2022.to_csv('../Data/mortgage_data/performance_2022.csv')

In [None]:
#Remove unnecessary column from both datasets


In [16]:
#Import 3-zip mapping file and map to origination dataset.
# Load the datasets
orig_df = pd.read_csv("../Data/mortgage_data/origination_2022.csv")
zip_county_df = pd.read_excel("../Data/ZIP_COUNTY_062023.xlsx")

# Extract the first three digits from the origination dataset's Postal Code column
orig_df['ZIP3'] = orig_df['Postal Code'].astype(str).str[:3]
zip_county_df['ZIP3'] = zip_county_df['ZIP'].astype(str).str[:3]

# Merging based on ZIP3 and state columns
merged_df = pd.merge(orig_df, zip_county_df, left_on=['ZIP3', 'Property State'], right_on=['ZIP3', 'USPS_ZIP_PREF_STATE'], how='left')

# Identifying rows with multiple possible counties
multi_county_rows = merged_df.groupby(['ZIP3', 'Property State']).filter(lambda x: x['COUNTY'].nunique() > 1)

# Counting the number of such rows
num_multi_county_rows = multi_county_rows.shape[0]

# For those rows, selecting one county at random
merged_df['COUNTY'] = merged_df.groupby(['ZIP3', 'Property State'])['COUNTY'].transform(lambda x: x.sample(n=1).iloc[0] if x.nunique() > 1 else x)

print(f"Number of rows where a random county had to be selected: {num_multi_county_rows}")

Number of rows where a random county had to be selected: 3490257
