In [1]:
import pandas as pd
from datetime import datetime
import pickle
import sqlite3

In [2]:
# Load the file-layout workbook that documents the Freddie Mac data files.
# sheet_name=None makes read_excel return a dict mapping each sheet name to its DataFrame.
layout = pd.read_excel("../Data/mortgage_data/file_layout.xlsx", sheet_name=None)

## Column name extraction from Freddie Mac documentation

In [3]:
# sheet_names = layout.keys()
# Pick the two layout sheets: one describes the origination file, the other the monthly performance file
orig_layout = layout['Origination Data File']
perf_layout = layout['Monthly Performance Data File']

# Extract column names and data types (parallel lists, one entry per layout row)
orig_column_names = orig_layout['ATTRIBUTE NAME'].tolist()
orig_data_types = orig_layout['DATA TYPE & FORMAT'].tolist()
perf_column_names = perf_layout['ATTRIBUTE NAME'].tolist()
perf_data_types = perf_layout['DATA TYPE & FORMAT'].tolist()

# Per-column KEEP flags from the layout sheets; drop_cols() removes the columns flagged 0
cols_keep_perf = perf_layout['KEEP'].tolist()
cols_keep_orig = orig_layout['KEEP'].tolist()


## Load data per year

In [4]:
def drop_cols_and_NAN(data):
    """Return `data` without the columns whose values are all NaN.

    Only entirely-empty columns are removed; rows and every other column are kept.
    The input frame itself is not modified.
    """
    return data.dropna(axis="columns", how="all")

In [5]:
def drop_cols(data, cols_keep, col_names):
    """Return `data` without the columns whose keep-flag is 0.

    Parameters:
    - data: DataFrame to trim.
    - cols_keep: Sequence of flags; the flag at position i refers to col_names[i].
    - col_names: Column names, indexed in parallel with cols_keep.

    Returns:
    - A new DataFrame lacking every column whose flag equals 0.
    """
    flagged_for_removal = [
        col_names[position]
        for position, keep_flag in enumerate(cols_keep)
        if keep_flag == 0
    ]
    return data.drop(columns=flagged_for_removal)

In [6]:
def load_yearly_data(year, base_dir="../Data/mortgage_data"):
    """
    Load and format the origination and performance datasets for a given year, considering the folder structure.

    Relies on the module-level layout lists (orig_column_names, perf_column_names,
    cols_keep_orig, cols_keep_perf) and on drop_cols / drop_cols_and_NAN.

    Parameters:
    - year: The year for which to load the data.
    - base_dir: The base directory where the datasets are stored.

    Returns:
    - orig_data: Formatted origination dataset for the given year.
    - perf_data: Formatted performance dataset for the given year.

    Warns:
    - UserWarning when the KEEP-flag column drop fails for one of the two frames; that
      frame is then returned with all of its columns (best effort, as before).
    """
    import warnings

    # Only the first 22 origination columns are read and named from the layout
    n_orig_cols = 22

    # Construct file paths considering the "sample_YYYY" folder structure
    orig_file_path = f"{base_dir}/sample_{year}/sample_orig_{year}.txt"
    perf_file_path = f"{base_dir}/sample_{year}/sample_svcg_{year}.txt"

    # Load origination data (pipe-separated, no header row)
    orig_data = pd.read_csv(orig_file_path, sep="|", header=None, low_memory=False)
    orig_data = orig_data.iloc[:, 0:n_orig_cols]
    orig_data.columns = orig_column_names[0:n_orig_cols]

    # Load performance data, naming the columns straight from the layout
    perf_data = pd.read_csv(perf_file_path, sep="|", header=None, names=perf_column_names, low_memory=False)

    # Drop the columns flagged KEEP == 0. The two frames are handled independently so a
    # failure on one no longer silently skips the other, and failures are reported
    # instead of being swallowed by a bare except.
    try:
        orig_data = drop_cols(orig_data, cols_keep_orig[0:n_orig_cols], orig_column_names)
    except (KeyError, IndexError) as exc:
        warnings.warn(f"{year}: origination columns were not dropped ({exc!r})")
    try:
        perf_data = drop_cols(perf_data, cols_keep_perf, perf_column_names)
    except (KeyError, IndexError) as exc:
        warnings.warn(f"{year}: performance columns were not dropped ({exc!r})")

    # Finally remove columns that contain no data at all
    orig_data = drop_cols_and_NAN(orig_data)
    perf_data = drop_cols_and_NAN(perf_data)
    return orig_data, perf_data


## Load all data at once into dictionary

In [7]:
def load_all_datasets(start_year=1999, end_year=2022, base_dir="../Data/mortgage_data/"):
    """
    Read the origination and performance files of every year in a range into one dictionary.

    Parameters:
    - start_year: First year to load (inclusive).
    - end_year: Last year to load (inclusive).
    - base_dir: The base directory where the datasets are stored.

    Returns:
    - datasets: Dictionary keyed "orig_<year>" and "perf_<year>", holding the two frames
      returned by load_yearly_data for that year.
    """
    datasets = {}
    for year in range(start_year, end_year + 1):
        # Show the year being loaded as a simple progress indicator
        display(year)
        yearly_orig, yearly_perf = load_yearly_data(year, base_dir=base_dir)
        datasets.update({f"orig_{year}": yearly_orig, f"perf_{year}": yearly_perf})
    return datasets

# Load the origination and performance samples for every year from 1999 through 2022.
# These bounds equal the function defaults, so load_all_datasets() with no arguments loads the same range.
datasets_tot = load_all_datasets(start_year=1999, end_year=2022)
datasets_tot.keys()  # Display the keys of the dictionary
# Takes around 1.30 min to run

1999

2000

2001

2002

2003

2004

2005

2006

2007

2008

2009

2010

2011

2012

2013

2014

2015

2016

2017

2018

2019

2020

2021

2022

dict_keys(['orig_1999', 'perf_1999', 'orig_2000', 'perf_2000', 'orig_2001', 'perf_2001', 'orig_2002', 'perf_2002', 'orig_2003', 'perf_2003', 'orig_2004', 'perf_2004', 'orig_2005', 'perf_2005', 'orig_2006', 'perf_2006', 'orig_2007', 'perf_2007', 'orig_2008', 'perf_2008', 'orig_2009', 'perf_2009', 'orig_2010', 'perf_2010', 'orig_2011', 'perf_2011', 'orig_2012', 'perf_2012', 'orig_2013', 'perf_2013', 'orig_2014', 'perf_2014', 'orig_2015', 'perf_2015', 'orig_2016', 'perf_2016', 'orig_2017', 'perf_2017', 'orig_2018', 'perf_2018', 'orig_2019', 'perf_2019', 'orig_2020', 'perf_2020', 'orig_2021', 'perf_2021', 'orig_2022', 'perf_2022'])

In [8]:
def merge_orig_with_perf(orig_data, perf_data):
    """Left-join the origination columns onto the performance rows using the "LSN" key.

    Every row of `perf_data` is kept; rows whose LSN has no match in `orig_data`
    get missing values in the origination columns.
    """
    return perf_data.merge(orig_data, how="left", on="LSN")

In [9]:
def merge_all_datasets(datasets):
    """
    Build one merged frame per year from the "orig_<year>" / "perf_<year>" pairs in `datasets`.

    Parameters:
    - datasets: Dictionary containing formatted origination and performance datasets.

    Returns:
    - merged_datasets: Dictionary keyed "fm_<year>". Each frame is the performance data
      left-joined with the origination data, with MRP replaced by a datetime "Date"
      column and the columns ordered Date, LSN, then the rest.
    """
    # Year suffixes present in the keys, e.g. "orig_1999" -> 1999
    available_years = sorted({int(name.split("_")[-1]) for name in datasets})

    merged_datasets = {}
    for year in available_years:
        orig_key, perf_key = f"orig_{year}", f"perf_{year}"
        # A year is merged only when both of its frames are present
        if orig_key not in datasets or perf_key not in datasets:
            continue
        merged = merge_orig_with_perf(datasets[orig_key], datasets[perf_key])
        # Parse MRP with the '%Y%m' format into Date, then discard MRP
        merged = merged.assign(Date=pd.to_datetime(merged['MRP'], format = '%Y%m')).drop(columns=['MRP'])
        trailing_cols = [col for col in merged.columns if col not in ("Date", "LSN")]
        merged_datasets[f"fm_{year}"] = merged[["Date", "LSN"] + trailing_cols]
        print("merged", year)
    return merged_datasets

# Merge the origination and performance frames of every year held in datasets_tot
merged_datasets = merge_all_datasets(datasets_tot)
merged_datasets.keys()  # Display the keys of the merged datasets dictionary


merged 1999
merged 2000
merged 2001
merged 2002
merged 2003
merged 2004
merged 2005
merged 2006
merged 2007
merged 2008
merged 2009
merged 2010
merged 2011
merged 2012
merged 2013
merged 2014
merged 2015
merged 2016
merged 2017
merged 2018
merged 2019
merged 2020
merged 2021
merged 2022


dict_keys(['fm_1999', 'fm_2000', 'fm_2001', 'fm_2002', 'fm_2003', 'fm_2004', 'fm_2005', 'fm_2006', 'fm_2007', 'fm_2008', 'fm_2009', 'fm_2010', 'fm_2011', 'fm_2012', 'fm_2013', 'fm_2014', 'fm_2015', 'fm_2016', 'fm_2017', 'fm_2018', 'fm_2019', 'fm_2020', 'fm_2021', 'fm_2022'])

In [10]:
# Preview the first rows of the merged 1999 data
fm_1999 = merged_datasets['fm_1999']
fm_1999.head()

Unnamed: 0,Date,LSN,CLDS,CIR,ELTV,DDD,CS,FPD,FIRST_F,MD,CLTV,DTI,LTV,OIR,P_TYPE,POSTAL,OLT
0,2002-09-01,F99Q10000029,0,6.375,,,618,200210,N,202902,85,24,85,6.375,SF,44200,317
1,2002-10-01,F99Q10000029,0,6.375,,,618,200210,N,202902,85,24,85,6.375,SF,44200,317
2,2002-11-01,F99Q10000029,0,6.375,,,618,200210,N,202902,85,24,85,6.375,SF,44200,317
3,2002-12-01,F99Q10000029,0,6.375,,,618,200210,N,202902,85,24,85,6.375,SF,44200,317
4,2003-01-01,F99Q10000029,0,6.375,,,618,200210,N,202902,85,24,85,6.375,SF,44200,317


In [11]:
# Release the raw yearly frames and the layout metadata now that the merged frames exist.
# After this cell load_yearly_data() can no longer run, because it reads the deleted
# layout lists; re-run the layout cells first if more data must be loaded.
del datasets_tot, cols_keep_orig, cols_keep_perf, layout
del orig_column_names, orig_data_types, perf_column_names, perf_data_types
del perf_layout, orig_layout


## Drop columns and add 3ZIP

In [12]:
# for key in merged_datasets.keys():
#     merged_datasets[key]['Date'] = merged_datasets[key]['MRP'].astype(str)
#     merged_datasets[key]['3ZIP'] = merged_datasets[key]['POSTAL'].astype(str).str[:3].astype('int16')
#     #Transform DDD to 0 if NaN and 1 if Y
#     merged_datasets[key]['DDD'] = merged_datasets[key]['DDD'].fillna(0)
#     merged_datasets[key]['DDD'] = merged_datasets[key]['DDD'].replace('Y', 1)
#     #Transform FIRST_F to 0 if N and 1 if Y
#     merged_datasets[key]['FIRST_F'] = merged_datasets[key]['FIRST_F'].replace('N', 0)
#     merged_datasets[key]['FIRST_F'] = merged_datasets[key]['FIRST_F'].replace('Y', 1)
#     #Convert int64 to int32 or int16 or bool
#     merged_datasets[key]['DDD'] = merged_datasets[key]['DDD'].astype('bool')
#     merged_datasets[key]['FIRST_F'] = merged_datasets[key]['FIRST_F'].astype('bool')
#     merged_datasets[key]['ELTV'] = merged_datasets[key]['ELTV'].astype('Int16')
#     merged_datasets[key]['CS'] = merged_datasets[key]['CS'].astype('Int16')
#     merged_datasets[key]['CLTV'] = merged_datasets[key]['CLTV'].astype('Int16')
#     merged_datasets[key]['OLT'] = merged_datasets[key]['OLT'].astype('Int16')
#     merged_datasets[key]['DTI'] = merged_datasets[key]['DTI'].astype('Int16')
#     merged_datasets[key]['FPD'] = merged_datasets[key]['FPD'].astype('Int32')
#     merged_datasets[key]['MD'] = merged_datasets[key]['MD'].astype('int32')
#     #Drop POSTAL and MRP
#     merged_datasets[key].drop(['POSTAL'], axis=1, inplace=True)
#     merged_datasets[key].drop(['MRP'], axis=1, inplace=True)
#     #Move Date and 3ZIP to the front
#     merged_datasets[key] = merged_datasets[key][["Date", "3ZIP"] + [col for col in merged_datasets[key].columns if col not in ["Date", "3ZIP"]]]
#     print(f"{key} added and dropped")
# merged_datasets['fm_2022'].head()

In [13]:
import dask.dataframe as dd
from dask.diagnostics.progress import ProgressBar
def process_dataset(df):
    """
    Clean one merged loan-level dataset and flag the months that precede a delinquency.

    Parameters:
    - df: Dask DataFrame (its `_meta` is handed to groupby-apply) containing at least the
      columns POSTAL, DDD, FIRST_F, CLDS, LSN and Date. Columns are assigned on the
      passed-in object.

    Returns:
    - Lazy Dask DataFrame with POSTAL replaced by the text column 3ZIP, DDD / FIRST_F
      recoded, CLDS cast to int16, the 0/1 flags D90 and D180 added, a fresh index,
      and the columns ordered Date, 3ZIP, then the rest.
    """
    # 3ZIP = first three characters of the five-digit postal code. When POSTAL was parsed
    # as a number its leading zeros are gone (02100 -> 2100, or 2100.0 for floats), so
    # strip a trailing ".0" and zero-pad all-digit values back to five characters before
    # slicing. Non-numeric values are left as they are.
    postal_text = df['POSTAL'].astype(str).str.replace(r'\.0$', '', regex=True)
    zero_padded = postal_text.str.zfill(5)
    df['3ZIP'] = zero_padded.where(postal_text.str.isdigit(), postal_text).str[:3]

    # DDD: missing -> 0, 'Y' -> 1; FIRST_F: 'N' -> 0, 'Y' -> 1
    df['DDD'] = df['DDD'].fillna(0).replace('Y', 1)
    df['FIRST_F'] = df['FIRST_F'].replace({'N': 0, 'Y': 1})
    # Change the non-numeric 'RA' status to 99 so that CLDS can be stored as int16
    df['CLDS'] = df['CLDS'].replace('RA', 99)
    df['CLDS'] = df['CLDS'].astype('int16')

    # Flags start at 0 and are switched on per loan in process_group
    df['D90'] = 0
    df['D180'] = 0
    # POSTAL is superseded by 3ZIP
    df = df.drop(['POSTAL'], axis=1)

    def process_group(group):
        """Flag, within one loan's rows, the month `offset` months before a status first appears."""
        # Work on a copy so the frame handed in by groupby is never written to
        group = group.copy()
        # NOTE(review): D180 keys on CLDS == 7 with a 6-month look-back while D90 uses
        # status 3 with a 3-month look-back — confirm 7 is the intended status code.
        for val, offset, column in [(3, 3, 'D90'), (7, 6, 'D180')]:
            if val in group['CLDS'].values:
                delinquency_date = group[group['CLDS'] == val]['Date'].min()
                back_date = delinquency_date - pd.DateOffset(months=offset)
                # Nothing is flagged when the loan has no row dated back_date
                group.loc[group['Date'] == back_date, column] = 1
        return group

    df = df.groupby('LSN').apply(process_group, meta=df._meta)
    # Discard the index produced by the grouped apply
    df = df.reset_index(drop=True)

    # Move Date and 3ZIP to the front
    df = df[["Date", "3ZIP"] + [col for col in df.columns if col not in ["Date", "3ZIP"]]]

    return df

# Convert each yearly pandas frame to a Dask DataFrame (6 partitions), run process_dataset
# on it, and store the computed pandas result back under the same key.
# NOTE: this cell cannot be re-run on already-processed frames — process_dataset reads
# the POSTAL column, which it also drops.
with ProgressBar():
    for key in merged_datasets.keys():
        ddf = dd.from_pandas(merged_datasets[key], npartitions=6)
        merged_datasets[key] = process_dataset(ddf).compute()
        print(f"{key} added and dropped")

merged_datasets['fm_2022'].head()

[########################################] | 100% Completed | 29.05 ss
fm_2022 added and dropped
[########################################] | 100% Completed | 29.53 ss
fm_2021 added and dropped
[########################################] | 100% Completed | 30.73 ss
fm_2020 added and dropped
[########################################] | 100% Completed | 31.57 s
fm_2019 added and dropped
[########################################] | 100% Completed | 32.48 s
fm_2018 added and dropped
[########################################] | 100% Completed | 33.86 s
fm_2017 added and dropped
[########################################] | 100% Completed | 34.91 s
fm_2016 added and dropped
[########################################] | 100% Completed | 36.56 s
fm_2015 added and dropped
[########################################] | 100% Completed | 34.63 s
fm_2014 added and dropped
[########################################] | 100% Completed | 36.34 s
fm_2013 added and dropped
[####################################

Unnamed: 0,Date,3ZIP,LSN,CLDS,CIR,ELTV,DDD,CS,FPD,FIRST_F,MD,CLTV,DTI,LTV,OIR,P_TYPE,OLT,D90,D180
0,2022-02-01,125,F22Q10000012,0,2.625,57,0,768,202203,0,203702,57,28,57,2.625,SF,180,0,0
1,2022-03-01,125,F22Q10000012,0,2.625,48,0,768,202203,0,203702,57,28,57,2.625,SF,180,0,0
2,2022-04-01,125,F22Q10000012,0,2.625,52,0,768,202203,0,203702,57,28,57,2.625,SF,180,0,0
3,2022-05-01,125,F22Q10000012,0,2.625,40,0,768,202203,0,203702,57,28,57,2.625,SF,180,0,0
4,2022-06-01,125,F22Q10000012,0,2.625,39,0,768,202203,0,203702,57,28,57,2.625,SF,180,0,0


In [20]:
# Count flagged (1) versus unflagged (0) rows of D90 in the 2008 data
merged_datasets['fm_2008']['D90'].value_counts()

D90
0    2409669
1       4655
Name: count, dtype: int64

In [None]:
# Inspect the column dtypes and the summary statistics of DTI for the 2022 data
display(merged_datasets['fm_2022'].dtypes)
display(merged_datasets['fm_2022']['DTI'].describe())

In [21]:
# Persist the whole dict of processed yearly frames to a single pickle file
with open("../Data/mortgage_data/fm_datasets.pickle", "wb") as f:
    pickle.dump(merged_datasets, f)

# From here on, the cells below work with fm_datasets, loaded back from that pickle file

In [None]:
import pandas as pd
from datetime import datetime
import pickle
import sqlite3

In [None]:

# Reload the dict of yearly frames from the pickle file.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open("../Data/mortgage_data/fm_datasets.pickle", "rb") as f:
    fm_datasets = pickle.load(f)

In [None]:
# Query that stacks the fm_2022 and fm_2021 tables (UNION ALL keeps duplicate rows)
# NOTE(review): `conn` and the fm_2022 / fm_2021 SQL tables are not created in any visible
# cell — confirm that a database connection is opened and the tables are written before
# this cell runs on a fresh kernel.
query = "SELECT * FROM fm_2022 UNION ALL SELECT * FROM fm_2021;"

fm_21_22 = pd.read_sql_query(query, conn)
# TODO: save to csv (no code for this step is present here)


# Merge ENSO data

In [None]:
# Load enso_mei_long.csv
enso_mei_long = pd.read_csv("../Data/enso_mei_long.csv")
# Transform Month Dec to 12, Jan to 1, Feb to 2, etc.
# NOTE(review): extract_month is not defined in any visible cell — confirm where it comes from.
enso_mei_long['Month'] = enso_mei_long['Month'].apply(extract_month)
# Build the 'YYYYMM' text key column-wise: the year followed by the month zero-padded to
# two digits. This replaces a row-wise apply that called .astype on individual cell values,
# which fails when a row holds plain Python numbers and is slow on long tables.
enso_mei_long['Date'] = (
    enso_mei_long['Year'].astype(int).astype(str)
    + enso_mei_long['Month'].astype(int).astype(str).str.zfill(2)
)
enso_mei_long.head()

In [None]:
# Keep only the Date and MEI columns
enso_mei_long = enso_mei_long[['Date', 'MEI']]

In [None]:
# Write the frame to the database as table enso_mei_long, replacing the table if it already exists
enso_mei_long.to_sql('enso_mei_long', conn, if_exists = "replace", index = False)

## Add HURR data to database

In [None]:
import pandas as pd
# Read 3ZIP as text (dtype str) instead of letting pandas infer a numeric type
hrcn_data = pd.read_csv('mainland_usa_gdf_HRCN.csv', dtype={'3ZIP':str})
# Keep the ZIP key plus the four HRCN_* columns and store them as a database table
hrcn_data_short = hrcn_data[['3ZIP', 'HRCN_RISKS', 'HRCN_RISKV', 'HRCN_EVNTS', 'HRCN_EALS']]
hrcn_data_short.to_sql('hrcn_data_short', conn, if_exists = "replace", index = False)

### Aggregate HRCN data first

In [None]:
# Reduce hrcn_data_short to one row per 3ZIP and store the result as a new table.
# NOTE(review): the query uses GROUP BY without any aggregate function, so in SQLite the
# other columns of each output row come from an arbitrary row of that 3ZIP group — confirm
# this is the intended "aggregation" (as opposed to e.g. AVG / MAX per column).
query_hrcn = f""" 
SELECT * 
FROM hrcn_data_short 
GROUP BY "3ZIP"
"""
hrcn_data_short_agg_1 = pd.read_sql_query(query_hrcn, conn)
hrcn_data_short_agg_1.to_sql('hrcn_data_short_agg_1', conn, if_exists = "replace", index = False)

# Close Database

In [None]:
# Close the database connection; cells that use `conn` cannot run after this
conn.close()

## Function to merge zip data to performance

In [None]:
def merge_zip_data(perf_data, zip_data, perf_zip_col, zip_data_col,
                   perf_zip_width=None, zip_data_width=None):
    """
    Merge external data with the performance dataset based on the 3zip code.

    Neither input frame is modified; the '3zip' key is added to internal copies only.

    Parameters:
    - perf_data: The performance dataset.
    - zip_data: The external dataset with 3zip level information.
    - perf_zip_col: The zip column name in the performance dataset (might be 5-digit zip).
    - zip_data_col: The 3zip column name in the external dataset.
    - perf_zip_width: Optional full width of the codes in perf_zip_col (e.g. 5). When given,
      all-digit codes are zero-padded to this width before the first three characters are
      taken, which restores leading zeros lost when the column was parsed as a number.
      Default None takes the first three characters as they are.
    - zip_data_width: Same as perf_zip_width, for zip_data_col (e.g. 3).

    Returns:
    - Merged dataset: every row of perf_data, left-joined with zip_data on '3zip'.
    """
    def _zip_prefix(codes, width):
        """Return the first three characters of each code, optionally zero-padding first."""
        text = codes.astype(str)
        if width is None:
            return text.str[:3]
        # Float-parsed codes look like "2100.0"; remove the suffix before padding
        text = text.str.replace(r"\.0$", "", regex=True)
        padded = text.str.zfill(width)
        # Pad only all-digit codes; anything else (e.g. "nan") is left unchanged
        return padded.where(text.str.isdigit(), text).str[:3]

    # assign() returns new frames, so the caller's objects do not gain a '3zip' column
    perf_keyed = perf_data.assign(**{'3zip': _zip_prefix(perf_data[perf_zip_col], perf_zip_width)})
    zip_keyed = zip_data.assign(**{'3zip': _zip_prefix(zip_data[zip_data_col], zip_data_width)})

    # Merge datasets based on the 3zip code
    merged_data = pd.merge(perf_keyed, zip_keyed, on='3zip', how='left')

    return merged_data