In [2]:
import pandas as pd
from datetime import datetime
import pickle
import sqlite3

In [2]:
# Read the layout workbook; sheet_name=None loads every sheet into a dict keyed by sheet name
layout = pd.read_excel(
    "../Data/mortgage_data/file_layout.xlsx",
    sheet_name=None,
)

## Column name extraction from Freddie Mac documentation

In [3]:
# The workbook dict is keyed by sheet name; pick the sheet describing each file type
orig_layout = layout['Origination Data File']
perf_layout = layout['Monthly Performance Data File']

# Attribute names and declared data types, as plain Python lists
orig_column_names, orig_data_types = (
    orig_layout[header].tolist() for header in ('ATTRIBUTE NAME', 'DATA TYPE & FORMAT')
)
perf_column_names, perf_data_types = (
    perf_layout[header].tolist() for header in ('ATTRIBUTE NAME', 'DATA TYPE & FORMAT')
)

# KEEP flags from the layout sheets, position-aligned with the attribute names
cols_keep_orig = orig_layout['KEEP'].tolist()
cols_keep_perf = perf_layout['KEEP'].tolist()


In [4]:
# Inspect the KEEP flags of the first 22 origination attributes
cols_keep_orig[0:22]

[1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1]

## Load data per year

In [5]:
def drop_cols_and_NAN(data):
    """
    Return ``data`` without the columns that contain nothing but NaN values.

    Parameters:
    - data: DataFrame to clean.

    Returns:
    - A new DataFrame; columns with at least one non-NaN value are kept.
    """
    return data.dropna(axis=1, how='all')

In [6]:
#function that drops columns where cols_keep is 0
def drop_cols(data, cols_keep, col_names):
    """
    Remove from ``data`` every column whose keep-flag equals 0.

    Parameters:
    - data: DataFrame to filter.
    - cols_keep: Sequence of flags, position-aligned with ``col_names``.
    - col_names: Column names; ``col_names[i]`` is dropped when ``cols_keep[i] == 0``.

    Returns:
    - A new DataFrame without the flagged columns.
    """
    unwanted = [col_names[pos] for pos, flag in enumerate(cols_keep) if flag == 0]
    return data.drop(unwanted, axis=1)

In [7]:
def load_yearly_data(year, base_dir="../Data/mortgage_data"):
    """
    Load and format the origination and performance datasets for a given year, considering the folder structure.

    Both files are read as pipe-separated text without a header row. Only the first
    22 origination columns are kept and named from ``orig_column_names``; the
    performance columns are named from ``perf_column_names``. Columns flagged 0 in
    ``cols_keep_orig`` / ``cols_keep_perf`` are then dropped, followed by any column
    that is entirely NaN.

    Dropping the flagged columns is best-effort: if it fails for one dataset, a
    warning is issued and that dataset keeps all of its columns. The two datasets
    are handled independently, so a failure on one no longer skips the other.

    Parameters:
    - year: The year for which to load the data.
    - base_dir: The base directory where the datasets are stored.

    Returns:
    - orig_data: Formatted origination dataset for the given year.
    - perf_data: Formatted performance dataset for the given year.
    """
    import warnings  # local import: only needed for the best-effort warnings below

    # Number of leading origination columns that are loaded and named
    n_orig_cols = 22

    # Construct file paths considering the "sample_YYYY" folder structure
    sample_dir = f"{base_dir}/sample_{year}"
    orig_file_path = f"{sample_dir}/sample_orig_{year}.txt"
    perf_file_path = f"{sample_dir}/sample_svcg_{year}.txt"

    # Load origination data, keep the leading columns and name them from the layout
    orig_data = pd.read_csv(orig_file_path, sep="|", header=None, low_memory=False)
    orig_data = orig_data.iloc[:, 0:n_orig_cols]
    orig_data.columns = orig_column_names[0:n_orig_cols]

    # Load performance data, naming the columns from the layout
    perf_data = pd.read_csv(perf_file_path, sep="|", header=None, names=perf_column_names, low_memory=False)

    # Best-effort removal of the columns flagged 0 in the layout. Failures are
    # reported instead of being silently swallowed, and each dataset is tried
    # on its own so one failure does not leave the other untouched.
    try:
        orig_data = drop_cols(orig_data, cols_keep_orig[0:n_orig_cols], orig_column_names)
    except Exception as err:
        warnings.warn(
            f"Could not drop flagged origination columns for {year}: {err!r}",
            stacklevel=2,
        )
    try:
        perf_data = drop_cols(perf_data, cols_keep_perf, perf_column_names)
    except Exception as err:
        warnings.warn(
            f"Could not drop flagged performance columns for {year}: {err!r}",
            stacklevel=2,
        )

    orig_data = drop_cols_and_NAN(orig_data)
    perf_data = drop_cols_and_NAN(perf_data)
    return orig_data, perf_data


## Load all data at once into dictionary

In [8]:
def load_all_datasets(start_year=1999, end_year=2022, base_dir="../Data/mortgage_data/"):
    """
    Load the origination and performance datasets for every year in a range.

    Parameters:
    - start_year: First year to load (inclusive).
    - end_year: Last year to load (inclusive).
    - base_dir: The base directory where the datasets are stored.

    Returns:
    - datasets: Dictionary with one "orig_<year>" and one "perf_<year>" entry per loaded year.
    """
    datasets = {}

    for year in range(start_year, end_year + 1):
        # Show the year currently being loaded as a progress indicator
        display(year)
        yearly_orig, yearly_perf = load_yearly_data(year, base_dir=base_dir)
        datasets.update({f"orig_{year}": yearly_orig, f"perf_{year}": yearly_perf})

    return datasets

# Load the sample data for every year from 1999 through 2022
# Pass a narrower start_year / end_year to load_all_datasets to load fewer years
datasets_demo = load_all_datasets(start_year=1999, end_year=2022)
datasets_demo.keys()  # Display the keys of the dictionary

1999

2000

2001

2002

2003

2004

2005

2006

2007

2008

2009

2010

2011

2012

2013

2014

2015

2016

2017

2018

2019

2020

2021

2022

dict_keys(['orig_1999', 'perf_1999', 'orig_2000', 'perf_2000', 'orig_2001', 'perf_2001', 'orig_2002', 'perf_2002', 'orig_2003', 'perf_2003', 'orig_2004', 'perf_2004', 'orig_2005', 'perf_2005', 'orig_2006', 'perf_2006', 'orig_2007', 'perf_2007', 'orig_2008', 'perf_2008', 'orig_2009', 'perf_2009', 'orig_2010', 'perf_2010', 'orig_2011', 'perf_2011', 'orig_2012', 'perf_2012', 'orig_2013', 'perf_2013', 'orig_2014', 'perf_2014', 'orig_2015', 'perf_2015', 'orig_2016', 'perf_2016', 'orig_2017', 'perf_2017', 'orig_2018', 'perf_2018', 'orig_2019', 'perf_2019', 'orig_2020', 'perf_2020', 'orig_2021', 'perf_2021', 'orig_2022', 'perf_2022'])

In [9]:
# with open("../Data/mortgage_data/datasets_demo.pickle", "wb") as f:
#     pickle.dump(datasets_demo, f)

In [10]:
#with open("../Data/mortgage_data/datasets_demo.pickle", "rb") as f:
#    datasets_unmerged = pickle.load(f)

In [11]:
def merge_orig_with_perf(orig_data, perf_data):
    """
    Left-join the origination data onto the performance data using the "LSN" column.

    Parameters:
    - orig_data: Origination dataset; must contain an "LSN" column.
    - perf_data: Performance dataset; must contain "LSN" and "MRP" columns.

    Returns:
    - Merged DataFrame with one row per performance row, ordered so that "MRP" is
      the first column and "LSN" the second; the remaining columns keep their order.
    """
    joined = perf_data.merge(orig_data, on="LSN", how="left")
    leading = ["MRP", "LSN"]
    trailing = [name for name in joined.columns if name not in leading]
    return joined[leading + trailing]

In [12]:
def merge_all_datasets(datasets):
    """
    Merge the origination and performance datasets of each year found in ``datasets``.

    The year is taken from the part of each key after its last underscore. A year is
    merged only when both its "orig_<year>" and "perf_<year>" entries are present.
    Each merged frame also gets a 'Date' column holding 'MRP' converted to string.

    Parameters:
    - datasets: Dictionary containing formatted origination and performance datasets.

    Returns:
    - merged_datasets: Dictionary mapping "fm_<year>" to the merged dataset of that year.
    """
    merged_datasets = {}
    # Distinct years present in the keys, in ascending order
    years = sorted({int(name.rsplit("_", 1)[-1]) for name in datasets.keys()})
    for year in years:
        orig_key, perf_key = f"orig_{year}", f"perf_{year}"
        if orig_key not in datasets or perf_key not in datasets:
            continue
        merged = merge_orig_with_perf(datasets[orig_key], datasets[perf_key])
        merged['Date'] = merged['MRP'].astype(str)
        merged_datasets[f"fm_{year}"] = merged
        print("merged", year)
    return merged_datasets

# Merge each year's origination and performance frames from datasets_demo;
# the resulting dictionary is keyed "fm_<year>"
merged_datasets_demo = merge_all_datasets(datasets_demo)
merged_datasets_demo.keys()  # Display the keys of the merged datasets dictionary


merged 1999
merged 2000
merged 2001
merged 2002
merged 2003
merged 2004
merged 2005
merged 2006
merged 2007
merged 2008
merged 2009
merged 2010
merged 2011
merged 2012
merged 2013
merged 2014
merged 2015
merged 2016
merged 2017
merged 2018
merged 2019
merged 2020
merged 2021
merged 2022


dict_keys(['fm_1999', 'fm_2000', 'fm_2001', 'fm_2002', 'fm_2003', 'fm_2004', 'fm_2005', 'fm_2006', 'fm_2007', 'fm_2008', 'fm_2009', 'fm_2010', 'fm_2011', 'fm_2012', 'fm_2013', 'fm_2014', 'fm_2015', 'fm_2016', 'fm_2017', 'fm_2018', 'fm_2019', 'fm_2020', 'fm_2021', 'fm_2022'])

In [19]:
# Spot-check one merged year: dimensions first, then the leading rows
fm_2022 = merged_datasets_demo['fm_2022']
display(fm_2022.shape, fm_2022.head())

(419440, 20)

Unnamed: 0,MRP,LSN,CLDS,AGE,MONTS_REM,CIR,ELTV,DDD,CS,FPD,FIRST_F,MD,CLTV,DTI,LTV,OIR,P_TYPE,POSTAL,OLT,Date
0,202202,F22Q10000012,0,0,180,2.625,57,,768,202203,N,203702,57,28,57,2.625,SF,12500,180,202202
1,202203,F22Q10000012,0,1,179,2.625,48,,768,202203,N,203702,57,28,57,2.625,SF,12500,180,202203
2,202204,F22Q10000012,0,2,178,2.625,52,,768,202203,N,203702,57,28,57,2.625,SF,12500,180,202204
3,202205,F22Q10000012,0,3,177,2.625,40,,768,202203,N,203702,57,28,57,2.625,SF,12500,180,202205
4,202206,F22Q10000012,0,4,176,2.625,39,,768,202203,N,203702,57,28,57,2.625,SF,12500,180,202206


In [21]:
# Release the intermediate objects in a single statement.
# Note: load_yearly_data reads the layout lists deleted here, so it cannot be
# called again until the layout cells are re-run.
del (
    datasets_demo,
    cols_keep_orig,
    cols_keep_perf,
    layout,
    orig_column_names,
    orig_data_types,
    perf_column_names,
    perf_data_types,
    perf_layout,
    orig_layout,
    fm_2022,
)


In [14]:
# with open("../Data/mortgage_data/fm_datasets.pickle", "wb") as f:
#     pickle.dump(merged_datasets_demo, f)

In [3]:
# Reload the merged per-year frames from disk.
# NOTE(review): pickle.load can execute arbitrary code, so only load pickles produced by this project.
# NOTE(review): the cell that writes fm_datasets.pickle is commented out in this notebook,
# so the file must already exist on disk for this cell to run.
with open("../Data/mortgage_data/fm_datasets.pickle", "rb") as f:
    fm_datasets = pickle.load(f)

## Function to merge macro data to performance

In [4]:
def merge_macro_data(perf_data, macro_data, perf_date_col, macro_date_col):
    """
    Left-join macroeconomic data onto the performance dataset by date.

    Every row of ``perf_data`` is kept; rows without a matching date in
    ``macro_data`` get missing values in the macro columns.

    Parameters:
    - perf_data: The performance dataset.
    - macro_data: The macroeconomic dataset.
    - perf_date_col: The date column name in the performance dataset.
    - macro_date_col: The date column name in the macroeconomic dataset.

    Returns:
    - Merged dataset.
    """
    return perf_data.merge(
        macro_data,
        how='left',
        left_on=perf_date_col,
        right_on=macro_date_col,
    )

# Sample usage of the functions can be provided if datasets are available.
# For now, these functions are generic and can be adapted to actual data.

In [5]:
def extract_year(value):
    """
    Extract the year from a date-like value.

    The value is converted to a string first. A 4-character string is returned
    unchanged (as a string). Otherwise the string is parsed as "YYYY-MM-DD" and
    then as "YYYYMM"; the first format that matches yields the year as an int.
    If neither format matches, the string form of the input is returned.

    Parameters:
    - value: Value to parse; any object, converted with str().

    Returns:
    - int year when a date format matched, otherwise the string form of ``value``.
      Note the mixed return type: "2022" stays a string while "202201" gives 2022.
    """
    value = str(value)
    if len(value) == 4:
        return value
    # Formats are tried in order: dates like "2022-01-15", then dates like "202201"
    for date_format in ("%Y-%m-%d", "%Y%m"):
        try:
            return datetime.strptime(value, date_format).year
        except ValueError:
            # Only a format mismatch is expected here; anything else should surface
            continue
    return value

def extract_month(value):
    """
    Extract the month number from a month name or date-like value.

    The value is converted to a string and parsed, in order, as an abbreviated
    month name ("Dec", "Jan", ...), as "YYYY-MM-DD", and as "YYYYMM". The first
    format that matches yields the month as an int. If none matches, the string
    form of the input is returned.

    Parameters:
    - value: Value to parse; any object, converted with str().

    Returns:
    - int month (1-12) when a format matched, otherwise the string form of ``value``.
    """
    value = str(value)
    # Formats are tried in order: month names like "Dec", dates like "2022-01-15",
    # then dates like "202201" or "20221"
    for date_format in ("%b", "%Y-%m-%d", "%Y%m"):
        try:
            return datetime.strptime(value, date_format).month
        except ValueError:
            # Only a format mismatch is expected here; anything else should surface
            continue
    return value  #or some default value
# TODO: add a function that takes a month column and a year column and returns a "YYYYmm" string (no such function is defined in this cell)

# Merge ENSO data

In [6]:
#Load enso_mei_long.csv
enso_mei_long = pd.read_csv("../Data/enso_mei_long.csv")
#Transform Month Dec to 12, Jan to 1, Feb to 2, etc.
enso_mei_long['Month'] = enso_mei_long['Month'].apply(extract_month)
#Build the "YYYYmm" merge key column-wise (month zero-padded to two digits).
#This replaces a row-by-row format that called .astype on scalar values and
#therefore only worked while every column in the row was numeric.
enso_mei_long['Date'] = (
    enso_mei_long['Year'].astype(int).astype(str)
    + enso_mei_long['Month'].astype(int).astype(str).str.zfill(2)
)
enso_mei_long.head()

Unnamed: 0,Year,Month,MEI,Date
0,1979,12,0.47,197912
1,1979,1,0.27,197901
2,1979,2,-0.04,197902
3,1979,3,0.26,197903
4,1979,4,0.35,197904


In [8]:
#merge enso_mei_long with all fm_YYYY datasets by Date in for loop
# Left-join the ENSO MEI series onto every yearly frame, matching on the 'Date' column of both
for key, yearly_frame in list(fm_datasets.items()):
    fm_datasets[key] = merge_macro_data(yearly_frame, enso_mei_long, 'Date', 'Date')
    print("merged", key)

merged fm_1999
merged fm_2000
merged fm_2001
merged fm_2002
merged fm_2003
merged fm_2004
merged fm_2005
merged fm_2006
merged fm_2007
merged fm_2008
merged fm_2009
merged fm_2010
merged fm_2011
merged fm_2012
merged fm_2013
merged fm_2014
merged fm_2015
merged fm_2016
merged fm_2017
merged fm_2018
merged fm_2019
merged fm_2020
merged fm_2021
merged fm_2022


# Access Database

In [9]:
# Open a connection to the SQLite database file that stores the merged tables
# (sqlite3.connect creates the file if it does not exist yet)
db_path = "../Database/thesis_database.db"
conn = sqlite3.connect(db_path)

In [11]:
# Persist each yearly frame as its own table named after its key; an existing table is replaced
for key in fm_datasets:
    dataset = fm_datasets[key]
    print("Writing", key, "to database...")
    dataset.to_sql(name=key, con=conn, if_exists="replace", index=False)

Writing fm_1999 to database...
Writing fm_2000 to database...
Writing fm_2001 to database...
Writing fm_2002 to database...
Writing fm_2003 to database...
Writing fm_2004 to database...
Writing fm_2005 to database...
Writing fm_2006 to database...
Writing fm_2007 to database...
Writing fm_2008 to database...
Writing fm_2009 to database...
Writing fm_2010 to database...
Writing fm_2011 to database...
Writing fm_2012 to database...
Writing fm_2013 to database...
Writing fm_2014 to database...
Writing fm_2015 to database...
Writing fm_2016 to database...
Writing fm_2017 to database...
Writing fm_2018 to database...
Writing fm_2019 to database...
Writing fm_2020 to database...
Writing fm_2021 to database...
Writing fm_2022 to database...


In [12]:
# Drop the in-memory frames; the loop above has written each of them to the database
del fm_datasets

In [20]:
# Connect to the same database file again for querying.
# NOTE(review): if the connection opened earlier is still open, rebinding `conn`
# here leaves it unclosed — confirm whether it should be closed first.
db_path = "../Database/thesis_database.db"
conn = sqlite3.connect(db_path)

In [22]:
# Single-table alternative, kept for reference: "SELECT * FROM fm_2021"
# (it was previously assigned to `query` and immediately overwritten)

#query that binds fm_2022 and fm_2021
# UNION ALL keeps duplicate rows and matches columns by position,
# so both tables must have the same columns in the same order.
query = "SELECT * FROM fm_2022 UNION ALL SELECT * FROM fm_2021;"


FM_20212022 = pd.read_sql_query(query, conn)
#save to csv
# fm_2021.to_csv("../Data/mortgage_data/fm_2021.csv", index=False)

Unnamed: 0,MRP,LSN,CLDS,AGE,MONTS_REM,CIR,ELTV,DDD,CS,FPD,...,DTI,LTV,OIR,P_TYPE,POSTAL,OLT,Date,Year,Month,MEI
0,202202,F22Q10000012,0,0,180,2.625,57,,768,202203,...,28,57,2.625,SF,12500,180,202202,2022,2,-1.28
1,202203,F22Q10000012,0,1,179,2.625,48,,768,202203,...,28,57,2.625,SF,12500,180,202203,2022,3,-1.76
2,202204,F22Q10000012,0,2,178,2.625,52,,768,202203,...,28,57,2.625,SF,12500,180,202204,2022,4,-1.88
3,202205,F22Q10000012,0,3,177,2.625,40,,768,202203,...,28,57,2.625,SF,12500,180,202205,2022,5,-2.07
4,202206,F22Q10000012,0,4,176,2.625,39,,768,202203,...,28,57,2.625,SF,12500,180,202206,2022,6,-2.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1398348,202303,F21Q40865007,0,6,354,3.000,63,,795,202210,...,19,65,3.000,PU,26500,360,202303,2023,3,-0.41
1398349,202212,F21Q40865038,0,0,360,3.375,68,,817,202301,...,33,60,3.375,PU,16000,360,202212,2022,12,-1.06
1398350,202301,F21Q40865038,0,1,359,3.375,59,,817,202301,...,33,60,3.375,PU,16000,360,202301,2023,1,-0.81
1398351,202302,F21Q40865038,0,2,358,3.375,59,,817,202301,...,33,60,3.375,PU,16000,360,202302,2023,2,-0.67


(1398353, 23)

In [40]:
# Close the SQLite connection; the query result is already held in memory as FM_20212022
conn.close()

In [36]:
#Do panel data regression on FM_20212022 per postal code and Date 


#Start with Panel Data Regression
#First, create a panel data frame
#convert Date to datetime




In [38]:
# Parse the "YYYYmm" strings into timestamps, then index the frame by (POSTAL, Date)
FM_20212022 = (
    FM_20212022
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y%m'))
    .set_index(['POSTAL', 'Date'])
)

In [39]:
# Fit a panel regression on the (POSTAL, Date)-indexed frame and print the summary.
# NOTE(review): PanelOLS is not imported anywhere in the visible notebook; it presumably
# comes from linearmodels.panel — confirm and add the import to the imports cell.
# NOTE(review): 'y', 'x1' and 'x2' look like placeholders; the saved output reports ELTV as
# the dependent variable, so this cell does not appear to be the code that produced it — confirm.
mod = PanelOLS(dependent=FM_20212022['y'], exog=FM_20212022[['x1', 'x2']])
res = mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                   ELTV   R-squared:                        0.2273
Estimator:                   PanelOLS   R-squared (Between):              0.6481
No. Observations:             1398353   R-squared (Within):               0.0052
Date:                Mon, Oct 09 2023   R-squared (Overall):              0.2273
Time:                        20:49:02   Log-likelihood                -9.584e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   1.371e+05
Entities:                         881   P-value                           0.0000
Avg Obs:                       1587.2   Distribution:               F(3,1398350)
Min Obs:                       3.0000                                           
Max Obs:                    1.503e+04   F-statistic (robust):          1.371e+05
                            

# Close Database

## Function to merge zip data to performance

In [13]:
def merge_zip_data(perf_data, zip_data, perf_zip_col, zip_data_col):
    """
    Merge external data with the performance dataset based on the 3zip code.

    A '3zip' key is derived on both sides from the first three characters of the
    string form of the respective zip column, and the external data is left-joined
    onto the performance data using that key. The key is built on copies, so
    neither input frame is modified.

    Parameters:
    - perf_data: The performance dataset.
    - zip_data: The external dataset with 3zip level information.
    - perf_zip_col: The zip column name in the performance dataset (might be 5-digit zip).
    - zip_data_col: The 3zip column name in the external dataset.

    Returns:
    - Merged dataset, including the derived '3zip' column.
    """
    # NOTE(review): zip codes stored as numbers lose leading zeros, which would
    # shift the first three characters — confirm how both inputs store them.
    perf_keyed = perf_data.assign(**{'3zip': perf_data[perf_zip_col].astype(str).str[:3]})
    zip_keyed = zip_data.assign(**{'3zip': zip_data[zip_data_col].astype(str).str[:3]})

    # Merge datasets based on the 3zip code, keeping every performance row
    return pd.merge(perf_keyed, zip_keyed, on='3zip', how='left')