# Mortgage file to SQL database

Load the necessary packages.

In [23]:
import pandas as pd
from datetime import datetime
import pickle
import sqlite3
import dask.dataframe as dd
from dask.diagnostics.progress import ProgressBar
from tqdm import tqdm

Load the necessary datasets.

In [18]:
# Load the file-layout workbook. sheet_name=None makes pandas return a dict
# that maps each sheet name to its own DataFrame, which is how `layout` is
# indexed in the next cell.
layout = pd.read_excel("../../Data/mortgage_data/file_layout.xlsx", sheet_name=None)

In [19]:
# Pull the two layout sheets (origination and monthly performance) out of the workbook dict
orig_layout, perf_layout = (
    layout[sheet] for sheet in ('Origination Data File', 'Monthly Performance Data File')
)

# For each sheet, take the column names and the per-column KEEP flags as plain lists
orig_column_names, cols_keep_orig = (
    orig_layout[field].tolist() for field in ('ATTRIBUTE NAME', 'KEEP')
)
perf_column_names, cols_keep_perf = (
    perf_layout[field].tolist() for field in ('ATTRIBUTE NAME', 'KEEP')
)

# The layout frames are no longer needed once the lists exist
del orig_layout, perf_layout, layout

### Load the origination and performance datasets into a dictionary. Also drop unnecessary columns.

In [20]:
def load_yearly_data(year, base_dir="../../Data/mortgage_data"):
    """
    Load and format the origination and performance datasets for a given year, considering the folder structure.

    Relies on the module-level layout lists `orig_column_names`, `perf_column_names`,
    `cols_keep_orig` and `cols_keep_perf`; they must still be defined when this is called.

    Parameters:
    - year: The year for which to load the data.
    - base_dir: The base directory where the datasets are stored.

    Returns:
    - orig_data: Formatted origination dataset for the given year.
    - perf_data: Formatted performance dataset for the given year.
    """
    # Construct file paths considering the "sample_YYYY" folder structure
    orig_file_path = f"{base_dir}/sample_{year}/sample_orig_{year}.txt"
    perf_file_path = f"{base_dir}/sample_{year}/sample_svcg_{year}.txt"

    # Load origination data; only the first 22 columns are kept and named
    orig_data = pd.read_csv(orig_file_path, sep="|", header=None, low_memory=False)
    orig_data = orig_data.iloc[:, 0:22]
    orig_data.columns = orig_column_names[0:22]

    # Load performance data, naming the columns straight from the layout
    perf_data = pd.read_csv(perf_file_path, sep="|", header=None, names=perf_column_names, low_memory=False)

    def drop_cols(data, cols_keep, col_names):
        """Drop the columns of `data` whose KEEP flag is 0, skipping any that are absent."""
        flagged = {name for name, keep in zip(col_names, cols_keep) if keep == 0}
        # Restrict to columns that exist so a layout/file mismatch does not abort
        # the whole drop (previously a bare `except` hid this and dropped nothing).
        cols_to_drop = [col for col in data.columns if col in flagged]
        return data.drop(columns=cols_to_drop)

    # The two drops are independent: a problem with one no longer skips the
    # other, and genuine errors (e.g. a missing layout list) now surface.
    orig_data = drop_cols(orig_data, cols_keep_orig[0:22], orig_column_names)
    perf_data = drop_cols(perf_data, cols_keep_perf, perf_column_names)

    # Remove columns that contain no data at all
    orig_data = orig_data.dropna(axis=1, how='all')
    perf_data = perf_data.dropna(axis=1, how='all')
    return orig_data, perf_data

def load_all_datasets(start_year=1999, end_year=2022, base_dir="../../Data/mortgage_data/"):
    """
    Load the origination and performance datasets of every year in a range.

    Parameters:
    - start_year: First year to load (inclusive).
    - end_year: Last year to load (inclusive).
    - base_dir: The base directory where the datasets are stored.

    Returns:
    - Dictionary with two entries per year, keyed "orig_<year>" and "perf_<year>",
      holding the frames returned by load_yearly_data.
    """
    loaded = {}
    year_range = range(start_year, end_year + 1)

    for yr in tqdm(year_range, desc = "Loading datasets"):
        origination, performance = load_yearly_data(yr, base_dir=base_dir)
        # Origination is inserted before performance, year by year
        loaded.update({f"orig_{yr}": origination, f"perf_{yr}": performance})

    return loaded

# Load every yearly sample from 1999 through 2022 (both bounds inclusive) into one dictionary
datasets_tot = load_all_datasets(start_year=1999, end_year=2022)
# Remove the layout lists from the namespace. NOTE: load_yearly_data reads these
# globals, so it cannot be called again until the layout cells are re-run.
del cols_keep_orig, cols_keep_perf, orig_column_names, perf_column_names

Loading datasets: 100%|██████████| 24/24 [01:34<00:00,  3.94s/it]


### Merge Origination Dataset with Performance Dataset on LSN

In [None]:
def merge_all_datasets(datasets):
    """
    Join each year's performance dataset with its origination dataset.

    Parameters:
    - datasets: Dictionary holding frames under keys "orig_<year>" and "perf_<year>".

    Returns:
    - Dictionary keyed "fm_<year>" with one merged frame per year for which both
      the origination and the performance frame are present. In each merged frame
      the 'MRP' column is replaced by a datetime 'Date' column, and 'Date' and
      'LSN' are moved to the front.
    """
    result = {}

    # The year is the trailing "_<year>" part of every key
    year_set = {int(name.split("_")[-1]) for name in datasets}

    for year in tqdm(sorted(year_set), desc="Merging datasets"):
        orig_key, perf_key = f"orig_{year}", f"perf_{year}"
        if orig_key not in datasets or perf_key not in datasets:
            continue

        # Left join: every performance row is kept and enriched with origination fields
        combined = pd.merge(datasets[perf_key], datasets[orig_key], on="LSN", how="left")

        # Turn the YYYYMM 'MRP' values into a datetime column, then discard 'MRP'
        combined['Date'] = pd.to_datetime(combined['MRP'], format = '%Y%m')
        combined = combined.drop(columns=['MRP'])

        # Reorder so that 'Date' and 'LSN' lead, with the rest in existing order
        trailing = [col for col in combined.columns if col not in ("LSN", "Date")]
        combined = combined[["Date", "LSN"] + trailing]

        result[f"fm_{year}"] = combined
        print("merged", year)

    return result

# Merge every origination/performance pair held in datasets_tot
merged_datasets = merge_all_datasets(datasets_tot)
# The un-merged inputs are not used after this point
del datasets_tot
# Keep this expression last: a notebook cell only displays the value of its final statement
merged_datasets.keys()

In [24]:
def process_dataset(df):
    """
    Recode columns and add the D90 / D180 flags to one merged dataset.

    The groupby-apply below passes `meta=df._meta`, so `df` is expected to be a
    Dask DataFrame (the caller builds it with dd.from_pandas). Columns are
    assigned on the passed-in object before it is rebound.

    Steps:
    - '3ZIP': first three characters of 'POSTAL' as a string; 'POSTAL' is dropped.
    - 'DDD': missing values become 0 and 'Y' becomes 1.
    - 'FIRST_F': 'N' becomes 0 and 'Y' becomes 1.
    - 'CLDS': 'RA' becomes 99, then the column is cast to int16.
    - 'D90' / 'D180': 0 everywhere, except 1 on the row of a loan (grouped by
      'LSN') dated 3 months before its earliest CLDS == 3 row, respectively
      6 months before its earliest CLDS == 7 row.

    Returns the processed frame with 'Date' and '3ZIP' as the leading columns.
    """
    # (CLDS value to look for, months to step back, flag column to set)
    delinquency_rules = [(3, 3, 'D90'), (7, 6, 'D180')]

    # Column-level recoding
    df['3ZIP'] = df['POSTAL'].astype(str).str[:3]
    df['DDD'] = df['DDD'].fillna(0).replace('Y', 1)
    df['FIRST_F'] = df['FIRST_F'].replace({'N': 0, 'Y': 1})
    # 'RA' is mapped to 99 so the whole column can be stored as int16
    df['CLDS'] = df['CLDS'].replace('RA', 99).astype('int16')

    # Flags start at 0 and are switched on per loan further down
    for flag_col in ('D90', 'D180'):
        df[flag_col] = 0

    df = df.drop(['POSTAL'], axis=1)

    def flag_loan(loan_rows):
        """Set the flags on the rows of a single loan."""
        for status, months_back, target in delinquency_rules:
            reached = loan_rows['CLDS'] == status
            if not reached.any():
                continue
            first_hit = loan_rows.loc[reached, 'Date'].min()
            flag_date = first_hit - pd.DateOffset(months=months_back)
            loan_rows.loc[loan_rows['Date'] == flag_date, target] = 1
        return loan_rows

    df = df.groupby('LSN').apply(flag_loan, meta=df._meta)
    # Discard the index left behind by the grouped apply
    df = df.reset_index(drop=True)

    # Move Date and 3ZIP to the front, keeping the other columns in order
    leading = ["Date", "3ZIP"]
    df = df[leading + [col for col in df.columns if col not in leading]]

    return df

# Run process_dataset on every merged dataset through Dask, replacing each
# pandas frame in the dictionary with its processed (computed) version
with ProgressBar():
    for key in merged_datasets:
        print(f"{key} processing...")
        ddf = dd.from_pandas(merged_datasets[key], npartitions=6)
        merged_datasets[key] = process_dataset(ddf).compute()

[########################################] | 100% Completed | 34.74 s
fm_1999 added and dropped
[########################################] | 100% Completed | 33.09 s
fm_2000 added and dropped
[########################################] | 100% Completed | 33.50 s
fm_2001 added and dropped
[########################################] | 100% Completed | 34.57 s
fm_2002 added and dropped
[########################################] | 100% Completed | 39.04 s
fm_2003 added and dropped
[########################################] | 100% Completed | 39.77 s
fm_2004 added and dropped
[########################################] | 100% Completed | 41.34 s
fm_2005 added and dropped
[########################################] | 100% Completed | 41.33 s
fm_2006 added and dropped
[########################################] | 100% Completed | 40.78 s
fm_2007 added and dropped
[########################################] | 100% Completed | 36.98 s
fm_2008 added and dropped
[#######################################

Unnamed: 0,Date,3ZIP,LSN,CLDS,CIR,ELTV,DDD,CS,FPD,FIRST_F,MD,CLTV,DTI,LTV,OIR,P_TYPE,OLT,D90,D180
0,2022-02-01,125,F22Q10000012,0,2.625,57,0,768,202203,0,203702,57,28,57,2.625,SF,180,0,0
1,2022-03-01,125,F22Q10000012,0,2.625,48,0,768,202203,0,203702,57,28,57,2.625,SF,180,0,0
2,2022-04-01,125,F22Q10000012,0,2.625,52,0,768,202203,0,203702,57,28,57,2.625,SF,180,0,0
3,2022-05-01,125,F22Q10000012,0,2.625,40,0,768,202203,0,203702,57,28,57,2.625,SF,180,0,0
4,2022-06-01,125,F22Q10000012,0,2.625,39,0,768,202203,0,203702,57,28,57,2.625,SF,180,0,0


In [25]:
# Sanity check: count how many rows of the 2005 dataset carry each D90 flag value
merged_datasets['fm_2005']['D90'].value_counts()

D90
0    3835398
1       5487
Name: count, dtype: int64

In [None]:
# Save the dictionary of processed datasets to a pickle file.
# The path uses the same "../../Data/mortgage_data" directory the raw files were
# read from; the previous "../Data/..." prefix pointed one level too shallow.
with open("../../Data/mortgage_data/fm_datasets.pickle", "wb") as f:
    pickle.dump(merged_datasets, f)

## Open SQL connection

In [None]:
# Open a connection to the SQLite database file that receives the merged tables
db_path = "../../Database/thesis_database.db"
conn = sqlite3.connect(db_path)

Store the mortgage datasets in the SQL database.

In [None]:
# Write each dataset to its own table, named after its dictionary key.
# An existing table with that name is replaced; the DataFrame index is not stored.
for key, dataset in merged_datasets.items():
    print("Writing", key, "to database...")
    dataset.to_sql(
        name=key,
        con=conn,
        if_exists="replace",
        index=False,
    )

In [None]:
# Close the SQLite connection once all tables have been written
conn.close()

## SQL connection closed