In [None]:
import pandas as pd
# Load the file-layout workbook. sheet_name=None makes read_excel return a
# dict of DataFrames keyed by sheet name (one entry per sheet).
layout = pd.read_excel("../Data/mortgage_data/file_layout.xlsx", sheet_name=None)

## Column name extraction from Freddie Mac documentation

In [None]:
# layout.keys() lists the sheet names available in the workbook.
# Select the layout sheets that describe the origination and monthly performance files.
orig_layout = layout['Origination Data File']
perf_layout = layout['Monthly Performance Data File']

# Column names and declared data types for each file, in sheet row order.
orig_column_names = orig_layout['ATTRIBUTE NAME'].tolist()
orig_data_types = orig_layout['DATA TYPE & FORMAT'].tolist()
perf_column_names = perf_layout['ATTRIBUTE NAME'].tolist()
perf_data_types = perf_layout['DATA TYPE & FORMAT'].tolist()

## Load data per year

In [None]:
def drop_cols_and_NAN(data, cols_to_drop):
    """
    Remove the requested columns and every column that contains only NaN values.

    Parameters:
    - data: The input DataFrame (not modified; a new DataFrame is returned).
    - cols_to_drop: Column labels to remove.

    Returns:
    - DataFrame without the requested columns and without all-NaN columns.

    Raises:
    - KeyError: If a label in cols_to_drop is not a column of `data`.
    """
    # Drop the requested columns first. If the all-NaN columns were removed
    # first, asking to drop a column that happens to be entirely NaN would
    # raise a KeyError because that column would no longer exist.
    data = data.drop(cols_to_drop, axis=1)
    # Then drop all columns that only have NaN values
    data = data.dropna(axis=1, how='all')
    return data

In [None]:
def drop_no_risk_loans(data, loans_to_drop):
    """
    Return `data` without the rows whose index labels are listed in `loans_to_drop`.

    Parameters:
    - data: The input DataFrame.
    - loans_to_drop: Row index labels to remove.

    Returns:
    - A new DataFrame with those rows removed.
    """
    return data.drop(index=loans_to_drop)

In [None]:
def load_yearly_data(year, base_dir="../Data/mortgage_data"):
    """
    Read one year's sample origination and performance files and tidy their columns.

    Parameters:
    - year: The year whose "sample_<year>" folder should be read.
    - base_dir: The directory that contains the "sample_<year>" folders.

    Returns:
    - orig_data: Origination DataFrame for the given year.
    - perf_data: Performance DataFrame for the given year.
    """
    # Both files live in the same "sample_YYYY" folder
    orig_file_path = f"{base_dir}/sample_{year}/sample_orig_{year}.txt"
    perf_file_path = f"{base_dir}/sample_{year}/sample_svcg_{year}.txt"

    # The files are pipe-delimited without a header row; the column names are
    # supplied from the module-level orig_column_names / perf_column_names lists.
    read_options = dict(sep="|", header=None, low_memory=False)
    raw_orig = pd.read_csv(orig_file_path, names=orig_column_names, **read_options)
    raw_perf = pd.read_csv(perf_file_path, names=perf_column_names, **read_options)

    # Extra columns to remove from each file (none selected at the moment)
    orig_cols_drop = []
    perf_cols_drop = []

    # The helper removes the listed columns as well as columns that are entirely NaN
    return (
        drop_cols_and_NAN(raw_orig, orig_cols_drop),
        drop_cols_and_NAN(raw_perf, perf_cols_drop),
    )


## Load all data at once into a dictionary

In [None]:
def load_all_datasets(start_year=1999, end_year=2022, base_dir="../Data/mortgage_data/"):
    """
    Load the origination and performance datasets for every year in a range.

    Parameters:
    - start_year: First year to load (inclusive).
    - end_year: Last year to load (inclusive).
    - base_dir: The base directory where the datasets are stored.

    Returns:
    - datasets: Dictionary with one "orig_<year>" and one "perf_<year>" entry per loaded year.
    """
    datasets = {}
    for year in range(start_year, end_year + 1):
        orig_data, perf_data = load_yearly_data(year, base_dir=base_dir)
        # Origination entry is inserted before the performance entry for each year
        datasets.update({f"orig_{year}": orig_data, f"perf_{year}": perf_data})
    return datasets

# Load every sample year from 1999 through 2022 into one dictionary.
# Narrow start_year / end_year (e.g. both 2022) to load fewer years.
datasets_demo = load_all_datasets(start_year=1999, end_year=2022)
datasets_demo.keys()  # Display the keys of the dictionary

dict_keys(['orig_1999', 'perf_1999', 'orig_2000', 'perf_2000', 'orig_2001', 'perf_2001', 'orig_2002', 'perf_2002', 'orig_2003', 'perf_2003', 'orig_2004', 'perf_2004', 'orig_2005', 'perf_2005', 'orig_2006', 'perf_2006', 'orig_2007', 'perf_2007', 'orig_2008', 'perf_2008', 'orig_2009', 'perf_2009', 'orig_2010', 'perf_2010', 'orig_2011', 'perf_2011', 'orig_2012', 'perf_2012', 'orig_2013', 'perf_2013', 'orig_2014', 'perf_2014', 'orig_2015', 'perf_2015', 'orig_2016', 'perf_2016', 'orig_2017', 'perf_2017', 'orig_2018', 'perf_2018', 'orig_2019', 'perf_2019', 'orig_2020', 'perf_2020', 'orig_2021', 'perf_2021', 'orig_2022', 'perf_2022'])

In [None]:
def merge_orig_with_perf(orig_data, perf_data):
    """
    Left-join the origination columns onto the performance rows by loan.

    Parameters:
    - orig_data: Origination DataFrame containing a "Loan Sequence Number" column.
    - perf_data: Performance DataFrame containing "Loan Sequence Number" and
      "Monthly Reporting Period" columns.

    Returns:
    - DataFrame with one row per performance row, whose first two columns are
      "Monthly Reporting Period" and "Loan Sequence Number".
    """
    joined = pd.merge(perf_data, orig_data, on="Loan Sequence Number", how="left")
    # Put the reporting period and the loan id first; keep the other columns in their existing order
    leading = ["Monthly Reporting Period", "Loan Sequence Number"]
    remaining = [col for col in joined.columns if col not in leading]
    return joined[leading + remaining]

In [None]:
def merge_all_datasets(datasets):
    """
    Merge the origination and performance datasets of each year found in the dictionary.

    Parameters:
    - datasets: Dictionary whose keys end in "_<year>", holding "orig_<year>" and
      "perf_<year>" datasets.

    Returns:
    - merged_datasets: Dictionary with one "fm_<year>" entry for every year that
      has both an origination and a performance dataset.
    """
    # The year is the part of each key after the last underscore
    years = sorted({int(name.split("_")[-1]) for name in datasets})

    merged_datasets = {}
    for year in years:
        orig_key, perf_key = f"orig_{year}", f"perf_{year}"
        # Skip years for which one of the two datasets is missing
        if orig_key not in datasets or perf_key not in datasets:
            continue
        merged_datasets[f"fm_{year}"] = merge_orig_with_perf(datasets[orig_key], datasets[perf_key])
        print("merged", year)
    return merged_datasets

# Merge each year's origination and performance frames from datasets_demo;
# the result is a dictionary keyed "fm_<year>".
merged_datasets_demo = merge_all_datasets(datasets_demo)
merged_datasets_demo.keys()  # Display the keys of the merged datasets dictionary


[1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
1999
merged 1999
2000
merged 2000
2001
merged 2001
2002
merged 2002
2003
merged 2003
2004
merged 2004
2005
merged 2005
2006
merged 2006
2007
merged 2007
2008
merged 2008
2009
merged 2009
2010
merged 2010
2011
merged 2011
2012
merged 2012
2013
merged 2013
2014
merged 2014
2015
merged 2015
2016
merged 2016
2017
merged 2017
2018
merged 2018
2019
merged 2019
2020
merged 2020
2021
merged 2021
2022
merged 2022


dict_keys(['fm_1999', 'fm_2000', 'fm_2001', 'fm_2002', 'fm_2003', 'fm_2004', 'fm_2005', 'fm_2006', 'fm_2007', 'fm_2008', 'fm_2009', 'fm_2010', 'fm_2011', 'fm_2012', 'fm_2013', 'fm_2014', 'fm_2015', 'fm_2016', 'fm_2017', 'fm_2018', 'fm_2019', 'fm_2020', 'fm_2021', 'fm_2022'])

## Function to merge macro data with performance data

In [None]:
def merge_macro_data(perf_data, macro_data, perf_date_col, macro_date_col,
                     perf_date_format=None, macro_date_format=None):
    """
    Merge macroeconomic data with the performance dataset based on the calendar month.

    Both date columns are converted to monthly periods and the macro rows are
    left-joined onto the performance rows. The input DataFrames are not modified.

    Parameters:
    - perf_data: The performance dataset.
    - macro_data: The macroeconomic dataset.
    - perf_date_col: The date column name in the performance dataset.
    - macro_date_col: The date column name in the macroeconomic dataset.
    - perf_date_format: Optional strptime format (e.g. "%Y%m") for perf_date_col.
      When given, the values are cast to str and parsed with this format.
      When None, pd.to_datetime infers the format as before.
    - macro_date_format: Same as perf_date_format, for macro_date_col.

    Returns:
    - Merged dataset with one row per matching (performance row, macro row) pair;
      performance rows without a match are kept with missing macro values.
    """
    def _to_month(values, date_format):
        # Convert a column to monthly pandas Periods
        if date_format is not None:
            # Cast to str so numeric codes such as 202201 are parsed with the
            # explicit format; pd.to_datetime would otherwise read integers as
            # offsets from the Unix epoch.
            values = values.astype(str)
        return pd.to_datetime(values, format=date_format).dt.to_period('M')

    # Work on copies: overwriting the callers' columns would make a second call
    # on the same frames operate on already-converted Period data.
    perf_keyed = perf_data.copy()
    perf_keyed[perf_date_col] = _to_month(perf_data[perf_date_col], perf_date_format)
    macro_keyed = macro_data.copy()
    macro_keyed[macro_date_col] = _to_month(macro_data[macro_date_col], macro_date_format)

    # Merge datasets based on the monthly period
    merged_data = pd.merge(perf_keyed, macro_keyed, left_on=perf_date_col, right_on=macro_date_col, how='left')
    return merged_data

# Sample usage of the functions can be provided if datasets are available.
# For now, these functions are generic and can be adapted to actual data.

In [None]:
from datetime import datetime
# Load enso_mei_long.csv
enso_mei_long = pd.read_csv("../Data/enso_mei_long.csv")
# Transform month abbreviations to numbers: Dec -> 12, Jan -> 1, Feb -> 2, etc.
# NOTE(review): "%b" parsing is locale-dependent; this assumes English month abbreviations.
# NOTE(review): a later cell reloads this file and converts it with transform_date_column,
# so this cell looks redundant — confirm whether it can be removed.
enso_mei_long['Month'] = enso_mei_long['Month'].apply(lambda x: datetime.strptime(x, "%b").month)
enso_mei_long.head()

Unnamed: 0,Year,Month,MEI
0,1979,12,0.47
1,1979,1,0.27
2,1979,2,-0.04
3,1979,3,0.26
4,1979,4,0.35


In [None]:
from datetime import datetime

def transform_date_column(df, column_month_name, column_year_name):
    """
    Convert a month column and a year column holding mixed date formats to
    numeric month / year values.

    Recognised month inputs: abbreviated month names ("Dec"), "YYYY-MM-DD" and
    "YYYYMM" strings. Recognised year inputs: "YYYY-MM-DD" and "YYYYMM" strings.
    Values that match none of these formats (including non-string values) are
    left unchanged.

    Parameters:
    - df: The input DataFrame. It is modified in place and also returned.
    - column_month_name: The name of the column to convert to month numbers.
    - column_year_name: The name of the column to convert to years.

    Returns:
    - df: The same DataFrame with both columns transformed.
    """
    def parse_part(value, formats, part):
        # Return the `part` attribute ("month" or "year") of the first format
        # that parses `value`; fall back to the unchanged value.
        for fmt in formats:
            try:
                return getattr(datetime.strptime(value, fmt), part)
            except (ValueError, TypeError):
                # ValueError: text does not match this format.
                # TypeError: value is not a string (e.g. an int or NaN).
                continue
        return value

    def extract_year(value):
        # Values whose text form has four characters (e.g. 2022 or "2022") are
        # treated as already being a year and passed through as-is.
        if len(str(value)) == 4:
            return value
        return parse_part(value, ("%Y-%m-%d", "%Y%m"), "year")

    def extract_month(value):
        # "%b" handles month names like "Dec", "Jan" (locale-dependent).
        return parse_part(value, ("%b", "%Y-%m-%d", "%Y%m"), "month")

    df[column_month_name] = df[column_month_name].apply(extract_month)
    df[column_year_name] = df[column_year_name].apply(extract_year)
    return df
# Load enso_mei_long.csv
enso_mei_long = pd.read_csv("../Data/enso_mei_long.csv")
# Convert Month to numbers (Dec -> 12, Jan -> 1, Feb -> 2, etc.) and pass Year through transform_date_column
enso_mei_long = transform_date_column(enso_mei_long, "Month", "Year")
enso_mei_long.head()




Unnamed: 0,Year,Month,MEI
0,1979,12,0.47
1,1979,1,0.27
2,1979,2,-0.04
3,1979,3,0.26
4,1979,4,0.35


In [None]:
# Pull the merged 2022 frame out of the dictionary and preview its first rows
fm_2022 = merged_datasets_demo['fm_2022']
fm_2022.head()

NameError: name 'fm_2022' is not defined

In [None]:
# Merge the ENSO MEI series onto the 2022 loan data by calendar month (year AND month).
# merge_macro_data parses both key columns with pd.to_datetime, so the keys are turned
# into real dates first: a bare month number (1-12) or a numeric YYYYMM code would be
# read as nanoseconds since 1970, collapsing every row onto the period 1970-01.
# NOTE(review): assumes "Monthly Reporting Period" holds YYYYMM values — confirm against the file layout.
fm_2022_dated = merged_datasets_demo["fm_2022"].copy()
fm_2022_dated["Monthly Reporting Period"] = pd.to_datetime(
    fm_2022_dated["Monthly Reporting Period"].astype(str), format="%Y%m"
)
# Combine the numeric Year and Month columns into a first-of-month date key
enso_mei_dated = enso_mei_long.copy()
enso_mei_dated["Date"] = pd.to_datetime(
    pd.DataFrame({"year": enso_mei_dated["Year"], "month": enso_mei_dated["Month"], "day": 1})
)
merged_2022 = merge_macro_data(fm_2022_dated, enso_mei_dated, "Monthly Reporting Period", "Date")
merged_2022.head()

## Function to merge zip data with performance data

In [None]:
def merge_zip_data(perf_data, zip_data, perf_zip_col, zip_data_col,
                   perf_zip_width=None, zip_data_width=None):
    """
    Merge external data with the performance dataset based on the 3zip code.

    A '3zip' key (the first three characters of the zip value's text form) is
    derived for both datasets and the external rows are left-joined onto the
    performance rows. The input DataFrames are not modified.

    Parameters:
    - perf_data: The performance dataset.
    - zip_data: The external dataset with 3zip level information.
    - perf_zip_col: The zip column name in the performance dataset (might be 5-digit zip).
    - zip_data_col: The 3zip column name in the external dataset.
    - perf_zip_width: Optional full width of the zip codes in perf_zip_col
      (e.g. 5). When given, all-digit values are left-padded with zeros to this
      width before the first three characters are taken, which restores leading
      zeros lost when zips were read as numbers (2100 -> "02100" -> "021").
      When None, values are used as they are.
    - zip_data_width: Same as perf_zip_width, for zip_data_col (e.g. 3).

    Returns:
    - Merged dataset, including the derived '3zip' column.
    """
    def _zip3(values, width):
        # Text form of the zip values, optionally zero-padded, cut to three characters
        text = values.astype(str)
        if width is not None:
            # Floats render as "2100.0"; drop the trailing ".0" before padding
            text = text.str.replace(r"\.0$", "", regex=True)
            # Only pad purely numeric text so placeholders such as "nan" stay untouched
            is_digits = text.str.fullmatch(r"\d+")
            text = text.mask(is_digits, text.str.zfill(width))
        return text.str[:3]

    # Work on copies so the callers' frames do not gain a '3zip' column as a side effect
    perf_keyed = perf_data.copy()
    perf_keyed['3zip'] = _zip3(perf_data[perf_zip_col], perf_zip_width)
    zip_keyed = zip_data.copy()
    zip_keyed['3zip'] = _zip3(zip_data[zip_data_col], zip_data_width)

    # Merge datasets based on the 3zip code
    merged_data = pd.merge(perf_keyed, zip_keyed, left_on='3zip', right_on='3zip', how='left')

    return merged_data