# Load data

In [None]:
# Attach Google Drive to the Colab filesystem so that files stored on Drive
# can be read and written through ordinary paths below the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the pickled mortgage dataset from the mounted Drive into the pandas
# DataFrame `df`, which every later cell works on.
# NOTE(review): unpickling can execute code embedded in the file, so this
# should only ever be pointed at a pickle from a trusted source.
import pandas as pd

balanced_data_path = "/content/drive/MyDrive/liveproject/mortgage_data_balanced.pkl.gz"
df = pd.read_pickle(balanced_data_path)

# Remove redundant columns

In [None]:
 keep_vars = ['agency_abbr', 'loan_type_name', 'loan_amount_000s', 'owner_occupancy_name', 'loan_purpose_name', 'property_type_name', 'applicant_ethnicity_name', 
              'applicant_race_name_1', 'applicant_sex_name', 'applicant_income_000s', 'population', 'minority_population', 'hud_median_family_income', 
              'tract_to_msamd_income', 'number_of_owner_occupied_units', 'number_of_1_to_4_family_units', 'action_taken_name']
      
df = df[keep_vars].copy()

# One-hot encode the categorical columns

In [None]:
# One-hot encode each categorical column of `df` into the frame `df_cat`,
# which shares df's index and holds one indicator column per category.
from sklearn.preprocessing import OneHotEncoder

# categorical variables
cat_variables = [
    'applicant_ethnicity_name',
    'applicant_race_name_1',
    'applicant_sex_name',
    'agency_abbr',
    'owner_occupancy_name',
    'property_type_name',
    'loan_purpose_name',
    'loan_type_name',
]

# initialise an empty dataframe with the same index (and so the same number
# of rows) as df, so encoded columns can be attached row-for-row
df_cat = pd.DataFrame(index=df.index)

# A per-column loop is used for pedagogical reasons; it is not strictly
# necessary.
for cat in cat_variables:
    # Learn the categories of this column and encode it in a single pass.
    # The encoder output is converted to a dense array so that individual
    # indicator columns can be sliced out below.
    one_hot_func = OneHotEncoder()
    cat_mapped = one_hot_func.fit_transform(df[[cat]]).toarray()

    # Store one column per category, named "<variable>_<category>".
    # An f-string is used instead of `+` so that a non-string category label
    # (e.g. NaN for a missing value, or a numeric code) is formatted rather
    # than raising TypeError.
    for k, cat_label in enumerate(one_hot_func.categories_[0]):
        df_cat[f"{cat}_{cat_label}"] = cat_mapped[:, k]

# Check correct number of one hot encoded columns (and that no rows were
# gained or lost) for this dataset.
assert df_cat.shape == (165950, 27), (
    f"unexpected one-hot encoded shape {df_cat.shape}, expected (165950, 27)"
)

# Consolidate a final dataset

In [None]:
# Numeric columns that are carried over from `df` as they are
int_variables = [
    'loan_amount_000s',
    'applicant_income_000s',
    'population',
    'minority_population',
    'hud_median_family_income',
    'tract_to_msamd_income',
    'number_of_owner_occupied_units',
    'number_of_1_to_4_family_units',
]

# Column holding the outcome of each application (the prediction target)
output_variable = ['action_taken_name']

# Boolean target frame: True where the recorded outcome equals the denial
# label, False for every other outcome. Because a one-column frame is
# compared, the column keeps the name 'action_taken_name'.
denied_flag = (df[output_variable] == "Application denied by financial institution").copy()

# Final layout, left to right: numeric columns, one-hot encoded columns,
# then the boolean target.
df_final = pd.concat([df[int_variables], df_cat, denied_flag], axis=1)

# Sanity check on the expected size of the consolidated dataset.
assert df_final.shape == (165950, 36)

# Save the dataset as a pickled dataframe

In [None]:
# Write the preprocessed frame back to the project folder on Drive.
# The path uses the same space-free "MyDrive" spelling of the Drive root as
# the cell that loads the input data, so both cells address the project
# folder through one consistent path.
df_final.to_pickle("/content/drive/MyDrive/liveproject/mortgage_data_preprocessed.pkl.gz")