# Part 2: Feature Preprocessing

In [2]:
import numpy as np
import pandas as pd

In [6]:
# load clean data from part 1
df_cr = 'data/Clean_Mortgage_Lending_IL_Chicago.csv'
df_cr = pd.read_csv(df_cr, sep = ',')

In [7]:
df_cr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110882 entries, 0 to 110881
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   ethnicity              110882 non-null  object
 1   race                   110882 non-null  object
 2   gender                 110882 non-null  object
 3   action_taken           110882 non-null  int64 
 4   preapproval_requested  110882 non-null  object
 5   loan_type              110882 non-null  object
 6   loan_purpose           110882 non-null  object
 7   interest_only_payment  110882 non-null  object
 8   balloon_payment        110882 non-null  object
 9   debt_to_income_ratio   110882 non-null  object
 10  age                    110882 non-null  object
 11  income                 110882 non-null  object
 12  loan_to_value_ratio    110882 non-null  object
dtypes: int64(1), object(12)
memory usage: 11.0+ MB


In [8]:
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

### Split Data

In [9]:
to_drop = ['action_taken']
X = df_cr.drop(to_drop, axis = 1)
# target variable
y = df_cr['action_taken']

In [10]:
# Splite data into training and testing
# 100 -> 75:y=1, 25:y=0
# training(80): 60 y=1; 20 y=0 
# testing(20):  15 y=1; 5 y=0

# Reserve 25% for testing
# stratify example:
# 100 -> y: 80 '0', 20 '1' -> 4:1
# 80% training 64: '0', 16:'1' -> 4:1
# 20% testing  16:'0', 4: '1' -> 4:1
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, stratify = y, random_state=1) #stratified sampling

print('training data has ' + str(X_train.shape[0]) + ' observation with ' + str(X_train.shape[1]) + ' features')
print('test data has ' + str(X_test.shape[0]) + ' observation with ' + str(X_test.shape[1]) + ' features')

training data has 83161 observation with 12 features
test data has 27721 observation with 12 features


In [11]:
X_train.head()

Unnamed: 0,ethnicity,race,gender,preapproval_requested,loan_type,loan_purpose,interest_only_payment,balloon_payment,debt_to_income_ratio,age,income,loan_to_value_ratio
63028,Not Hispanic or Latino,White,Male,No Preapproval Requested,Converntional,Refinancing,No Interest-only Payments,No Balloon Payment,20%-<30%,Middle-Aged,Middle Class,< Conventional Bar
46546,Not Hispanic or Latino,White,Male,No Preapproval Requested,Converntional,Refinancing,Interest-only Payments,No Balloon Payment,30%-<36%,Middle-Aged,Upper-Middle Class,> Conventional Bar and < FHA Bar
104683,Not Hispanic or Latino,Black or African American,Male,No Preapproval Requested,Converntional,Other Purpose,No Interest-only Payments,No Balloon Payment,>60%,Middle-Aged,Lower-Middle Class,< Conventional Bar
107250,Not Hispanic or Latino,White,Male,No Preapproval Requested,Converntional,Refinancing,No Interest-only Payments,No Balloon Payment,36%-<50%,Older,Upper-Middle Class,> Conventional Bar and < FHA Bar
32053,Not Hispanic or Latino,White,Male,No Preapproval Requested,Converntional,Home Purchase,No Interest-only Payments,No Balloon Payment,36%-<50%,Young,Upper-Middle Class,< Conventional Bar


In [12]:
X.dtypes

ethnicity                object
race                     object
gender                   object
preapproval_requested    object
loan_type                object
loan_purpose             object
interest_only_payment    object
balloon_payment          object
debt_to_income_ratio     object
age                      object
income                   object
loan_to_value_ratio      object
dtype: object

In [13]:
X_train.to_csv('data/X_train.csv', index = False)
y_train.to_csv('data/y_train.csv', index = False)

X_test.to_csv('data/X_test.csv', index = False)
y_test.to_csv('data/y_test.csv', index = False)

### One Hot, Oridinary Encoding

In [14]:
# One hot encoding (for categorical features with more than two categories itself (i.e. not 0 and 1))
def OneHotEncoding(df, enc, categories):  
  transformed = pd.DataFrame(enc.transform(df[categories]).toarray(), columns=enc.get_feature_names(categories))
  return pd.concat([df.reset_index(drop=True), transformed], axis=1).drop(categories, axis=1)

categories = ['race', 'loan_type', 'loan_purpose', 'debt_to_income_ratio', 'age', 'income', 'loan_to_value_ratio']
enc_ohe = OneHotEncoder()
enc_ohe.fit(X_train[categories])

X_train = OneHotEncoding(X_train, enc_ohe, categories)
X_test = OneHotEncoding(X_test, enc_ohe, categories)

In [15]:
X_train.head()

Unnamed: 0,ethnicity,gender,preapproval_requested,interest_only_payment,balloon_payment,race_Asian,race_Black or African American,race_Minority Races,race_White,loan_type_Converntional,...,age_Young,income_Lower-Middle Class,income_Middle Class,income_Poor and Near Poor,income_Rich,income_Upper-Middle Class,loan_to_value_ratio_< Conventional Bar,loan_to_value_ratio_> Conventional Bar and < FHA Bar,loan_to_value_ratio_> FHA Bar and < VA Bar,loan_to_value_ratio_Not Eligible
0,Not Hispanic or Latino,Male,No Preapproval Requested,No Interest-only Payments,No Balloon Payment,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,Not Hispanic or Latino,Male,No Preapproval Requested,Interest-only Payments,No Balloon Payment,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,Not Hispanic or Latino,Male,No Preapproval Requested,No Interest-only Payments,No Balloon Payment,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,Not Hispanic or Latino,Male,No Preapproval Requested,No Interest-only Payments,No Balloon Payment,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,Not Hispanic or Latino,Male,No Preapproval Requested,No Interest-only Payments,No Balloon Payment,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [16]:
# Ordinal Encoding (for categorical variables only have values 0 and 1)
categories = ['ethnicity', 'gender', 'preapproval_requested', 'interest_only_payment', 'balloon_payment']
enc_oe = OrdinalEncoder()
enc_oe.fit(X_train[categories])

X_train[categories] = enc_oe.transform(X_train[categories])
X_test[categories] = enc_oe.transform(X_test[categories])

In [17]:
X_train.head()

Unnamed: 0,ethnicity,gender,preapproval_requested,interest_only_payment,balloon_payment,race_Asian,race_Black or African American,race_Minority Races,race_White,loan_type_Converntional,...,age_Young,income_Lower-Middle Class,income_Middle Class,income_Poor and Near Poor,income_Rich,income_Upper-Middle Class,loan_to_value_ratio_< Conventional Bar,loan_to_value_ratio_> Conventional Bar and < FHA Bar,loan_to_value_ratio_> FHA Bar and < VA Bar,loan_to_value_ratio_Not Eligible
0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [18]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83161 entries, 0 to 83160
Data columns (total 34 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   ethnicity                                             83161 non-null  float64
 1   gender                                                83161 non-null  float64
 2   preapproval_requested                                 83161 non-null  float64
 3   interest_only_payment                                 83161 non-null  float64
 4   balloon_payment                                       83161 non-null  float64
 5   race_Asian                                            83161 non-null  float64
 6   race_Black or African American                        83161 non-null  float64
 7   race_Minority Races                                   83161 non-null  float64
 8   race_White                                            83

### Exploring Improtant Features

In [19]:
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

#### Logistic Regression Beta

In [68]:
# FIXED cov = 1 - removed 1 category from each columns that have more than 2 categories
# now P values look good

# used log_loan_to_value_ratio and dropped loan_to_value_ratio, income columns
# used debt, dropped median_debt_to_income_ratio, debt_to_income_ratio

category_log_reg = ["ethnicity", "gender", "preapproval_requested", "loan_type_FHA-insured", "loan_type_VA-guaranteed", 
                    "loan_purpose_Home Improvement", "loan_purpose_Home Purchase", "loan_purpose_Other Purpose", 
                     "log_loan_to_value_ratio", "interest_only_payment", "debt",
                   "balloon_payment", "derived_race_Asian", "derived_race_Black or African American", "derived_race_Minority Races", 
                    "applicant_age_25-34", "applicant_age_45-54", "applicant_age_55-64", "applicant_age_65-74", 
                    "applicant_age_<25", "applicant_age_>74"]
logic = sm.Logit(list(y_train), X_train)
result = logic.fit()
print(result.summary2())

         Current function value: 0.352057
         Iterations: 35
                                                  Results: Logit
Model:                               Logit                             Pseudo R-squared:                  0.359    
Dependent Variable:                  y                                 AIC:                               5943.1012
Date:                                2022-07-23 07:25                  BIC:                               6139.9785
No. Observations:                    8361                              Log-Likelihood:                    -2943.6  
Df Model:                            27                                LL-Null:                           -4595.3  
Df Residuals:                        8333                              LLR p-value:                       0.0000   
Converged:                           0.0000                            Scale:                             1.0000   
No. Iterations:                      35.0000             



# Part 3: Initial Model Training and Initial Results

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [70]:
# build models
# Logistic Regression
classifier_logistic = LogisticRegression()

# Random Forest
classifier_RF = RandomForestClassifier()

In [71]:
# helper function for printing out grid search results 
def print_grid_search_metrics(gs):
    print ("Best score: " + str(gs.best_score_))
    print ("Best parameters set:")
    best_parameters = gs.best_params_
    for param_name in sorted(best_parameters.keys()):
        print(param_name + ':' + str(best_parameters[param_name]))

### Logistic Regression

In [72]:
# Train the model
classifier_logistic.fit(X_train, y_train)

LogisticRegression()

In [73]:
# Prediction of test data
classifier_logistic.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [74]:
# Accuracy of test data
classifier_logistic.score(X_test, y_test)

0.8482238966630786

### Random Forest

In [75]:
# Possible hyperparamter options for Random Forest
# Choose the number of trees
parameters = {
    'n_estimators' : [60,80,100],
    'max_depth': [1,5,10]
}
Grid_RF = GridSearchCV(RandomForestClassifier(),parameters, cv=5)
Grid_RF.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [1, 5, 10],
                         'n_estimators': [60, 80, 100]})

In [76]:
# best number of tress
print_grid_search_metrics(Grid_RF)

Best score: 0.8557592869583619
Best parameters set:
max_depth:10
n_estimators:100


In [77]:
# best random forest
best_RF_model = Grid_RF.best_estimator_

In [78]:
best_RF_model

RandomForestClassifier(max_depth=10)