# Part 2: Feature Preprocessing

In [1]:
import numpy as np
import pandas as pd

In [2]:
# load clean data from part 1
df_cr = 'data/Clean_Mortgage_Lending_FL_PortSL.csv'
df_cr = pd.read_csv(df_cr, sep = ',')

In [3]:
df_cr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10606 entries, 0 to 10605
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ethnicity              10606 non-null  object
 1   race                   10606 non-null  object
 2   gender                 10606 non-null  object
 3   action_taken           10606 non-null  int64 
 4   preapproval_requested  10606 non-null  object
 5   loan_type              10606 non-null  object
 6   loan_purpose           10606 non-null  object
 7   interest_only_payment  10606 non-null  object
 8   balloon_payment        10606 non-null  object
 9   debt_to_income_ratio   10606 non-null  object
 10  age                    10606 non-null  object
 11  income                 10606 non-null  object
 12  loan_to_value_ratio    10606 non-null  object
dtypes: int64(1), object(12)
memory usage: 1.1+ MB


In [4]:
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

### Split Data

In [5]:
to_drop = ['action_taken']
X = df_cr.drop(to_drop, axis = 1)
# target variable
y = df_cr['action_taken']

In [6]:
# Splite data into training and testing
# 100 -> 75:y=1, 25:y=0
# training(80): 60 y=1; 20 y=0 
# testing(20):  15 y=1; 5 y=0

# Reserve 25% for testing
# stratify example:
# 100 -> y: 80 '0', 20 '1' -> 4:1
# 80% training 64: '0', 16:'1' -> 4:1
# 20% testing  16:'0', 4: '1' -> 4:1
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, stratify = y, random_state=1) #stratified sampling

print('training data has ' + str(X_train.shape[0]) + ' observation with ' + str(X_train.shape[1]) + ' features')
print('test data has ' + str(X_test.shape[0]) + ' observation with ' + str(X_test.shape[1]) + ' features')

training data has 7954 observation with 12 features
test data has 2652 observation with 12 features


In [7]:
X_train.head()

Unnamed: 0,ethnicity,race,gender,preapproval_requested,loan_type,loan_purpose,interest_only_payment,balloon_payment,debt_to_income_ratio,age,income,loan_to_value_ratio
4101,Not Hispanic or Latino,White,Male,No Preapproval Requested,Converntional,Home Purchase,No Interest-only Payments,No Balloon Payment,20%-<30%,Young,Upper-Middle Class,< Conventional Bar
4293,Not Hispanic or Latino,White,Male,No Preapproval Requested,Converntional,Home Purchase,No Interest-only Payments,No Balloon Payment,36%-<50%,Young,Upper-Middle Class,> Conventional Bar and < FHA Bar
3444,Not Hispanic or Latino,White,Male,No Preapproval Requested,Converntional,Home Purchase,No Interest-only Payments,No Balloon Payment,36%-<50%,Middle-Aged,Middle Class,> Conventional Bar and < FHA Bar
7644,Not Hispanic or Latino,Black or African American,Male,No Preapproval Requested,Converntional,Refinancing,No Interest-only Payments,Balloon Payment,50%-60%,Older,Lower-Middle Class,< Conventional Bar
1956,Not Hispanic or Latino,White,Male,No Preapproval Requested,Converntional,Home Improvement,No Interest-only Payments,No Balloon Payment,36%-<50%,Middle-Aged,Middle Class,< Conventional Bar


In [8]:
X.dtypes

ethnicity                object
race                     object
gender                   object
preapproval_requested    object
loan_type                object
loan_purpose             object
interest_only_payment    object
balloon_payment          object
debt_to_income_ratio     object
age                      object
income                   object
loan_to_value_ratio      object
dtype: object

In [9]:
X_train.to_csv('data/X_train.csv', index = False)
y_train.to_csv('data/y_train.csv', index = False)

X_test.to_csv('data/X_test.csv', index = False)
y_test.to_csv('data/y_test.csv', index = False)

### One Hot, Oridinary Encoding

In [10]:
# One hot encoding (for categorical features with more than two categories itself (i.e. not 0 and 1))
def OneHotEncoding(df, enc, categories):  
  transformed = pd.DataFrame(enc.transform(df[categories]).toarray(), columns=enc.get_feature_names(categories))
  return pd.concat([df.reset_index(drop=True), transformed], axis=1).drop(categories, axis=1)

categories = ['race', 'loan_type', 'loan_purpose', 'debt_to_income_ratio', 'age', 'income', 'loan_to_value_ratio']
enc_ohe = OneHotEncoder()
enc_ohe.fit(X_train[categories])

X_train = OneHotEncoding(X_train, enc_ohe, categories)
X_test = OneHotEncoding(X_test, enc_ohe, categories)



In [11]:
X_train.head()

Unnamed: 0,ethnicity,gender,preapproval_requested,interest_only_payment,balloon_payment,race_Asian,race_Black or African American,race_Minority Races,race_White,loan_type_Converntional,...,age_Young,income_Lower-Middle Class,income_Middle Class,income_Poor and Near Poor,income_Rich,income_Upper-Middle Class,loan_to_value_ratio_< Conventional Bar,loan_to_value_ratio_> Conventional Bar and < FHA Bar,loan_to_value_ratio_> FHA Bar and < VA Bar,loan_to_value_ratio_Not Eligible
0,Not Hispanic or Latino,Male,No Preapproval Requested,No Interest-only Payments,No Balloon Payment,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,Not Hispanic or Latino,Male,No Preapproval Requested,No Interest-only Payments,No Balloon Payment,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,Not Hispanic or Latino,Male,No Preapproval Requested,No Interest-only Payments,No Balloon Payment,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,Not Hispanic or Latino,Male,No Preapproval Requested,No Interest-only Payments,Balloon Payment,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,Not Hispanic or Latino,Male,No Preapproval Requested,No Interest-only Payments,No Balloon Payment,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# Ordinal Encoding (for categorical variables only have values 0 and 1)
categories = ['ethnicity', 'gender', 'preapproval_requested', 'interest_only_payment', 'balloon_payment']
enc_oe = OrdinalEncoder()
enc_oe.fit(X_train[categories])

X_train[categories] = enc_oe.transform(X_train[categories])
X_test[categories] = enc_oe.transform(X_test[categories])

In [13]:
X_train.head()

Unnamed: 0,ethnicity,gender,preapproval_requested,interest_only_payment,balloon_payment,race_Asian,race_Black or African American,race_Minority Races,race_White,loan_type_Converntional,...,age_Young,income_Lower-Middle Class,income_Middle Class,income_Poor and Near Poor,income_Rich,income_Upper-Middle Class,loan_to_value_ratio_< Conventional Bar,loan_to_value_ratio_> Conventional Bar and < FHA Bar,loan_to_value_ratio_> FHA Bar and < VA Bar,loan_to_value_ratio_Not Eligible
0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7954 entries, 0 to 7953
Data columns (total 34 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   ethnicity                                             7954 non-null   float64
 1   gender                                                7954 non-null   float64
 2   preapproval_requested                                 7954 non-null   float64
 3   interest_only_payment                                 7954 non-null   float64
 4   balloon_payment                                       7954 non-null   float64
 5   race_Asian                                            7954 non-null   float64
 6   race_Black or African American                        7954 non-null   float64
 7   race_Minority Races                                   7954 non-null   float64
 8   race_White                                            7954

### Exploring Improtant Features

In [15]:
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

#### Logistic Regression Beta

In [16]:
# FIXED cov = 1 - removed 1 category from each columns that have more than 2 categories
# now P values look good

# used log_loan_to_value_ratio and dropped loan_to_value_ratio, income columns
# used debt, dropped median_debt_to_income_ratio, debt_to_income_ratio

category_log_reg = ['ethnicity',
 'gender',
 # 'preapproval_requested',
 'interest_only_payment',
 'balloon_payment',
 'race_Asian',
 'race_Black or African American',
 'race_Minority Races',
 'loan_type_FHA-insured',
 'loan_type_VA-guaranteed',
 'loan_purpose_Home Purchase',
 'loan_purpose_Other Purpose',
 'loan_purpose_Refinancing',
 'debt_to_income_ratio_30%-<36%',
 'debt_to_income_ratio_36%-<50%',
 'debt_to_income_ratio_50%-60%',
 'debt_to_income_ratio_<20%',
 'debt_to_income_ratio_>60%',
 'age_Older',
 'age_Young',
 'income_Lower-Middle Class',
 'income_Middle Class',
 'income_Poor and Near Poor',
 'income_Upper-Middle Class',
 'loan_to_value_ratio_> Conventional Bar and < FHA Bar',
 'loan_to_value_ratio_> FHA Bar and < VA Bar',
 'loan_to_value_ratio_Not Eligible']
logic = sm.Logit(list(y_train), X_train[category_log_reg])
result = logic.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.409252
         Iterations 7
                                           Results: Logit
Model:                           Logit                        Pseudo R-squared:             0.301    
Dependent Variable:              y                            AIC:                          6562.3875
Date:                            2022-10-02 16:20             BIC:                          6743.9047
No. Observations:                7954                         Log-Likelihood:               -3255.2  
Df Model:                        25                           LL-Null:                      -4656.5  
Df Residuals:                    7928                         LLR p-value:                  0.0000   
Converged:                       1.0000                       Scale:                        1.0000   
No. Iterations:                  7.0000                                                              
------------------------

# Part 3: Initial Model Training and Initial Results

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [18]:
# build models
# Logistic Regression
classifier_logistic = LogisticRegression()

# Random Forest
classifier_RF = RandomForestClassifier()

In [19]:
# helper function for printing out grid search results 
def print_grid_search_metrics(gs):
    print ("Best score: " + str(gs.best_score_))
    print ("Best parameters set:")
    best_parameters = gs.best_params_
    for param_name in sorted(best_parameters.keys()):
        print(param_name + ':' + str(best_parameters[param_name]))

### Logistic Regression

In [20]:
# Train the model
classifier_logistic.fit(X_train, y_train)

LogisticRegression()

In [21]:
# Prediction of test data
classifier_logistic.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [22]:
# Accuracy of test data
classifier_logistic.score(X_test, y_test)

0.8257918552036199

### Random Forest

In [23]:
# Possible hyperparamter options for Random Forest
# Choose the number of trees
parameters = {
    'n_estimators' : [60,80,100],
    'max_depth': [1,5,10]
}
Grid_RF = GridSearchCV(RandomForestClassifier(),parameters, cv=5)
Grid_RF.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [1, 5, 10],
                         'n_estimators': [60, 80, 100]})

In [24]:
# best number of tress
print_grid_search_metrics(Grid_RF)

Best score: 0.8361849080322095
Best parameters set:
max_depth:10
n_estimators:100


In [25]:
# best random forest
best_RF_model = Grid_RF.best_estimator_

In [26]:
best_RF_model

RandomForestClassifier(max_depth=10)