# <div align = 'center'>`Model Building`</div>

###### Models to be used in this problem

* LogisticRegression
* MulltinomialNB
* DecisionTreeClassifier
* ExtraTreesClassifier
* RandomForestClassifier
* AdaBoostClassifier
* GradientBoostingClassifier
* XGBClassifier
* CatBoostClassifier

In [21]:
import warnings
warnings.simplefilter('ignore')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

###### Creating a base model without any imputations or treatment. Only null values have been treated

In [29]:
df = pd.read_csv('Data/capstone_train.csv')
df.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,10000001,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,,9,14,Female,1
1,10000002,30000,4 years,Mortgage,,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
2,10000003,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,,12,16,Male,3
3,10000004,16000,< 1 year,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,,16,22,Male,3
4,10000005,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,,19,30,Female,1


In [30]:
#Basic immputations
df = df.drop('Loan_ID',axis  =1)
import string
def remove_punctuation(x):
    if len(x.split(','))>1:
        return int(x.split(',')[0]+x.split(',')[1])
    else:
        return int(x)
df['Loan_Amount_Requested'] = df['Loan_Amount_Requested'].apply(remove_punctuation)

In [33]:
#Filling Length Employed with mode since only 0.04% of values are missing
df['Length_Employed'] = df['Length_Employed'].fillna(df['Length_Employed'].mode().values[0])

In [36]:
#Filling home owner with max type of home for each employment level.
mapper = dict(df.groupby(['Length_Employed','Home_Owner'])['Home_Owner'].count().sort_values().groupby(level = 0).tail(1).index)
mapper
import math
for i in df[df['Home_Owner'].isna()].index:
    if math.isnan(df['Home_Owner'][i]) == True:
        df['Home_Owner'][i] = mapper[df['Length_Employed'][i]]

In [37]:
def len_emp(x):
    if x.split()[0]=='<':
        return 0
    else:
        return int(x.strip().strip('<').strip('year').strip('years').strip().strip('+'))
df['Length_Employed'] = df['Length_Employed'].apply(len_emp)

In [40]:
df['Months_Since_Deliquency'] = df['Months_Since_Deliquency'].fillna(360)

In [45]:
#Since Months_Since_Deliquency does not seem to show much of an impact on interest_rates we will drop it.
import time
start = time.time()
#Length_Employed can be filled based on annual_income.
#Annual_Income will be filled using KNNImputer on the numerical columns only.
#Home_Owner will be taken care of later.
knn_impute_data=df.select_dtypes(np.number)
#Standardizing the dataset for the KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
ss = StandardScaler()
scaled = pd.DataFrame(ss.fit_transform(knn_impute_data.drop(['Interest_Rate','Months_Since_Deliquency'],axis = 1)),columns = knn_impute_data.drop(['Interest_Rate','Months_Since_Deliquency'],axis = 1).columns)
filled_scaled = KNNImputer(n_neighbors = 5).fit_transform(X = scaled,y = knn_impute_data['Interest_Rate'])
end = time.time()
print('Execution Time:',end-start)

Execution Time: 269.1493818759918


In [63]:
filled_unscaled = pd.DataFrame(ss.inverse_transform(filled_scaled),columns = knn_impute_data.drop(['Interest_Rate','Months_Since_Deliquency'],axis = 1).columns)
fill_data = pd.concat((round(filled_unscaled.reset_index(drop = True)),df.select_dtypes('object').reset_index(drop = True),df['Months_Since_Deliquency'].reset_index(drop = True),df['Interest_Rate'].reset_index(drop = True)),axis = 1)

In [64]:
fill_data.head()

Unnamed: 0,Loan_Amount_Requested,Length_Employed,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Number_Open_Accounts,Total_Accounts,Home_Owner,Income_Verified,Purpose_Of_Loan,Gender,Months_Since_Deliquency,Interest_Rate
0,7000.0,0.0,68000.0,18.0,0.0,9.0,14.0,Rent,not verified,car,Female,360.0,1
1,30000.0,4.0,110400.0,15.0,0.0,12.0,24.0,Mortgage,VERIFIED - income,debt_consolidation,Female,17.0,3
2,24725.0,7.0,75566.0,16.0,0.0,12.0,16.0,Mortgage,VERIFIED - income source,debt_consolidation,Male,360.0,3
3,16000.0,0.0,56160.0,14.0,3.0,16.0,22.0,Rent,VERIFIED - income source,debt_consolidation,Male,360.0,3
4,17000.0,8.0,96000.0,22.0,1.0,19.0,30.0,Own,VERIFIED - income source,debt_consolidation,Female,360.0,1


In [65]:
df_d = pd.get_dummies(fill_data,drop_first = True)
df_d.head()

Unnamed: 0,Loan_Amount_Requested,Length_Employed,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Number_Open_Accounts,Total_Accounts,Months_Since_Deliquency,Interest_Rate,Home_Owner_None,...,Purpose_Of_Loan_house,Purpose_Of_Loan_major_purchase,Purpose_Of_Loan_medical,Purpose_Of_Loan_moving,Purpose_Of_Loan_other,Purpose_Of_Loan_renewable_energy,Purpose_Of_Loan_small_business,Purpose_Of_Loan_vacation,Purpose_Of_Loan_wedding,Gender_Male
0,7000.0,0.0,68000.0,18.0,0.0,9.0,14.0,360.0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,30000.0,4.0,110400.0,15.0,0.0,12.0,24.0,17.0,3,0,...,0,0,0,0,0,0,0,0,0,0
2,24725.0,7.0,75566.0,16.0,0.0,12.0,16.0,360.0,3,0,...,0,0,0,0,0,0,0,0,0,1
3,16000.0,0.0,56160.0,14.0,3.0,16.0,22.0,360.0,3,0,...,0,0,0,0,0,0,0,0,0,1
4,17000.0,8.0,96000.0,22.0,1.0,19.0,30.0,360.0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
X = df_d.drop('Interest_Rate',axis = 1)
y = df_d['Interest_Rate']

In [68]:
#Base Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 2)
lr = LogisticRegression()
lr.fit(X_train,y_train)
pred = lr.predict(X_test)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           1       0.21      0.00      0.00      6735
           2       0.45      0.68      0.54     14149
           3       0.48      0.45      0.47     11978

    accuracy                           0.46     32862
   macro avg       0.38      0.38      0.34     32862
weighted avg       0.41      0.46      0.40     32862



###### Importing the final data for model building

In [2]:
data = pd.read_csv('Data/filled_train_28-03.csv')
data.head()

Unnamed: 0,Loan_Amount_Requested,Length_Employed,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Number_Open_Accounts,Total_Accounts,Home_Owner,Income_Verified,Purpose_Of_Loan,Gender,Months_Since_Deliquency,Interest_Rate
0,7000.0,0.0,68000.0,18.0,0.0,9.0,14.0,Rent,not verified,car,Female,,1
1,30000.0,4.0,98367.0,15.0,0.0,12.0,24.0,Mortgage,VERIFIED - income,debt_consolidation,Female,17.0,3
2,24725.0,7.0,75566.0,16.0,0.0,12.0,16.0,Mortgage,VERIFIED - income,debt_consolidation,Male,,3
3,17000.0,8.0,96000.0,22.0,1.0,19.0,30.0,Own,VERIFIED - income,debt_consolidation,Female,,1
4,4500.0,2.0,30000.0,11.0,1.0,12.0,15.0,Rent,VERIFIED - income,credit_card,Male,,3


In [3]:
data['Months_Since_Deliquency'] = data['Months_Since_Deliquency'].fillna(180) 

In [4]:
#Applying one hot encoding on rest of the categorical features
model_data = pd.get_dummies(data,drop_first = True)
model_data.head()

Unnamed: 0,Loan_Amount_Requested,Length_Employed,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Number_Open_Accounts,Total_Accounts,Months_Since_Deliquency,Interest_Rate,Home_Owner_None,...,Purpose_Of_Loan_house,Purpose_Of_Loan_major_purchase,Purpose_Of_Loan_medical,Purpose_Of_Loan_moving,Purpose_Of_Loan_other,Purpose_Of_Loan_renewable_energy,Purpose_Of_Loan_small_business,Purpose_Of_Loan_vacation,Purpose_Of_Loan_wedding,Gender_Male
0,7000.0,0.0,68000.0,18.0,0.0,9.0,14.0,180.0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,30000.0,4.0,98367.0,15.0,0.0,12.0,24.0,17.0,3,0,...,0,0,0,0,0,0,0,0,0,0
2,24725.0,7.0,75566.0,16.0,0.0,12.0,16.0,180.0,3,0,...,0,0,0,0,0,0,0,0,0,1
3,17000.0,8.0,96000.0,22.0,1.0,19.0,30.0,180.0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4500.0,2.0,30000.0,11.0,1.0,12.0,15.0,180.0,3,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
#Importing all model libraries
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier,RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score


In [6]:
#Initialize all models
clf1 = LogisticRegression(random_state = 2)
clf2 = MultinomialNB()
clf3 = DecisionTreeClassifier(random_state =2)
clf4 = ExtraTreesClassifier(random_state = 2)
clf5 = RandomForestClassifier(random_state = 2)
clf6 = AdaBoostClassifier(random_state = 2)
clf7 = GradientBoostingClassifier(random_state = 2)
clf8 = XGBClassifier(random_state = 2)
clf9 = CatBoostClassifier(random_state = 2)

###### Creating X,X_scaled,y for models

In [7]:
X = model_data.drop('Interest_Rate',axis = 1)
y = model_data['Interest_Rate']

In [8]:
from sklearn.preprocessing import StandardScaler
data_num = data.select_dtypes(np.number).drop('Interest_Rate',axis = 1)
scaled_num = pd.DataFrame(StandardScaler().fit_transform(data_num),columns = data_num.columns)
data_cat = pd.get_dummies(data.select_dtypes('object'),drop_first = True)
comb_data = pd.concat((scaled_num,data_cat),axis = 1)
X_scaled = pd.DataFrame(StandardScaler().fit_transform(comb_data),columns = comb_data.columns)

In [9]:
def cval(estimator,X,y):
    scores = cross_val_score(estimator,X,y,scoring = 'f1_weighted',n_jobs = -1,cv=5)
    avg_score = round(np.mean(scores),4)
    variance = round(np.std(scores)/avg_score,4)
    return estimator,avg_score,variance

# List to save all metrics

In [10]:
var = []
avg = []
est = []

var_sc = []
avg_sc = []
est_sc = []

# Logistic Regression Classifier

In [11]:
estimator,avg_score,variance = cval(clf1,X,y)
var.append(variance)
avg.append(avg_score)
est.append('Logistic Regression')

In [12]:
estimator,avg_score,variance = cval(clf1,X_scaled,y)
var_sc.append(variance)
avg_sc.append(avg_score)
est_sc.append('Logistic Regression')

# Gaussian Naive Bayes Classifier

In [13]:
estimator,avg_score,variance = cval(clf2,X,y)
var.append(variance)
avg.append(avg_score)
est.append('Gaussian Naive Base')

In [14]:
estimator,avg_score,variance = cval(clf2,X_scaled,y)
var_sc.append(variance)
avg_sc.append(avg_score)
est_sc.append('Gaussian Naive Base')

# DecisionTreeClassifier

In [15]:
estimator,avg_score,variance = cval(clf3,X,y)
var.append(variance)
avg.append(avg_score)
est.append('DecisionTreeClassifier')

In [16]:
estimator,avg_score,variance = cval(clf3,X_scaled,y)
var_sc.append(variance)
avg_sc.append(avg_score)
est_sc.append('DecisionTreeClassifier')

# ExtraTreesClassifier

In [17]:
estimator,avg_score,variance = cval(clf4,X,y)
var.append(variance)
avg.append(avg_score)
est.append('ExtraTreesClassifier')

In [18]:
estimator,avg_score,variance = cval(clf4,X_scaled,y)
var_sc.append(variance)
avg_sc.append(avg_score)
est_sc.append('ExtraTreesClassifier')

# RandomForestClassifier

In [19]:
estimator,avg_score,variance = cval(clf5,X,y)
var.append(variance)
avg.append(avg_score)
est.append('RandomForestClassifier')

In [20]:
estimator,avg_score,variance = cval(clf5,X_scaled,y)
var_sc.append(variance)
avg_sc.append(avg_score)
est_sc.append('RandomForestClassifier')

# AdaBoostClassifier

In [21]:
estimator,avg_score,variance = cval(clf6,X,y)
var.append(variance)
avg.append(avg_score)
est.append('AdaBoostClassifier')

In [22]:
estimator,avg_score,variance = cval(clf6,X_scaled,y)
var_sc.append(variance)
avg_sc.append(avg_score)
est_sc.append('AdaBoostClassifier')

# GradientBoostingClassifier

In [23]:
estimator,avg_score,variance = cval(clf7,X,y)
var.append(variance)
avg.append(avg_score)
est.append('GradientBoostingClassifier')

In [24]:
estimator,avg_score,variance = cval(clf7,X_scaled,y)
var_sc.append(variance)
avg_sc.append(avg_score)
est_sc.append('GradientBoostingClassifier')

# XGBClassifier

In [25]:
estimator,avg_score,variance = cval(clf8,X,y)
var.append(variance)
avg.append(avg_score)
est.append('XGBClassifier')

In [26]:
estimator,avg_score,variance = cval(clf8,X_scaled,y)
var_sc.append(variance)
avg_sc.append(avg_score)
est_sc.append('XGBClassifier')

# CatBoostClassifier

In [27]:
estimator,avg_score,variance = cval(clf9,X,y)
var.append(variance)
avg.append(avg_score)
est.append('CatBoostClassifier')

In [28]:
estimator,avg_score,variance = cval(clf9,X_scaled,y)
var_sc.append(variance)
avg_sc.append(avg_score)
est_sc.append('CatBoostClassifier')

###### Comparing metrics of all the above models.

In [29]:
metrics_unscaled = pd.DataFrame({'Estimator':est,'F1 Average':avg,'Variance Error': var})
metrics_scaled = pd.DataFrame({'F1 Average_sc':avg_sc,'Variance Error_sc': var_sc})

In [30]:
final_metrics = pd.concat((metrics_unscaled,metrics_scaled),axis  =1).sort_values('F1 Average',ascending = False)
final_metrics

Unnamed: 0,Estimator,F1 Average,Variance Error,F1 Average_sc,Variance Error_sc
7,XGBClassifier,0.5254,0.0032,0.5254,0.0032
8,CatBoostClassifier,0.5254,0.0044,0.5254,0.0044
5,AdaBoostClassifier,0.5096,0.0045,0.5096,0.0045
6,GradientBoostingClassifier,0.5095,0.0041,0.5095,0.0041
4,RandomForestClassifier,0.4985,0.0032,0.4981,0.0033
3,ExtraTreesClassifier,0.4889,0.0024,0.489,0.0022
2,DecisionTreeClassifier,0.4255,0.0053,0.4252,0.0048
0,Logistic Regression,0.4235,0.0229,0.5053,0.002
1,Gaussian Naive Base,0.3293,0.0032,,


### Base Model

* We have a range of f1 scores from various models in the range of 42-53%.
* Boosting Algorithms seem to give us f1 scores around the 50% mark.
* LogisticRegression and Decision Tree give us an f1 score of 42% and RandomForest gave us an f1 score of 49.85%.
* On Standardizing the data, LogisticRegression gives an f1 score of 50%. Standardization does not effect tree based models since they are probabilistic in nature.

###### Models to focus on for hyperparameter tuning:
* CatBoostClassifier
* XGBClassifier

# XGBClassifier (Working)

XGBoost, is extreme because it is a big machine learning model with many parts. First we look at the explanation from a regression perspective and then we will look at the explanation from a classification perspective. (Explanation is from StatQuest Video in my own words.)

### Regression
###### Image of example
<img src = "Images/Example for regression.png" width = "500px" height = "auto">

* **Step 1:** Make a base prediction which would be 0.5 in this case. This remains the same for regression and classification.

**Instead of using a regular tree like in gradient boosting, this algorithm uses an XGBoost Tree.**

* Each tree starts off as a single leaf and will hold all the residuals from the base prediction.

* Calculate a quality or Similarity Score for the residuals.
<img src = "Images/Similarity Score.png" width = "500px" height = "auto">

* The **lambda** in the above formula is a regularization parameter.



### Classification

In [10]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
xgbc = XGBClassifier(n_jobs = -1,random_state = 2)

In [11]:
# #Initializing parameters
# params = {'n_estimators':list(range(0,1000,100)),
#           'max_depth': list(range(1,25,5)),
#           'learning_rate': list(np.linspace(0,1,50))[1:], #(Eta)
#           'booster':['gbtree','gblinear','dart'],
#           'gamma': list(np.linspace(0,1,50))[1:] #For tree pruning              
#          }
# rscv  = RandomizedSearchCV(xgbc,params,5,'f1_weighted')
# rscv.fit(X,y)

In [13]:
# rscv.best_params_

###### We are using RandomizedSearchCV to get a baseline for grid search cv

###### Creating grid serach parameters around above parameters.

In [12]:
params_grid = {'n_estimators': [800,900,1000],
          'max_depth': [1,5,10],
          'learning_rate': [0.5,0.9], #(Eta)
          'booster':['gblinear'],
          'gamma': [0.1,0.3,0.5] #For tree pruning              
          }

In [13]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(xgbc,params_grid,scoring = 'f1_weighted',n_jobs = -1,cv = 5,verbose = 1)
grid_search.fit(X,y)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Parameters: { gamma, max_depth } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=-1,
                                     num_parallel_tree=None, random_state=2,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, vali

In [14]:
grid_search.best_params_

{'booster': 'gblinear',
 'gamma': 0.1,
 'learning_rate': 0.9,
 'max_depth': 1,
 'n_estimators': 1000}

In [16]:
xgbf = XGBClassifier(n_estimators = 1000,max_depth = 5,learning_rate = 0.9,gamma = 0.1,booster = 'gbtree',n_jobs = -1,random_state = 2)

In [23]:
estimator,avg_score,variance = cval(model,X,y)
print('Avg F1 Score:',avg_score)
print('Variance Error:',variance)

Avg F1 Score: nan
Variance Error: nan


https://effectiveml.com/using-grid-search-to-optimise-catboost-parameters.html