# <div align = 'center'>`Model Building`</div>

###### Models to be used in this problem

* LogisticRegression
* MultinomialNB
* DecisionTreeClassifier
* ExtraTreesClassifier
* RandomForestClassifier
* AdaBoostClassifier
* GradientBoostingClassifier
* XGBClassifier
* CatBoostClassifier

In [1]:
import warnings
warnings.simplefilter('ignore')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

###### Importing the final data for model building

In [2]:
# Read the final training data from disk and preview the first rows.
train_csv = 'Data/filled_train_28-03.csv'
data = pd.read_csv(train_csv)
data.head()

Unnamed: 0,Loan_Amount_Requested,Length_Employed,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Number_Open_Accounts,Total_Accounts,Home_Owner,Income_Verified,Purpose_Of_Loan,Gender,Months_Since_Deliquency,Interest_Rate
0,7000.0,0.0,68000.0,18.0,0.0,9.0,14.0,Rent,not verified,car,Female,,1
1,30000.0,4.0,98367.0,15.0,0.0,12.0,24.0,Mortgage,VERIFIED - income,debt_consolidation,Female,17.0,3
2,24725.0,7.0,75566.0,16.0,0.0,12.0,16.0,Mortgage,VERIFIED - income,debt_consolidation,Male,,3
3,17000.0,8.0,96000.0,22.0,1.0,19.0,30.0,Own,VERIFIED - income,debt_consolidation,Female,,1
4,4500.0,2.0,30000.0,11.0,1.0,12.0,15.0,Rent,VERIFIED - income,credit_card,Male,,3


In [3]:
# Replace missing 'Months_Since_Deliquency' values with the sentinel 180.
# NOTE(review): presumably a missing value means "no delinquency on record",
# hence a large number of months — confirm the choice of 180.
delinq_col = 'Months_Since_Deliquency'
data[delinq_col] = data[delinq_col].fillna(180)

In [4]:
# One-hot encode the remaining categorical features, dropping the first
# level of each encoded column.
model_data = pd.get_dummies(data=data, drop_first=True)
model_data.head()

Unnamed: 0,Loan_Amount_Requested,Length_Employed,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Number_Open_Accounts,Total_Accounts,Months_Since_Deliquency,Interest_Rate,Home_Owner_None,...,Purpose_Of_Loan_house,Purpose_Of_Loan_major_purchase,Purpose_Of_Loan_medical,Purpose_Of_Loan_moving,Purpose_Of_Loan_other,Purpose_Of_Loan_renewable_energy,Purpose_Of_Loan_small_business,Purpose_Of_Loan_vacation,Purpose_Of_Loan_wedding,Gender_Male
0,7000.0,0.0,68000.0,18.0,0.0,9.0,14.0,180.0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,30000.0,4.0,98367.0,15.0,0.0,12.0,24.0,17.0,3,0,...,0,0,0,0,0,0,0,0,0,0
2,24725.0,7.0,75566.0,16.0,0.0,12.0,16.0,180.0,3,0,...,0,0,0,0,0,0,0,0,0,1
3,17000.0,8.0,96000.0,22.0,1.0,19.0,30.0,180.0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4500.0,2.0,30000.0,11.0,1.0,12.0,15.0,180.0,3,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
#Importing all model libraries
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier,RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score


In [6]:
# One instance of every candidate model. Each estimator that is given a
# random_state uses the same seed (2); MultinomialNB is created with defaults.
clf1 = LogisticRegression(random_state=2)          # linear baseline
clf2 = MultinomialNB()                             # naive Bayes
clf3 = DecisionTreeClassifier(random_state=2)      # single tree
clf4 = ExtraTreesClassifier(random_state=2)        # bagged randomised trees
clf5 = RandomForestClassifier(random_state=2)      # bagged trees
clf6 = AdaBoostClassifier(random_state=2)          # boosting
clf7 = GradientBoostingClassifier(random_state=2)  # boosting
clf8 = XGBClassifier(random_state=2)               # boosting (xgboost)
clf9 = CatBoostClassifier(random_state=2)          # boosting (catboost)

###### Creating X,X_scaled,y for models

In [7]:
# Separate the target column from the predictors.
target_col = 'Interest_Rate'
y = model_data[target_col]
X = model_data.drop(columns=target_col)

In [8]:
from sklearn.preprocessing import StandardScaler

# Numeric predictors (target removed), standardised column by column.
data_num = data.select_dtypes(np.number).drop(columns='Interest_Rate')
scaled_num = pd.DataFrame(
    StandardScaler().fit_transform(data_num),
    columns=data_num.columns,
)

# Text columns, one-hot encoded with the first level of each dropped.
data_cat = pd.get_dummies(data.select_dtypes('object'), drop_first=True)

# Scaled numerics and dummy columns side by side.
comb_data = pd.concat([scaled_num, data_cat], axis=1)

# NOTE(review): this second pass standardises the dummy columns as well and
# re-standardises the numerics that were already scaled above — confirm that
# scaling the dummies is intended. Both scalers are also fitted on the full
# data set before cross-validation, so every validation fold contributes to
# the scaling statistics.
X_scaled = pd.DataFrame(
    StandardScaler().fit_transform(comb_data),
    columns=comb_data.columns,
)

In [9]:
def cval(estimator,X,y):
    """Cross-validate ``estimator`` on ``(X, y)`` and summarise the fold scores.

    Runs 5-fold cross-validation scored with weighted F1 on all available
    cores (``n_jobs=-1``).

    Returns
    -------
    tuple
        ``(estimator, avg_score, variance)`` where ``avg_score`` is the mean
        fold score rounded to 4 decimals and ``variance`` is the standard
        deviation of the fold scores divided by that rounded mean (i.e. a
        relative spread, not a statistical variance), also rounded to
        4 decimals.
    """
    fold_scores = cross_val_score(estimator, X, y, cv=5, scoring='f1_weighted', n_jobs=-1)
    mean_f1 = round(np.mean(fold_scores), 4)
    # Spread is expressed relative to the (already rounded) mean score.
    rel_spread = round(np.std(fold_scores) / mean_f1, 4)
    return estimator, mean_f1, rel_spread

# Lists to save all metrics

In [10]:
# Accumulators for the runs on the raw features:
# relative spread, mean F1 and estimator label.
var, avg, est = [], [], []

# Same three accumulators for the runs on the standardised features.
var_sc, avg_sc, est_sc = [], [], []

# Logistic Regression Classifier

In [11]:
# Cross-validate logistic regression on the raw features and log the result.
estimator, avg_score, variance = cval(clf1, X, y)
est.append('Logistic Regression')
avg.append(avg_score)
var.append(variance)

In [12]:
# Cross-validate logistic regression on the standardised features and log the result.
estimator, avg_score, variance = cval(clf1, X_scaled, y)
est_sc.append('Logistic Regression')
avg_sc.append(avg_score)
var_sc.append(variance)

# Multinomial Naive Bayes Classifier

In [13]:
# Cross-validate naive Bayes on the raw features and log the result.
# clf2 is a MultinomialNB instance, so the label names that model
# (it previously read 'Gaussian Naive Base').
estimator, avg_score, variance = cval(clf2, X, y)
var.append(variance)
avg.append(avg_score)
est.append('Multinomial Naive Bayes')

In [14]:
# Cross-validate naive Bayes on the standardised features and log the result.
# clf2 is a MultinomialNB instance, so the label names that model
# (it previously read 'Gaussian Naive Base').
# NOTE(review): MultinomialNB only accepts non-negative feature values, while
# standardised columns contain negatives — presumably why this run yields NaN
# scores in the results table. Confirm, and consider skipping this run.
estimator, avg_score, variance = cval(clf2, X_scaled, y)
var_sc.append(variance)
avg_sc.append(avg_score)
est_sc.append('Multinomial Naive Bayes')

# DecisionTreeClassifier

In [15]:
# Cross-validate the decision tree on the raw features and log the result.
estimator, avg_score, variance = cval(clf3, X, y)
est.append('DecisionTreeClassifier')
avg.append(avg_score)
var.append(variance)

In [16]:
# Cross-validate the decision tree on the standardised features and log the result.
estimator, avg_score, variance = cval(clf3, X_scaled, y)
est_sc.append('DecisionTreeClassifier')
avg_sc.append(avg_score)
var_sc.append(variance)

# ExtraTreesClassifier

In [17]:
# Cross-validate extra trees on the raw features and log the result.
estimator, avg_score, variance = cval(clf4, X, y)
est.append('ExtraTreesClassifier')
avg.append(avg_score)
var.append(variance)

In [18]:
# Cross-validate extra trees on the standardised features and log the result.
estimator, avg_score, variance = cval(clf4, X_scaled, y)
est_sc.append('ExtraTreesClassifier')
avg_sc.append(avg_score)
var_sc.append(variance)

# RandomForestClassifier

In [19]:
# Cross-validate the random forest on the raw features and log the result.
estimator, avg_score, variance = cval(clf5, X, y)
est.append('RandomForestClassifier')
avg.append(avg_score)
var.append(variance)

In [20]:
# Cross-validate the random forest on the standardised features and log the result.
estimator, avg_score, variance = cval(clf5, X_scaled, y)
est_sc.append('RandomForestClassifier')
avg_sc.append(avg_score)
var_sc.append(variance)

# AdaBoostClassifier

In [21]:
# Cross-validate AdaBoost on the raw features and log the result.
estimator, avg_score, variance = cval(clf6, X, y)
est.append('AdaBoostClassifier')
avg.append(avg_score)
var.append(variance)

In [22]:
# Cross-validate AdaBoost on the standardised features and log the result.
estimator, avg_score, variance = cval(clf6, X_scaled, y)
est_sc.append('AdaBoostClassifier')
avg_sc.append(avg_score)
var_sc.append(variance)

# GradientBoostingClassifier

In [23]:
# Cross-validate gradient boosting on the raw features and log the result.
estimator, avg_score, variance = cval(clf7, X, y)
est.append('GradientBoostingClassifier')
avg.append(avg_score)
var.append(variance)

In [24]:
# Cross-validate gradient boosting on the standardised features and log the result.
estimator, avg_score, variance = cval(clf7, X_scaled, y)
est_sc.append('GradientBoostingClassifier')
avg_sc.append(avg_score)
var_sc.append(variance)

# XGBClassifier

In [25]:
# Cross-validate XGBoost on the raw features and log the result.
estimator, avg_score, variance = cval(clf8, X, y)
est.append('XGBClassifier')
avg.append(avg_score)
var.append(variance)

In [26]:
# Cross-validate XGBoost on the standardised features and log the result.
estimator, avg_score, variance = cval(clf8, X_scaled, y)
est_sc.append('XGBClassifier')
avg_sc.append(avg_score)
var_sc.append(variance)

# CatBoostClassifier

In [27]:
# Cross-validate CatBoost on the raw features and log the result.
estimator, avg_score, variance = cval(clf9, X, y)
est.append('CatBoostClassifier')
avg.append(avg_score)
var.append(variance)

In [28]:
# Cross-validate CatBoost on the standardised features and log the result.
estimator, avg_score, variance = cval(clf9, X_scaled, y)
est_sc.append('CatBoostClassifier')
avg_sc.append(avg_score)
var_sc.append(variance)

###### Comparing metrics of all the above models.

In [29]:
# Tabulate the cross-validation results: one frame for the raw features and
# one (column suffix "_sc") for the standardised features.
metrics_unscaled = pd.DataFrame({
    'Estimator': est,
    'F1 Average': avg,
    'Variance Error': var,
})
metrics_scaled = pd.DataFrame({
    'F1 Average_sc': avg_sc,
    'Variance Error_sc': var_sc,
})

In [30]:
# Place raw and scaled results side by side, then rank by the unscaled F1.
combined_metrics = pd.concat([metrics_unscaled, metrics_scaled], axis=1)
final_metrics = combined_metrics.sort_values(by='F1 Average', ascending=False)
final_metrics

Unnamed: 0,Estimator,F1 Average,Variance Error,F1 Average_sc,Variance Error_sc
7,XGBClassifier,0.5254,0.0032,0.5254,0.0032
8,CatBoostClassifier,0.5254,0.0044,0.5254,0.0044
5,AdaBoostClassifier,0.5096,0.0045,0.5096,0.0045
6,GradientBoostingClassifier,0.5095,0.0041,0.5095,0.0041
4,RandomForestClassifier,0.4985,0.0032,0.4981,0.0033
3,ExtraTreesClassifier,0.4889,0.0024,0.489,0.0022
2,DecisionTreeClassifier,0.4255,0.0053,0.4252,0.0048
0,Logistic Regression,0.4235,0.0229,0.5053,0.002
1,Gaussian Naive Base,0.3293,0.0032,,


### Base Model

* The weighted f1 scores of the baseline models range from about 33% (MultinomialNB) to about 53% (XGBClassifier and CatBoostClassifier).
* Boosting Algorithms seem to give us f1 scores around the 50% mark.
* LogisticRegression and Decision Tree give us an f1 score of 42% and RandomForest gave us an f1 score of 49.85%.
* On standardizing the data, LogisticRegression reaches an f1 score of about 50%. Standardization does not affect tree-based models, because their splits depend only on the ordering of the feature values, which rescaling leaves unchanged.

###### Models to focus on for hyperparameter tuning:
* CatBoostClassifier
* XGBClassifier

# XGBClassifier (Working)

XGBoost is called "extreme" because it is a big machine learning model with many parts. First we look at how it works from a regression perspective, and then from a classification perspective. (The explanation follows the StatQuest video, in my own words.)

### Regression
###### Image of example
<img src = "Images/Example for regression.png" width = "500px" height = "auto">

* **Step 1:** Make a base prediction which would be 0.5 in this case. This remains the same for regression and classification.

**Instead of using a regular tree like in gradient boosting, this algorithm uses an XGBoost Tree.**

* Each tree starts off as a single leaf and will hold all the residuals from the base prediction.

* Calculate a quality or Similarity Score for the residuals.
<img src = "Images/Similarity Score.png" width = "500px" height = "auto">

* The **lambda** in the above formula is a regularization parameter.



### Classification

In [10]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Fresh XGBoost classifier to be tuned: all cores, fixed seed.
xgbc = XGBClassifier(random_state=2, n_jobs=-1)

In [15]:
# Search space for the randomized search.
params = {
    # Starts at 25: the previous grid began at 0, i.e. no boosting rounds at all.
    'n_estimators': list(range(25, 1000, 25)),
    'max_depth': list(range(1, 100)),
    'learning_rate': np.linspace(0, 1, 100)[1:].tolist(),  # eta; 0 itself is excluded
    'booster': ['gbtree', 'gblinear', 'dart'],
    'gamma': np.linspace(0, 1, 100)[1:].tolist(),  # for tree pruning
}

# n_iter and scoring are passed by keyword: current scikit-learn releases
# accept only estimator and param_distributions positionally.
# random_state makes the five sampled candidates reproducible.
# NOTE(review): the recorded run of this cell was interrupted; very deep trees
# (max_depth up to 99) with up to 975 rounds and the dart booster are the
# likely cost drivers — consider narrowing the space.
rscv = RandomizedSearchCV(
    xgbc,
    params,
    n_iter=5,
    scoring='f1_weighted',
    random_state=2,
)
rscv.fit(X, y)



KeyboardInterrupt: 

In [None]:
# Show the best hyperparameter combination found by the randomized search.
rscv.best_params_

In [None]:
# Show the search space that was handed to the randomized search.
rscv.param_distributions

###### We are using RandomizedSearchCV to get a baseline for grid search cv