<a href="https://colab.research.google.com/github/taegeonyu/hds5210-2023/blob/main/Week%201/Model_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Necessary Libraries

In [1]:
# To help with reading and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# To be used for missing value imputation
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    ConfusionMatrixDisplay,
)

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Show every column when a dataframe is displayed (no column truncation)
pd.set_option("display.max_columns", None)

# Render floats with three fixed decimals instead of scientific notation
pd.set_option("display.float_format", "{:.3f}".format)

# Silence all warnings for the remainder of the notebook
import warnings

warnings.filterwarnings("ignore")

## Loading the Dataset

In [2]:
# Mounting Google Drive at /content/drive so the dataset stored there can be read.
# Requires the google.colab package, i.e. this cell is meant to run in a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Loading the loan dataset from the mounted Google Drive folder
loan_csv_path = '/content/drive/MyDrive/Project/Model Tuning/loan_data.csv'
data = pd.read_csv(loan_csv_path)

In [4]:
# Working on an independent (deep) copy so the originally loaded frame stays untouched
df = data.copy(deep=True)

## Data Overview

In [5]:
# Reporting the dimensions of the dataset
n_rows, n_cols = df.shape
print(f'There are {n_rows} number of rows and {n_cols} number of columns.')

There are 45000 number of rows and 14 number of columns.


In [6]:
# Previewing the first 5 rows of the dataset
df.head(5)

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [7]:
# Summarising column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

In [8]:
# Counting the missing values in each column
df.isna().sum()

Unnamed: 0,0
person_age,0
person_gender,0
person_education,0
person_income,0
person_emp_exp,0
person_home_ownership,0
loan_amnt,0
loan_intent,0
loan_int_rate,0
loan_percent_income,0


* There are no missing values.
* However, the categorical variables should still be checked for ambiguous values that could affect the analysis.

In [9]:
# Names of all object-dtype (categorical) columns in the data
cat_cols = list(df.select_dtypes(include='object').columns)

# Printing the frequency of every level of each categorical column,
# followed by a dashed separator line
for col in cat_cols:
    level_counts = df[col].value_counts()
    print(level_counts)
    print('-' * 30)

person_gender
male      24841
female    20159
Name: count, dtype: int64
------------------------------
person_education
Bachelor       13399
Associate      12028
High School    11972
Master          6980
Doctorate        621
Name: count, dtype: int64
------------------------------
person_home_ownership
RENT        23443
MORTGAGE    18489
OWN          2951
OTHER         117
Name: count, dtype: int64
------------------------------
loan_intent
EDUCATION            9153
MEDICAL              8548
VENTURE              7819
PERSONAL             7552
DEBTCONSOLIDATION    7145
HOMEIMPROVEMENT      4783
Name: count, dtype: int64
------------------------------
previous_loan_defaults_on_file
Yes    22858
No     22142
Name: count, dtype: int64
------------------------------


**Observations:**
* The 'OTHER' category in the `person_home_ownership` column is ambiguous: it may introduce bias and conveys little useful information, so the rows with this value will be dropped.

In [10]:
# Keeping only the rows whose home-ownership category is not 'OTHER'
keep_rows = df['person_home_ownership'].ne('OTHER')
df = df[keep_rows]

# Confirming that the category no longer appears among the remaining values
print(df['person_home_ownership'].unique())

['RENT' 'OWN' 'MORTGAGE']


In [11]:
# Counting rows that are exact repeats of an earlier row
df.duplicated(keep='first').sum()

0

* No duplicated rows.

In [12]:
# Separate deep copy used for modeling, so the cleaned frame `df` is left unchanged
df1 = df.copy(deep=True)

In [13]:
# Separating the target variable from the predictor variables for modeling
y = df1['loan_status']
X = df1.drop(columns='loan_status')

# One-hot encoding the categorical predictors (one dummy column per level)
X = pd.get_dummies(X)

In [14]:
# Two-stage stratified split into training, validation and test sets.
# Stage 1: hold out 20% of the data as the test set; the rest is temporary.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Stage 2: split the temporary set 75/25 into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=1,
    stratify=y_temp,
)
print(*(part.shape for part in (X_train, X_val, X_test)))

(26929, 26) (8977, 26) (8977, 26)


In [15]:
# Cast only the non-numeric columns of X_train (e.g. the dummy columns created by
# pd.get_dummies) to int64.
# The earlier test `dtype != np.number` compared a concrete dtype with NumPy's
# abstract `number` class, which is not a reliable "is this column numeric?" check:
# where that comparison never evaluates as equal, every column - including
# fractional ones such as loan_int_rate and loan_percent_income - was cast to
# int64 and silently truncated. select_dtypes(exclude="number") picks exactly the
# columns that are not numeric, so float and integer features are left untouched.
non_numeric_cols = X_train.select_dtypes(exclude="number").columns
# astype returns a new frame, so no column-by-column assignment into a split result
X_train = X_train.astype({col: np.int64 for col in non_numeric_cols})

In [16]:
# Cast only the non-numeric columns of X_temp (e.g. the dummy columns created by
# pd.get_dummies) to int64.
# The earlier test `dtype != np.number` compared a concrete dtype with NumPy's
# abstract `number` class, which is not a reliable numeric check and could cast
# fractional float columns to int64 (truncating them). select_dtypes(exclude="number")
# selects exactly the non-numeric columns instead.
X_temp = X_temp.astype(
    {col: np.int64 for col in X_temp.select_dtypes(exclude="number").columns}
)

# X_val was split off from X_temp before this cell runs, so converting X_temp does
# not touch it; apply the same cast so the validation set matches the other splits.
X_val = X_val.astype(
    {col: np.int64 for col in X_val.select_dtypes(exclude="number").columns}
)

In [17]:
# Cast only the non-numeric columns of X_test (e.g. the dummy columns created by
# pd.get_dummies) to int64.
# The earlier test `dtype != np.number` compared a concrete dtype with NumPy's
# abstract `number` class, which is not a reliable numeric check and could cast
# fractional float columns to int64 (truncating them). select_dtypes(exclude="number")
# selects exactly the non-numeric columns, leaving float and integer features as-is.
non_numeric_test_cols = X_test.select_dtypes(exclude="number").columns
X_test = X_test.astype({col: np.int64 for col in non_numeric_test_cols})

In [18]:
# Recall-based scorer used to compare models during cross-validation
scorer = metrics.make_scorer(recall_score)

In [19]:
def model_performance_classification_sklearn(model, predictors, target):
    """Compute accuracy, recall, precision and F1 for a fitted classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    predictors : independent variables passed to ``model.predict``
    target : true labels compared against the predictions

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index 0) with the columns
        "Accuracy", "Recall", "Precision" and "F1".
    """
    predictions = model.predict(predictors)

    # Column label -> metric function; dict order fixes the column order
    metric_functions = {
        "Accuracy": accuracy_score,
        "Recall": recall_score,
        "Precision": precision_score,
        "F1": f1_score,
    }
    scores = {
        label: metric(target, predictions)
        for label, metric in metric_functions.items()
    }

    return pd.DataFrame(scores, index=[0])

In [20]:
def confusion_matrix_sklearn(model, predictors, target):
    """Plot the confusion matrix of a fitted classifier as an annotated heatmap.

    Every cell shows the raw count and, on a second line, that count as a
    percentage of all observations.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    predictors : independent variables passed to ``model.predict``
    target : true labels compared against the predictions

    Returns
    -------
    None
        The heatmap is drawn on a new 6x4 matplotlib figure.
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)

    # Total number of observations, computed once instead of once per cell
    total = cm.sum()
    cell_texts = [f"{count:0.0f}\n{count / total:.2%}" for count in cm.flatten()]

    # Shape the annotations like the matrix itself rather than a fixed (2, 2),
    # so the plot also works when the matrix is not 2x2 (e.g. a single class
    # present, or more than two classes).
    labels = np.asarray(cell_texts).reshape(cm.shape)

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")

In [21]:
# Re-create the recall scorer (unconditionally) so the model-comparison cells below
# have `scorer` available; this duplicates the scorer definition made earlier.
scorer = metrics.make_scorer(recall_score)

In [22]:
# Candidate classifiers as (display name, estimator) pairs, all seeded with random_state=1
models = [
    ("Bagging", BaggingClassifier(random_state=1)),
    ("Random forest", RandomForestClassifier(random_state=1)),
    ("GBM", GradientBoostingClassifier(random_state=1)),
    ("Adaboost", AdaBoostClassifier(random_state=1)),
    ("dtree", DecisionTreeClassifier(random_state=1)),
    (
        "Xgboost",
        XGBClassifier(random_state=1, eval_metric="logloss", use_label_encoder = False),
    ),
]

# Collectors filled by the cross-validation cell
results1 = []  # per-model arrays of CV scores
names = []  # model names, in the same order as results1

In [23]:
# Cross-validation performance
# Reset the collectors in this cell so that re-running it does not append a
# second copy of every model's scores and name.
results1 = []  # per-model arrays of CV scores
names = []  # model names, in the same order as results1

# One stratified 5-fold splitter shared by all models; with a fixed integer
# random_state it yields the same folds on every use, so each model is scored
# on identical splits.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# NOTE(review): the recorded run of this cell stopped at the Xgboost entry with
# "AttributeError: 'super' object has no attribute '__sklearn_tags__'". That looks
# like an xgboost / scikit-learn version mismatch - confirm and pin compatible
# versions so the notebook runs end to end.
print("\nCross-Validation performance on training dataset:\n")
for name, model in models:
    cv_result = cross_val_score(estimator=model, X=X_train, y=y_train, scoring=scorer, cv=kfold)
    results1.append(cv_result)
    names.append(name)
    print(f"{name}: {cv_result.mean():.4f}")


Cross-Validation performance on training dataset:

Bagging: 0.7546
Random forest: 0.7688
GBM: 0.7723
Adaboost: 0.7462
dtree: 0.7750


AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [24]:
# Recall of each model on the validation set after fitting on the full training set
print("\nValidation Performance:\n")
for name, model in models:
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)
    scores = recall_score(y_val, val_predictions)
    print(f"{name}: {scores:.4f}")


Validation Performance:

Bagging: 0.7575
Random forest: 0.7570
GBM: 0.7540
Adaboost: 0.7666
dtree: 0.7841
Xgboost: 0.7947
