This notebook is used to demonstrate building classification models using tree-based classifiers. The full version of this code is available in `09_classification_model.ipynb`. The dataset used for this exercise is borrowed from [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients).

### Import packages

In [None]:
import pandas as pd

### Set-up

In [None]:
# name of the column that holds the target variable
target = 'default payment next month'

# URL of the input CSV file
infile = 'https://raw.githubusercontent.com/vishal-git/dapt-631/main/data/credit_default_model_data.csv'

### Read data

In [None]:
# TODO(exercise): the right-hand side is left blank in this skeleton notebook.
# Presumably the CSV located at `infile` is read into a DataFrame here — confirm
# against the full version of the notebook.
df = 

In [None]:
# TODO(exercise): every assignment in this cell is an unfinished stub.
# Presumably `y` is the `target` column of `df` and `X` holds the input
# features — confirm against the full version of the notebook.
y = 
X = 

# Feature partitions. X_train and X_test are consumed by the scoring cells
# further down; X_valid is not referenced anywhere else in the visible cells.
X_train = 
X_test = 
X_valid =

# Label partitions matching the feature partitions above. y_train and y_test
# are passed to roc_curve / roc_auc_score in later cells.
y_train = 
y_test = 
y_valid = 

### Initial Models

In [None]:
# import sklearn classifiers
# NOTE: the '#--' marker below is an empty slot where the import statements
# still have to be written.
#--

# define classifiers
# TODO(exercise): the four right-hand sides are left blank in this skeleton.
# Later cells call .predict_proba() on all four objects and .fit() on `logit`,
# so each one must be an estimator that provides those methods.
logit = 
tree = 
forest = 
gboost = 

Fit the models.

In [None]:
# TODO(exercise): unfinished stub. Presumably a collection of the four
# classifiers defined in the previous cell — confirm.
models = 

# '#--' is an empty slot; presumably the code that fits each model on the
# training partition goes here — confirm.
#--

Save the predicted probabilities (scores) for the train and test partitions for each model.

In [None]:
# TODO(exercise): unfinished stubs for the logistic-regression scores.
# Presumably these mirror the tree/forest/gboost cell that follows, i.e. column 1
# of logit.predict_proba(...) on X_train and X_test — confirm.
logit_scores_train = 
logit_scores_test = 

In [None]:
# For each remaining model, score the train and test partitions (in that order)
# and keep column 1 of the predict_proba output as the model score.
tree_scores_train, tree_scores_test = (
    tree.predict_proba(part)[:, 1] for part in (X_train, X_test)
)

forest_scores_train, forest_scores_test = (
    forest.predict_proba(part)[:, 1] for part in (X_train, X_test)
)

gboost_scores_train, gboost_scores_test = (
    gboost.predict_proba(part)[:, 1] for part in (X_train, X_test)
)

### ROC Curves

In [None]:
# NOTE(review): '#--' is an empty slot. `roc_curve` and `roc_auc_score` are used
# by later cells but are never imported in the visible cells, so presumably
# their import belongs here — confirm.
#--

# calculate the false positive and true positive rates
# TODO(exercise): unfinished stubs. Each left-hand side unpacks three values and
# discards the third into `_`, the same shape as the roc_curve(...) calls in the
# next cell — presumably the same call is intended here with the logit scores.
logit_fpr_train, logit_tpr_train, _ = 
logit_fpr_test, logit_tpr_test, _ = 

In [None]:
# ROC coordinates on the training partition (third return value is discarded)
tree_fpr_train, tree_tpr_train, _ = roc_curve(y_true=y_train, y_score=tree_scores_train)
forest_fpr_train, forest_tpr_train, _ = roc_curve(y_true=y_train, y_score=forest_scores_train)
gboost_fpr_train, gboost_tpr_train, _ = roc_curve(y_true=y_train, y_score=gboost_scores_train)

# ROC coordinates on the test partition (third return value is discarded)
tree_fpr_test, tree_tpr_test, _ = roc_curve(y_true=y_test, y_score=tree_scores_test)
forest_fpr_test, forest_tpr_test, _ = roc_curve(y_true=y_test, y_score=forest_scores_test)
gboost_fpr_test, gboost_tpr_test, _ = roc_curve(y_true=y_test, y_score=gboost_scores_test)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#--

In [None]:
# Training-partition ROC curves for all four models on a single figure.
# Each row: (legend prefix, FPR values, TPR values, model scores, line colour, line alpha).
train_curves = [
    ('LR', logit_fpr_train, logit_tpr_train, logit_scores_train, 'royalblue', 0.3),
    ('DT', tree_fpr_train, tree_tpr_train, tree_scores_train, 'green', 0.4),
    ('RF', forest_fpr_train, forest_tpr_train, forest_scores_train, 'tomato', 0.4),
    ('GB', gboost_fpr_train, gboost_tpr_train, gboost_scores_train, 'purple', 0.2),
]

plt.figure(figsize=(12, 9))

for name, fpr, tpr, scores, colour, alpha in train_curves:
    # the AUC shown in the legend is computed from the same training scores
    auc = roc_auc_score(y_train, scores)
    plt.plot(fpr, tpr, color=colour, lw=2, alpha=alpha,
             label=f'{name} Train (AUC = {auc:0.3f})')

# diagonal from (0, 0) to (1, 1) drawn as a visual reference
plt.plot([0, 1], [0, 1], color='gray', lw=1)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('Default Risk Model: Training Accuracy', fontsize=16)
# trailing ';' keeps the notebook from echoing the Legend object
plt.legend(loc="lower right", fontsize=14);

In [None]:
# Test-partition ROC curves for the four models, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 9))

ax.plot(logit_fpr_test, logit_tpr_test, color='royalblue', lw=2,
        label=f'LR Test  (AUC = {roc_auc_score(y_test, logit_scores_test):0.3f})')

ax.plot(tree_fpr_test, tree_tpr_test, color='green', lw=2,
        label=f'DT Test  (AUC = {roc_auc_score(y_test, tree_scores_test):0.3f})')

ax.plot(forest_fpr_test, forest_tpr_test, color='darkorange', lw=2,
        label=f'RF Test  (AUC = {roc_auc_score(y_test, forest_scores_test):0.3f})')

ax.plot(gboost_fpr_test, gboost_tpr_test, color='purple', lw=2,
        label=f'GB Test  (AUC = {roc_auc_score(y_test, gboost_scores_test):0.3f})')

# diagonal from (0, 0) to (1, 1) drawn as a visual reference
ax.plot([0, 1], [0, 1], color='gray', lw=1, alpha=.5)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate', fontsize=14)
ax.set_ylabel('True Positive Rate', fontsize=14)
ax.set_title('Default Risk Model', fontsize=16)
# trailing ';' keeps the notebook from echoing the Legend object
ax.legend(loc="lower right", fontsize=14);

We haven't standardized the input features. Let's see what happens when we do that.

### Logistic Regression *with Standardization*

In [None]:
# '#--' is an empty slot; presumably the import for the scaler goes here — confirm.
#--

# define a scaler
# TODO(exercise): unfinished stub. The next cell calls X_scaler.transform(...) on
# X_train / X_test cast to float, so the object must provide .transform().
X_scaler = 

# fit the model after standardizing all variables
# NOTE(review): as written, fit() receives no data; presumably the standardized
# training features and y_train are meant to be passed here — confirm.
logit.fit()

In [None]:
# Score the train and test partitions (in that order) with the logistic model:
# features are cast to float, passed through X_scaler.transform, and column 1
# of the predict_proba output is kept as the score.
logit_scores_train, logit_scores_test = (
    logit.predict_proba(X_scaler.transform(part.astype(float)))[:, 1]
    for part in (X_train, X_test)
)

# False/True Positive Rates for each partition (third return value is discarded)
logit_fpr_train, logit_tpr_train, _ = roc_curve(y_true=y_train, y_score=logit_scores_train)
logit_fpr_test, logit_tpr_test, _ = roc_curve(y_true=y_test, y_score=logit_scores_test)

In [None]:
# One row per model: (FPR values, TPR values, line colour, legend label).
# The logit_* names are the ones reassigned by the preceding cell.
test_curves = [
    (logit_fpr_test, logit_tpr_test, 'royalblue',
     f'LR Test  (AUC = {roc_auc_score(y_test, logit_scores_test):0.3f})'),
    (tree_fpr_test, tree_tpr_test, 'green',
     f'DT Test  (AUC = {roc_auc_score(y_test, tree_scores_test):0.3f})'),
    (forest_fpr_test, forest_tpr_test, 'darkorange',
     f'RF Test  (AUC = {roc_auc_score(y_test, forest_scores_test):0.3f})'),
    (gboost_fpr_test, gboost_tpr_test, 'purple',
     f'GB Test  (AUC = {roc_auc_score(y_test, gboost_scores_test):0.3f})'),
]

plt.figure(figsize=(12, 9))

# draw the model curves in the order listed above
for fpr, tpr, colour, label in test_curves:
    plt.plot(fpr, tpr, color=colour, lw=2, label=label)

# diagonal from (0, 0) to (1, 1) drawn as a visual reference
plt.plot([0, 1], [0, 1], color='gray', lw=1, alpha=.5)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('Default Risk Model', fontsize=16)
# trailing ';' keeps the notebook from echoing the Legend object
plt.legend(loc="lower right", fontsize=14);

In the next session, we will work on optimizing the models' hyper-parameters to try to improve their performance.