In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

####### XGBoost
from xgboost import XGBClassifier

####### Neural network 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

np.random.seed(1234)

# Table of contents
* [Data Dictionary](#datadic)
* [Define preprocessing steps](#preprocessing)
* [Modeling](#model)
    * [1. Logistic Regression](#logit)
    * [2. Neural Network](#nn)
    * [3. XGBoost](#xgb)
    * [4. Random Forest](#rf)
* [Model evaluation](#eva)
* [Adjusting threshold](#threshold)
* [Feature Selection](#fs)

# Data Dictionary <a class="anchor" id="datadic"></a>

In [2]:
from pandas import option_context
data_dic=pd.read_csv('Data_dictionary.csv',index_col=0 ,keep_default_na=False)
data_dic.columns = ['Description','']
with option_context('display.max_colwidth',400):
    display(data_dic)

In [3]:
X_train=pd.read_csv('X_train_NEWWWW.csv',index_col=0)
X_test=pd.read_csv('X_test_NEWWWW.csv',index_col=0)
y_train=pd.read_csv('y_train.csv')
y_test=pd.read_csv('y_test.csv')

In [4]:
X_train.head(3)

Unnamed: 0_level_0,DISBURSED_AMOUNT,ASSET_COST,LTV,BRANCH_ID,SUPPLIER_ID,MANUFACTURER_ID,CURRENT_PINCODE_ID,EMPLOYMENT_TYPE,STATE_ID,EMPLOYEE_CODE_ID,...,PRIMARY_INSTAL_AMT,SEC_INSTAL_AMT,NEW_ACCTS_IN_LAST_SIX_MONTHS,DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS,AVERAGE_ACCT_AGE,CREDIT_HISTORY_LENGTH,NO_OF_INQUIRIES,DISBURSAL_MONTH,DISBURSAL_DAY,AGE
UNIQUEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
502118,48849,67000,73.88,68,18332,86,854,Salaried,6,626,...,8319,0,2,0,3,7,2,12,9,26
508189,33341,89163,39.25,65,16166,48,6852,Self employed,13,2414,...,0,0,0,0,0,0,0,9,15,33
555903,55259,72900,76.82,2,23351,86,2382,Salaried,4,24,...,1999,0,1,0,12,19,0,3,10,30


### Categorical columns

In [5]:
X_train[['EMPLOYMENT_TYPE','PERFORM_CNS_SCORE_DESCRIPTION']].head(3)

Unnamed: 0_level_0,EMPLOYMENT_TYPE,PERFORM_CNS_SCORE_DESCRIPTION
UNIQUEID,Unnamed: 1_level_1,Unnamed: 2_level_1
502118,Salaried,F-Low Risk
508189,Self employed,No Bureau History Available
555903,Salaried,B-Very Low Risk


### Numeric columns

In [6]:
num_col=X_train.select_dtypes(include='number').columns.tolist()
num_col

['DISBURSED_AMOUNT',
 'ASSET_COST',
 'LTV',
 'BRANCH_ID',
 'SUPPLIER_ID',
 'MANUFACTURER_ID',
 'CURRENT_PINCODE_ID',
 'STATE_ID',
 'EMPLOYEE_CODE_ID',
 'AADHAR_FLAG',
 'PAN_FLAG',
 'VOTERID_FLAG',
 'DRIVING_FLAG',
 'PASSPORT_FLAG',
 'PERFORM_CNS_SCORE',
 'PRI_NO_OF_ACCTS',
 'PRI_ACTIVE_ACCTS',
 'PRI_OVERDUE_ACCTS',
 'PRI_CURRENT_BALANCE',
 'PRI_SANCTIONED_AMOUNT',
 'PRI_DISBURSED_AMOUNT',
 'SEC_NO_OF_ACCTS',
 'SEC_ACTIVE_ACCTS',
 'SEC_OVERDUE_ACCTS',
 'SEC_CURRENT_BALANCE',
 'SEC_SANCTIONED_AMOUNT',
 'SEC_DISBURSED_AMOUNT',
 'PRIMARY_INSTAL_AMT',
 'SEC_INSTAL_AMT',
 'NEW_ACCTS_IN_LAST_SIX_MONTHS',
 'DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS',
 'AVERAGE_ACCT_AGE',
 'CREDIT_HISTORY_LENGTH',
 'NO_OF_INQUIRIES',
 'DISBURSAL_MONTH',
 'DISBURSAL_DAY',
 'AGE']

# Define preprocessing steps <a class="anchor" id="preprocessing"></a>

In preprocessing, we would do **OneHotEncoding** for `'EMPLOYMENT_TYPE` and `PERFORM_CNS_SCORE_DESCRIPTION`, and **StandardScaling** the numeric columns.

In [7]:
categorical_features = ['EMPLOYMENT_TYPE','PERFORM_CNS_SCORE_DESCRIPTION']
numeric_features = num_col

ohe = Pipeline(steps=[("encoder", OneHotEncoder(drop = 'first'))])

scale = Pipeline(steps=[("scaler", StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[("ohe", ohe, categorical_features),
                  ("scale", scale, numeric_features)])

# Modeling  <a class="anchor" id="model"></a>

## 1. Logistic regression  <a class="anchor" id="logit"></a>

### Model fitting

In [8]:
pipe = Pipeline(
    steps=[("preprocessor", preprocessor),\
           ("classifier", LogisticRegression(random_state=17, max_iter=1000))])

param_grid = {'classifier__C': [0.0001,0.001,0.01,0.1]}

logit = GridSearchCV(pipe, param_grid, n_jobs = -1, verbose=3, return_train_score=True)
logit_fitted =logit.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [9]:
# best param
logit_fitted.best_params_

{'classifier__C': 0.0001}

In [10]:
# best validation score
logit_fitted.best_score_

0.7827409716066679

In [11]:
# score
print("Train accuracy: %.3f" % logit_fitted.score(X_train, y_train))
print("Test accuracy: %.3f" % logit_fitted.score(X_test, y_test))
print(f'Accuracy difference: {(logit_fitted.score(X_train, y_train)-logit_fitted.score(X_test, y_test)).round(4)}')

Train accuracy: 0.783
Test accuracy: 0.783
Accuracy difference: -0.0001


The accuracies are not bad and close to each other.

### Model evaluation

In [12]:
# look at confusion matrix
ConfusionMatrixDisplay.from_estimator(logit_fitted, X_test, y_test);

In [13]:
# look at classification_report
y_pred =  logit_fitted.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88     45636
           1       0.46      0.00      0.00     12653

    accuracy                           0.78     58289
   macro avg       0.62      0.50      0.44     58289
weighted avg       0.71      0.78      0.69     58289



Based on the precision, recall and f1-score, we see that logistic regression predicts pretty well on class 0 but poorly on class 1.

|  Logistic                |Class       |Precision |Recall   |F1
|--------------------------|-------------|----------|---------|---
| Train accuracy = 0.783  | Class  0   | 0.78     | 1.00    |  0.88
| Test accuracy  = 0.783  | Class  1   | 0.46    | 0.00    |  0.00

## 2. Neural Network  <a class="anchor" id="nn"></a>

### Model fitting

Let's start a baseline model with **4 hidden layers, 5 nodes each and 50 epoches**.

In [14]:
# random seeds for reproducibility
tf.random.set_seed(123)

# Create a new sequential model
model = keras.Sequential()

# Declare the hidden layers
model.add(layers.Dense(5, activation="relu"))
model.add(layers.BatchNormalization()) 

model.add(layers.Dense(5, activation="relu"))
model.add(layers.BatchNormalization()) 

model.add(layers.Dense(5, activation="relu"))
model.add(layers.BatchNormalization()) 

model.add(layers.Dense(5, activation="relu"))

# Declare the output layer
model.add(layers.Dense(1, activation="sigmoid"))

# Compile the model
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(),
    metrics= keras.metrics.BinaryAccuracy())

# define the pipeline
pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the pipeline to the training data
X_train_transformed = pipe.fit_transform(X_train)

# transform test set
X_test_transformed = pipe.transform(X_test)

# Train the model
history = model.fit(X_train_transformed, y_train, epochs=50, verbose=2)

2023-05-17 20:14:52.774212: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/50
5465/5465 - 8s - loss: 0.5187 - binary_accuracy: 0.7816 - 8s/epoch - 1ms/step
Epoch 2/50
5465/5465 - 6s - loss: 0.5068 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 3/50
5465/5465 - 6s - loss: 0.5048 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 4/50
5465/5465 - 6s - loss: 0.5039 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 5/50
5465/5465 - 6s - loss: 0.5034 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 6/50
5465/5465 - 6s - loss: 0.5029 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 7/50
5465/5465 - 6s - loss: 0.5028 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 8/50
5465/5465 - 6s - loss: 0.5023 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 9/50
5465/5465 - 6s - loss: 0.5023 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 10/50
5465/5465 - 7s - loss: 0.5021 - binary_accuracy: 0.7829 - 7s/epoch - 1ms/step
Epoch 11/50
5465/5465 - 6s - loss: 0.5018 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 12/50
5465/54

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 3/5] END classifier__C=0.001;, score=(train=0.783, test=0.783) total time=   2.9s
[CV 5/5] END classifier__C=0.01;, score=(train=0.783, test=0.783) total time=   4.6s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 1/5] END classifier__C=0.001;, score=(train=0.783, test=0.783) total time=   2.8s
[CV 4/5] END classifier__C=0.01;, score=(train=0.783, test=0.783) total time=   4.8s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 2/5] END classifier__C=0.001;, score=(train=0.783, test=0.782) total time=   3.1s
[CV 1/5] END classifier__C=0.1;, score=(train=0.783, test=0.783) total time=   5.7s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


5465/5465 - 6s - loss: 0.5006 - binary_accuracy: 0.7830 - 6s/epoch - 1ms/step
Epoch 45/50
[CV 5/5] END classifier__C=0.0001;, score=(train=0.783, test=0.783) total time=   1.7s
[CV 4/5] END classifier__C=0.001;, score=(train=0.783, test=0.783) total time=   2.9s
[CV 3/5] END classifier__C=0.1;, score=(train=0.783, test=0.782) total time=   4.5s
[CV 2/5] END classifier__C=0.0001;, score=(train=0.783, test=0.783) total time=   1.9s
[CV 5/5] END classifier__C=0.001;, score=(train=0.783, test=0.783) total time=   2.8s
[CV 2/5] END classifier__C=0.1;, score=(train=0.783, test=0.783) total time=   4.7s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV 1/5] END classifier__C=0.0001;, score=(train=0.783, test=0.783) total time=   2.3s
[CV 1/5] END classifier__C=0.01;, score=(train=0.783, test=0.783) total time=   4.2s
[CV 4/5] END classifier__C=0.1;, score=(train=0.783, test=0.782) total time=   3.7s
[CV 4/5] END classifier__C=0.0001;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 3/5] END classifier__C=0.01;, score=(train=0.783, test=0.782) total time=   4.6s
[CV 5/5] END classifier__C=0.1;, score=(train=0.783, test=0.783) total time=   3.2s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


5465/5465 - 6s - loss: 0.5008 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 46/50
5465/5465 - 6s - loss: 0.5005 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 47/50
5465/5465 - 6s - loss: 0.5004 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 48/50
5465/5465 - 6s - loss: 0.5002 - binary_accuracy: 0.7829 - 6s/epoch - 1ms/step
Epoch 49/50
5465/5465 - 6s - loss: 0.5003 - binary_accuracy: 0.7828 - 6s/epoch - 1ms/step
Epoch 50/50
5465/5465 - 6s - loss: 0.5004 - binary_accuracy: 0.7830 - 6s/epoch - 1ms/step


In [15]:
# Score
train_accuracy = history.history["binary_accuracy"][-1]
test_accuracy = model.evaluate(X_test_transformed,y_test)

print(f"Train Accuracy: {np.round(train_accuracy,3)}")
print(f"Test Accuracy: {np.round(test_accuracy[1],3)}")
print(f"Accuracy difference: {np.round(train_accuracy,3)-np.round(test_accuracy[1],3)}")

Train Accuracy: 0.783
Test Accuracy: 0.783
Accuracy difference: 0.0


We see that this basic neural network gives the same result as logistic regression, **train accuracy = test accuracy = 0.783**, this is probably because our dataset is simple to learn.

### Model evaluation

In [16]:
# Confusion matrix
y_pred = model.predict(X_test_transformed).round().astype(int)
confusion_matrix(y_test, y_pred)

array([[45562,    74],
       [12588,    65]])

In [17]:
# look at classification_report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88     45636
           1       0.47      0.01      0.01     12653

    accuracy                           0.78     58289
   macro avg       0.63      0.50      0.44     58289
weighted avg       0.71      0.78      0.69     58289



|  Logistic                |Class       |Precision |Recall   |F1
|--------------------------|-------------|----------|---------|---
| Train accuracy = 0.783  | Class  0   | 0.78     | 1.00    |  0.88
| Test accuracy  = 0.783  | Class  1   | 0.46    | 0.00    |  0.00

|  Neuron Network          |Class  |Precision     |Recall      |F1
|--------------------------|----------------|--------------|--------------|---
| Train accuracy = 0.783 | Class  0    | 0.78    |  1.00   |   0.88
| Test accuracy  = 0.783  | Class  1    | 0.47     | 0.01    |  0.01

## 3. XGBoost  <a class="anchor" id="xgb"></a>

In [18]:
pipe = Pipeline(steps=[("preprocessor", preprocessor), 
                       ("classifier", XGBClassifier(random_state=0))])
parameters = {
    'classifier__learning_rate': [0.0001, 0.001, 0.01, 0.1],
    'classifier__n_estimators': [50,70,80]}
    
xgb = GridSearchCV(pipe, parameters, n_jobs = -1, verbose = 3)
xgb_fitted=xgb.fit(X_train, y_train);

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [19]:
# best parameters 
print(f"best parameters: {xgb_fitted.best_params_}")

best parameters: {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 50}


In [20]:
# best score
print(f"best score: {xgb_fitted.best_score_}")

best score: 0.7836216509879049


In [21]:
# score
print(f"Train Accuracy: {xgb_fitted.score(X_train, y_train).round(3)}")
print(f"Test Accuracy: {xgb_fitted.score(X_test, y_test).round(3)}")
print(f"Accuracy difference: {xgb_fitted.score(X_train, y_train).round(3) - xgb_fitted.score(X_test, y_test).round(3)}")

Train Accuracy: 0.785
Test Accuracy: 0.784
Accuracy difference: 0.0010000000000000009


We see the same accuracy scores as Neural Network and Logistics Regression.

### Model evaluation

In [22]:
# Confusion matrix
y_pred = xgb_fitted.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[45593,    43],
       [12567,    86]])

In [23]:
# classification report
print(classification_report(y_test, y_pred));

              precision    recall  f1-score   support

           0       0.78      1.00      0.88     45636
           1       0.67      0.01      0.01     12653

    accuracy                           0.78     58289
   macro avg       0.73      0.50      0.45     58289
weighted avg       0.76      0.78      0.69     58289



|  Logistic                |Class       |Precision |Recall   |F1
|--------------------------|-------------|----------|---------|---
| Train accuracy = 0.783  | Class  0   | 0.78     | 1.00    |  0.88
| Test accuracy  = 0.783  | Class  1   | 0.46    | 0.00    |  0.00

|  Neuron Network          |Class  |Precision     |Recall      |F1
|--------------------------|----------------|--------------|--------------|---
| Train accuracy = 0.783 | Class  0    | 0.78    |  1.00   |   0.88
| Test accuracy  = 0.783  | Class  1    | 0.47     | 0.01    |  0.01

|  XGBoost          |Class  |Precision     |Recall      |F1
|--------------------------|----------------|--------------|--------------|---
| Train accuracy = 0.785 | Class  0    |  0.78   |   1.00   |   0.88
| Test accuracy  = 0.784  | Class  1    | 0.67    |  0.01    |  0.01

Given the same accuracies, XGBoost tends to perform better than Neural Network and Logistics Regression in terms of classifying class 1.

## 4. Random Forest  <a class="anchor" id="rf"></a>

### Model fitting

In [24]:
rf_preprocessor = ColumnTransformer(transformers=[("ohe", ohe, categorical_features)])

# instantiate
pipe = Pipeline(steps=[("preprocessor", rf_preprocessor), 
                       ("classifier", RandomForestClassifier(random_state=10))])

param_grid = {'classifier__n_estimators':[35,37,39],
              'classifier__max_depth':[10,12,14],
              'classifier__min_samples_leaf':[3,5,7]}

rf = GridSearchCV(pipe, param_grid, verbose=3, return_train_score=True)

# fit
rf_fitted=rf.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END classifier__max_depth=10, classifier__min_samples_leaf=3, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   2.3s
[CV 2/5] END classifier__max_depth=10, classifier__min_samples_leaf=3, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   1.8s
[CV 3/5] END classifier__max_depth=10, classifier__min_samples_leaf=3, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   1.8s
[CV 4/5] END classifier__max_depth=10, classifier__min_samples_leaf=3, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   1.9s
[CV 5/5] END classifier__max_depth=10, classifier__min_samples_leaf=3, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   1.8s
[CV 1/5] END classifier__max_depth=10, classifier__min_samples_leaf=3, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   1.8s
[CV 2/5] END classifier__m

  from pandas import MultiIndex, Int64Index
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[CV 2/5] END classifier__max_depth=12, classifier__min_samples_leaf=7, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.0s
[CV 3/5] END classifier__max_depth=12, classifier__min_samples_leaf=7, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.0s
[CV 4/5] END classifier__max_depth=12, classifier__min_samples_leaf=7, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.0s
[CV 5/5] END classifier__max_depth=12, classifier__min_samples_leaf=7, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.0s
[CV 1/5] END classifier__max_depth=12, classifier__min_samples_leaf=7, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 5/5] END classifier__learning_rate=0.0001, classifier__n_estimators=50;, score=0.783 total time= 1.5min
[CV 3/5] END classifier__learning_rate=0.0001, classifier__n_estimators=80;, score=0.783 total time= 2.3min
[CV 2/5] END classifier_

  from pandas import MultiIndex, Int64Index
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[CV 2/5] END classifier__max_depth=12, classifier__min_samples_leaf=7, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 3/5] END classifier__max_depth=12, classifier__min_samples_leaf=7, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 4/5] END classifier__max_depth=12, classifier__min_samples_leaf=7, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 5/5] END classifier__max_depth=12, classifier__min_samples_leaf=7, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 1/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   2.0s
[CV 1/5] END classifier__learning_rate=0.0001, classifier__n_estimators=50;, score=0.783 total time= 1.5min
[CV 2/5] END classifier__learning_rate=0.0001, classifier__n_estimators=80;, score=0.783 total time= 2.3min
[CV 5/5] END classifier_

  from pandas import MultiIndex, Int64Index
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[CV 2/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   2.0s
[CV 3/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 4/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   2.0s
[CV 5/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 1/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 2/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 3/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_est

  from pandas import MultiIndex, Int64Index
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[CV 4/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 4/5] END classifier__learning_rate=0.0001, classifier__n_estimators=50;, score=0.782 total time= 1.5min
[CV 5/5] END classifier__learning_rate=0.0001, classifier__n_estimators=70;, score=0.783 total time= 2.0min
[CV 2/5] END classifier__learning_rate=0.001, classifier__n_estimators=50;, score=0.783 total time= 1.4min
[CV 5/5] END classifier__learning_rate=0.001, classifier__n_estimators=70;, score=0.783 total time= 2.1min
[CV 3/5] END classifier__learning_rate=0.01, classifier__n_estimators=50;, score=0.783 total time= 1.4min
[CV 1/5] END classifier__learning_rate=0.01, classifier__n_estimators=80;, score=0.783 total time= 2.9min
[CV 4/5] END classifier__learning_rate=0.1, classifier__n_estimators=50;, score=0.783 total time= 1.8min
[CV 2/5] END classifier__learning_rate=0.1, classifier__n_estimators=80;, score=0.784 total time= 2.3

  from pandas import MultiIndex, Int64Index
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[CV 5/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 1/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 2/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 3/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.3s
[CV 2/5] END classifier__learning_rate=0.0001, classifier__n_estimators=70;, score=0.783 total time= 2.1min
[CV 1/5] END classifier__learning_rate=0.001, classifier__n_estimators=50;, score=0.783 total time= 1.5min
[CV 3/5] END classifier__learning_rate=0.001, classifier__n_estimators=50;, score=0.783 total time= 1.4min
[CV 2/5] END classifier__learning_rate=0.001, classifier__n_estimators

  from pandas import MultiIndex, Int64Index
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[CV 4/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 5/5] END classifier__max_depth=14, classifier__min_samples_leaf=3, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 1/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   2.0s
[CV 2/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   2.0s
[CV 3/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   2.0s
[CV 4/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=35;, score=(train=0.783, test=0.783) total time=   2.0s
[CV 5/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_est

  from pandas import MultiIndex, Int64Index
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[CV 1/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 2/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 3/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 4/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 1/5] END classifier__learning_rate=0.0001, classifier__n_estimators=70;, score=0.783 total time= 2.1min
[CV 4/5] END classifier__learning_rate=0.0001, classifier__n_estimators=80;, score=0.782 total time= 2.3min
[CV 4/5] END classifier__learning_rate=0.001, classifier__n_estimators=70;, score=0.783 total time= 2.1min
[CV 2/5] END classifier__learning_rate=0.01, classifier__n_estimators

  from pandas import MultiIndex, Int64Index
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[CV 5/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=37;, score=(train=0.783, test=0.783) total time=   2.1s
[CV 1/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 2/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 3/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.3s
[CV 4/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 5/5] END classifier__max_depth=14, classifier__min_samples_leaf=5, classifier__n_estimators=39;, score=(train=0.783, test=0.783) total time=   2.2s
[CV 1/5] END classifier__max_depth=14, classifier__min_samples_leaf=7, classifier__n_est

In [25]:
# best model
rf_fitted.best_params_

{'classifier__max_depth': 10,
 'classifier__min_samples_leaf': 3,
 'classifier__n_estimators': 35}

In [26]:
# best validation score
rf_fitted.best_score_

0.7829296886169329

In [27]:
# score
print(f"Train Accuracy: {rf_fitted.score(X_train, y_train).round(3)}")
print(f"Test Accuracy: {rf_fitted.score(X_test, y_test).round(3)}")
print(f"Accuracy difference: {(rf_fitted.score(X_train, y_train) - rf_fitted.score(X_test, y_test)).round(3)}")

Train Accuracy: 0.783
Test Accuracy: 0.783
Accuracy difference: 0.0


### Model evaluation

In [28]:
# Confusion matrix
y_pred = rf_fitted.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[45636,     0],
       [12653,     0]])

In [29]:
# look at classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88     45636
           1       0.00      0.00      0.00     12653

    accuracy                           0.78     58289
   macro avg       0.39      0.50      0.44     58289
weighted avg       0.61      0.78      0.69     58289



|  Logistic                |Class       |Precision |Recall   |F1
|--------------------------|-------------|----------|---------|---
| Train accuracy = 0.783  | Class  0   | 0.78     | 1.00    |  0.88
| Test accuracy  = 0.783  | Class  1   | 0.46    | 0.00    |  0.00

|  Neuron Network          |Class  |Precision     |Recall      |F1
|--------------------------|----------------|--------------|--------------|---
| Train accuracy = 0.783 | Class  0    | 0.78    |  1.00   |   0.88
| Test accuracy  = 0.783  | Class  1    | 0.47     | 0.01    |  0.01

|  XGBoost          |Class  |Precision     |Recall      |F1
|--------------------------|----------------|--------------|--------------|---
| Train accuracy = 0.785 | Class  0    |  0.78   |   1.00   |   0.88
| Test accuracy  = 0.784  | Class  1    | 0.67    |  0.01    |  0.01

|  Random Forest          |Class  |Precision     |Recall      |F1
|--------------------------|----------------|--------------|--------------|---
| Train accuracy = 0.783 | Class  0    | 0.78    |  1.00   |   0.88
| Test accuracy  = 0.783  | Class  1    | 0.00     | 0.00    |  0.00

We see that throughout 4 models, the accuracies are constant around **78.3%**, this means that this is the best accuracy that could be learned from this dataset. At threshold 0.5, this level of accuracy is not ideal at all, so to improve the accuracy or to more precisely classify the target, the dataset would require additional important features that are highly meaningful to the target. This necessitates domain expertise and is a room for improvement.

Misclassifying class 1 would cause significantly financial losses, therefore recall of class 1 should be prioritized. Let's adjust threshold to see which one would give the ideal learning result for class 1.

# Threshold Adjustment <a class="anchor" id="#threshold"></a>

In [30]:
thresholds = np.arange(0.1, 0.51, 0.01)

##################### LOGISTIC
lg_precision_1 = []
lg_recall_1 = []
lg_precision_0 = []
lg_recall_0 = []

# Make probabilistic predictions (scores)
logit_y_proba_train = logit_fitted.predict_proba(X_train)[:,1]

for threshold in thresholds:

    # Apply threshold
    y_threshold = np.where(logit_y_proba_train >= threshold, 1, 0)
    logit_report=classification_report(y_train, y_threshold,output_dict=True)

    # Class 1
    lg_precision_1.append(logit_report['1']['precision'])
    lg_recall_1.append(logit_report['1']['recall'])
    
    # Class 0
    lg_precision_0.append(logit_report['0']['precision'])
    lg_recall_0.append(logit_report['0']['recall'])
    
##################### NEURAL NETWORK
nn_precision_1=[]
nn_recall_1=[]
nn_precision_0=[]
nn_recall_0=[]

nn_y_proba_train= model.predict(X_train_transformed)

for threshold in thresholds:

    # Apply threshold
    y_threshold = np.where(nn_y_proba_train >= threshold, 1, 0)
    nn_report=classification_report(y_train, y_threshold,output_dict=True)

    # Class 1   
    nn_precision_1.append(nn_report['1']['precision'])
    nn_recall_1.append(nn_report['1']['recall'])
    
    # Class 0
    nn_precision_0.append(nn_report['0']['precision'])
    nn_recall_0.append(nn_report['0']['recall'])

##################### XGBoost
xg_precision_1 = []
xg_recall_1 = []
xg_precision_0 = []
xg_recall_0 = []

xg_y_proba_train=xgb_fitted.predict_proba(X_train)[:, 1]

for threshold in thresholds:

    # Apply threshold
    y_threshold = np.where(xg_y_proba_train >= threshold, 1, 0)
    xg_report=classification_report(y_train, y_threshold,output_dict=True)

    # class 1
    xg_precision_1.append(xg_report['1']['precision'])
    xg_recall_1.append(xg_report['1']['recall'])
    
    # class 0
    xg_precision_0.append(xg_report['0']['precision'])
    xg_recall_0.append(xg_report['0']['recall'])
    
##################### RANDOM FOREST    
rf_precision_1 = []
rf_recall_1 = []
rf_precision_0 = []
rf_recall_0 = []

rf_y_proba_train=rf_fitted.predict_proba(X_train)[:, 1]

for threshold in thresholds:

    # Apply threshold
    y_threshold = np.where(rf_y_proba_train >= threshold, 1, 0)
    rf_report=classification_report(y_train, y_threshold,output_dict=True)
    
    # class 1 
    rf_precision_1.append(rf_report['1']['precision'])
    rf_recall_1.append(rf_report['1']['recall'])
    
    # class 0 
    rf_precision_0.append(rf_report['0']['precision'])
    rf_recall_0.append(rf_report['0']['recall'])

In [31]:
class_1=pd.DataFrame({'LG_precision_1':lg_precision_1, 'LG_recall_1': lg_recall_1,\
                 'NN_precision_1':nn_precision_1,'NN_recall_1':nn_recall_1,\
                 'XG_precision_1':xg_precision_1,'XG_recall_1':xg_recall_1,\
                 'RF_precision_1':rf_precision_1,'RF_recall_1':rf_recall_1},index=thresholds)
class_1

Unnamed: 0,LG_precision_1,LG_recall_1,NN_precision_1,NN_recall_1,XG_precision_1,XG_recall_1,RF_precision_1,RF_recall_1
0.1,0.219469,0.993914,0.231542,0.962406,0.228712,0.988224,0.21707,1.0
0.11,0.221004,0.989884,0.236926,0.943622,0.233743,0.979451,0.21707,1.0
0.12,0.222938,0.983087,0.24277,0.921993,0.239305,0.967043,0.21707,1.0
0.13,0.225801,0.974366,0.24852,0.899231,0.245628,0.952026,0.219204,0.989936
0.14,0.229056,0.962195,0.253979,0.878287,0.252537,0.934322,0.219204,0.989936
0.15,0.233156,0.945571,0.259167,0.853522,0.259218,0.913062,0.221248,0.974498
0.16,0.23819,0.9241,0.264672,0.827862,0.266877,0.889957,0.222771,0.961695
0.17,0.244196,0.896754,0.269847,0.799673,0.274376,0.864113,0.226633,0.920755
0.18,0.250773,0.860741,0.274964,0.769772,0.282271,0.83516,0.227523,0.911112
0.19,0.25832,0.81714,0.280208,0.740634,0.290284,0.80539,0.234415,0.836398


In [32]:
class_0=pd.DataFrame({'LG_precision_0':lg_precision_0, 'LG_recall_0': lg_recall_0,\
                 'NN_precision_0':nn_precision_0,'NN_recall_0':nn_recall_0,\
                 'XG_precision_0':xg_precision_0,'XG_recall_0':xg_recall_0,\
                 'RF_precision_0':rf_precision_0,'RF_recall_0':rf_recall_0},index=thresholds)
class_0

Unnamed: 0,LG_precision_0,LG_recall_0,NN_precision_0,NN_recall_0,XG_precision_0,XG_recall_0,RF_precision_0,RF_recall_0
0.1,0.922065,0.019962,0.916511,0.114421,0.958821,0.076022,0.0,0.0
0.11,0.920825,0.032621,0.909655,0.157384,0.950664,0.109783,0.0,0.0
0.12,0.914194,0.049961,0.903576,0.20267,0.941746,0.147721,0.0,0.0
0.13,0.912105,0.073751,0.898054,0.246116,0.934364,0.189348,0.889115,0.022373
0.14,0.906909,0.102113,0.894042,0.284733,0.927592,0.233275,0.889115,0.022373
0.15,0.901266,0.13775,0.888481,0.323555,0.919831,0.27656,0.873909,0.049004
0.16,0.895616,0.180553,0.883606,0.362312,0.913495,0.322182,0.867842,0.069741
0.17,0.889522,0.230478,0.8781,0.400089,0.906762,0.366402,0.854341,0.128869
0.18,0.881427,0.287012,0.872609,0.437238,0.899981,0.411235,0.852425,0.142352
0.19,0.873323,0.349522,0.867916,0.472518,0.89379,0.45406,0.842506,0.242646


>It depends on the business's risk tolerance that the approriate threshold should be chosen. However, at threshold **0.2** in **random forest** model, precision and recall for both classes look quite balanced. Therefore, I would choose this threshold for classifying this dataset.

### Scoring test set - Random Forest - Threshold 0.2

In [33]:
rf_precision_1 = []
rf_recall_1 = []
rf_precision_0 = []
rf_recall_0 = []

rf_y_proba_test=rf_fitted.predict_proba(X_test)[:, 1]

# Apply threshold
y_threshold = np.where(rf_y_proba_test >= 0.2, 1, 0)
rf_report=classification_report(y_test, y_threshold,output_dict=True)

print('======TEST======')
print('Class 0 - Precision: ', np.round(rf_report['0']['precision'],4))
print('Class 0 - Recall: ', np.round(rf_report['0']['recall'],4))

print('\nClass 1 - Precision: ',np.round(rf_report['1']['precision'],4))
print('Class 1 - Recall: ',np.round(rf_report['1']['recall'],4))

Class 0 - Precision:  0.8361
Class 0 - Recall:  0.298

Class 1 - Precision:  0.2377
Class 1 - Recall:  0.7894


# Feature Selection <a class="anchor" id="fs"></a>

In [34]:
X_train_copy=X_train.drop(columns=['EMPLOYMENT_TYPE','PERFORM_CNS_SCORE_DESCRIPTION'])

In [43]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

kbest = SelectKBest(f_regression, k=5).fit(X_train_copy, y_train)

# See selected features
kbest_features = X_train_copy.columns[kbest.get_support()]

print(f"===== {len(kbest_features)} features were selected =====")
print(f"{', '.join(kbest_features)}")

===== 5 features were selected =====
DISBURSED_AMOUNT, LTV, STATE_ID, PERFORM_CNS_SCORE, CREDIT_HISTORY_LENGTH


In [36]:
X_train_5=X_train[['DISBURSED_AMOUNT', 'LTV', 'STATE_ID', 'PERFORM_CNS_SCORE', 'CREDIT_HISTORY_LENGTH']]

X_test_5=X_test[['DISBURSED_AMOUNT', 'LTV', 'STATE_ID', 'PERFORM_CNS_SCORE', 'CREDIT_HISTORY_LENGTH']]

In [37]:
# instantiate and fit the chosen classifier
rf_new = RandomForestClassifier(n_estimators=35, min_samples_leaf=3, max_depth=10, random_state=100)
rf_new.fit(X_train_5,y_train)

In [38]:
# train score
rf_new.score(X_train_5,y_train)

0.7829468447087753

In [39]:
# test score
rf_new.score(X_test_5,y_test)

0.7829264526754619

We see that these 5 features produced the same accuracy **~0.783** with the original 40 ones.

In [40]:
rf_y_proba_train=rf_new.predict_proba(X_train_5)[:, 1]

# Apply threshold
y_threshold = np.where(rf_y_proba_train >= 0.2, 1, 0)
rf_report=classification_report(y_train, y_threshold,output_dict=True)

print('======TRAIN======')
print('Class 0 - Precision: ', np.round(rf_report['0']['precision'],4))
print('Class 0 - Recall: ', np.round(rf_report['0']['recall'],4))

print('\nClass 1 - Precision: ',np.round(rf_report['1']['precision'],4))
print('Class 1 - Recall: ',np.round(rf_report['1']['recall'],4))

Class 0 - Precision:  0.8843
Class 0 - Recall:  0.4845

Class 1 - Precision:  0.2932
Class 1 - Recall:  0.7713


In [41]:
rf_y_proba_test=rf_new.predict_proba(X_test_5)[:, 1]

# Apply threshold
y_threshold = np.where(rf_y_proba_test >= 0.2, 1, 0)
rf_report=classification_report(y_test, y_threshold,output_dict=True)

print('======TEST======')
print('Class 0 - Precision: ', np.round(rf_report['0']['precision'],4))
print('Class 0 - Recall: ', np.round(rf_report['0']['recall'],4))

print('\nClass 1 - Precision: ',np.round(rf_report['1']['precision'],4))
print('Class 1 - Recall: ',np.round(rf_report['1']['recall'],4))

Class 0 - Precision:  0.8556
Class 0 - Recall:  0.4666

Class 1 - Precision:  0.2712
Class 1 - Recall:  0.7159
