In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import bentoml
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import confusion_matrix,recall_score,precision_score, f1_score, classification_report
import pickle
import pickle
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import pickle

In [25]:
# Load the pickled feature table (X) and target vector (y) from the working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load artifacts produced by a trusted pipeline.
# Context managers make sure both file handles are closed once the data is read.
with open('features.pkl', 'rb') as features_file:
    X = pickle.load(features_file)
with open('target.pkl', 'rb') as target_file:
    y = pickle.load(target_file)

In [30]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print('Train and test samples')
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

# Turn each row into a {column: value} record so DictVectorizer can encode it.
train_dicts = X_train.to_dict(orient='records')
test_dicts = X_test.to_dict(orient='records')

# The vectorizer is fitted on the training records only and then reused,
# unchanged, for the test records. Both splits become dense arrays.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_test = dv.transform(test_dicts)

print('\nAfter one hot encoding')
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

Train and test samples
(10499, 11) (10499,)
(4500, 11) (4500,)

After one hot encoding
(10499, 25) (10499,)
(4500, 25) (4500,)


# Model Selection

The dataset is fairly simple, so a random forest should be enough to achieve satisfactory results.

In [31]:
# Baseline: a random forest with library-default hyperparameters and a fixed seed.
baseline_params = {"random_state": 42}
rf = RandomForestClassifier(**baseline_params)

In [32]:
# Train the baseline forest on the encoded training split.
rf.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=42)

In [33]:
# Hard class predictions for every row of the hold-out split.
rf_pred = rf.predict(X_test)
# Probability of the positive class for every test row: predict_proba returns
# one column per class, so select column 1 with `[:, 1]`. (A `[:1]` slice would
# instead keep only the first row's probabilities.)
rf_pred_prob = rf.predict_proba(X_test)[:, 1]

Confusion matrix

In [34]:
# Unpack the 2x2 confusion matrix row by row:
# first row is the actual-negative class, second row the actual-positive class.
(tn, fp), (fn, tp) = confusion_matrix(y_test, rf_pred)

In [35]:
# Lay the counts out as a labelled table: rows are the true class,
# columns are the predicted class.
con_matrix = pd.DataFrame(
    [[tp, fn],
     [fp, tn]],
    index=["Actual Churn", "Actual not Churn"],
    columns=["Predicted Churn", "Predicted not Churn"],
)
con_matrix

Unnamed: 0,Predicted Churn,Predicted not Churn
Actual Churn,1023,49
Actual not Churn,11,3417


In [36]:
# Per-class precision / recall / F1 for the baseline forest on the hold-out split.
print(classification_report(y_true=y_test, y_pred=rf_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3428
           1       0.99      0.95      0.97      1072

    accuracy                           0.99      4500
   macro avg       0.99      0.98      0.98      4500
weighted avg       0.99      0.99      0.99      4500



In [37]:
# Show the hyperparameters the baseline forest was built with (mostly defaults).
rf.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [38]:
# Sweep tree depth and forest size, recording the confusion-matrix counts of each fit.
# NOTE(review): every configuration is scored on X_test, the same split used for
# the final report — consider a separate validation split or cross-validation.
max_depth = [20, 25, 50, 100]
score_columns = ['max_depth', 'n_estimators', 'tn', 'fp', 'fn', 'tp']
conf_matrix = []

for depth in max_depth:
    # One progress bar per depth; n_estimators takes the values 10, 110, ..., 910.
    for n in tqdm(range(10, 1000, 100)):
        rf_test = RandomForestClassifier(
            n_estimators=n,
            max_depth=depth,
            bootstrap=False,
            random_state=1,
            n_jobs=-1,
        )
        rf_test.fit(X_train, y_train)
        y_pred = rf_test.predict(X_test)

        # Rows of the 2x2 matrix: actual negatives first, then actual positives.
        (tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)
        conf_matrix.append((depth, n, tn, fp, fn, tp))

# One row per (max_depth, n_estimators) combination.
df_scores = pd.DataFrame(conf_matrix, columns=score_columns)

100%|██████████| 10/10 [00:20<00:00,  2.06s/it]
100%|██████████| 10/10 [00:21<00:00,  2.14s/it]
100%|██████████| 10/10 [00:22<00:00,  2.20s/it]
100%|██████████| 10/10 [00:22<00:00,  2.24s/it]


In [39]:
# Configurations with the fewest false negatives (positive cases the model
# failed to catch); show the first three.
fewest_fn = df_scores["fn"].min()
df_scores.loc[df_scores["fn"].eq(fewest_fn)].head(3)

Unnamed: 0,max_depth,n_estimators,tn,fp,fn,tp
7,20,710,3418,10,48,1024
8,20,810,3418,10,48,1024
9,20,910,3418,10,48,1024


In [40]:
# Configurations with the fewest false positives (negative cases wrongly
# predicted as positive); show the first three.
fewest_fp = df_scores["fp"].min()
df_scores.loc[df_scores["fp"].eq(fewest_fp)].head(3)

Unnamed: 0,max_depth,n_estimators,tn,fp,fn,tp
1,20,110,3419,9,49,1023
2,20,210,3419,9,49,1023
6,20,610,3419,9,49,1023


Both false negatives and false positives are undesirable. For this problem, however, false positives are tolerable, whereas a false negative means failing to flag an employee who is about to leave. We therefore choose the parameters that minimise false negatives, which reduces them by one (from 49 to 48) compared with the baseline model.

### Final Model

In [41]:
# Refit with the first sweep configuration that reached the minimum
# false-negative count (max_depth=20, n_estimators=710).
final_params = dict(
    n_estimators=710,
    max_depth=20,
    bootstrap=False,
    random_state=1,
    n_jobs=-1,
)
model = RandomForestClassifier(**final_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [42]:
# Unpack the final model's 2x2 confusion matrix row by row:
# first row is the actual-negative class, second row the actual-positive class.
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)

In [43]:
# Labelled confusion table for the final model: rows are the true class,
# columns are the predicted class.
con_matrix = pd.DataFrame(
    [[tp, fn],
     [fp, tn]],
    index=["Actual Churn", "Actual not Churn"],
    columns=["Predicted Churn", "Predicted not Churn"],
)
con_matrix

Unnamed: 0,Predicted Churn,Predicted not Churn
Actual Churn,1024,48
Actual not Churn,10,3418


In [44]:
# Per-class precision / recall / F1 for the final model on the hold-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3428
           1       0.99      0.96      0.97      1072

    accuracy                           0.99      4500
   macro avg       0.99      0.98      0.98      4500
weighted avg       0.99      0.99      0.99      4500



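Recall for the churn class rises from 0.95 to 0.96 in the rounded report; in absolute terms the model catches one more churner (1024 instead of 1023 out of 1072). We are good to go from here. Let's save our model so it can be deployed as a web service.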

In [45]:
# Save the final forest to the BentoML model store under 'employee_churn_model'.
# The fitted DictVectorizer travels with it as a custom object — presumably so
# the serving code can encode incoming records the same way; confirm in the service.
# Both inference methods are registered as batchable along dimension 0.
bentoml.sklearn.save_model(
    'employee_churn_model',
    model,
    custom_objects={'dictVectorizer': dv},
    signatures={
        method: {"batchable": True, "batch_dim": 0}
        for method in ("predict", "predict_proba")
    },
)


Model(tag="employee_churn_model:dsf37ns7oclduv66", path="/home/yastaha/bentoml/models/employee_churn_model/dsf37ns7oclduv66/")