In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold 
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, recall_score, precision_score, f1_score, average_precision_score 
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.metrics import plot_precision_recall_curve, precision_recall_curve


sns.set(color_codes = True)

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# plt.style.use('fivethirtyeight')

%matplotlib inline

In [23]:
data_file = 'IBM-Employee-Attrition.csv'

# import cafe listings into dataframe
emp = pd.read_csv(data_file)

emp = emp.drop(columns=['EmployeeCount', 'EmployeeNumber','Over18'])


attrition_map={'Yes':1,'No':0}
emp['Attrition'] = emp['Attrition'].map(attrition_map)

In [3]:
emp.shape

(1470, 32)

In [24]:
# get all numerical columns
numerical_dtypes = ['int16','int32', 'int64','float16','float32','float64']
num_cols = []
for i in emp.columns:
    if emp[i].dtype in numerical_dtypes:
        num_cols.append(i)
        
print(len(num_cols))
print(num_cols)

25
['Age', 'Attrition', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


In [25]:
# get all category columns

# cat_cols = list(set(emp.columns) - set(num_cols))

cat_cols = emp.columns.difference(num_cols)

print(len(cat_cols))
print(cat_cols)

7
Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime'], dtype='object')


In [26]:
attri_yes = emp[emp.Attrition==1]
attri_no =  emp[emp.Attrition==0]

## FEATURES ENGINEERING

In [27]:
gender_map={'Male':1,'Female':0}
emp['GenderMale'] = emp['Gender'].map(gender_map)

overtime_map={'Yes':1,'No':0}
emp['OverTime'] = emp['OverTime'].map(overtime_map)

emp.head(3)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,GenderMale
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,1,11,3,1,80,0,8,0,1,6,4,0,5,0
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,0,23,4,4,80,1,10,3,3,10,7,1,7,1
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,1,15,3,2,80,0,7,3,3,0,0,0,0,1


In [28]:
# get all numerical columns
numerical_dtypes = ['int16','int32', 'int64','float16','float32','float64']
        
num_cols = [i for i in emp.columns if emp[i].dtype in numerical_dtypes]        
        
print(len(num_cols))
print(num_cols)

27
['Age', 'Attrition', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'GenderMale']


In [29]:
# get all category columns
# emp = emp.drop(columns=['Gender'])

# cat_cols = list(set(emp.columns) - set(num_cols))

cat_cols = emp.columns.difference(num_cols)

print(len(cat_cols))
print(cat_cols)

6
Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus'], dtype='object')


In [30]:
# HOT ENCODE nominal cols

emp = pd.get_dummies(data = emp, columns = cat_cols)
emp.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,GenderMale,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,41,1,1102,1,2,2,94,3,2,4,5993,19479,8,1,11,3,1,80,0,8,0,1,6,4,0,5,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
1,49,0,279,8,1,3,61,2,2,2,5130,24907,1,0,23,4,4,80,1,10,3,3,10,7,1,7,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
2,37,1,1373,2,2,4,92,2,1,3,2090,2396,6,1,15,3,2,80,0,7,3,3,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1
3,33,0,1392,3,4,4,56,3,1,3,2909,23159,1,1,11,3,3,80,0,8,3,3,8,7,3,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
4,27,0,591,2,1,1,40,3,1,2,3468,16632,9,0,12,3,4,80,1,6,3,3,2,2,2,2,1,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0


In [31]:
emp_income_fitler = emp[(emp.MonthlyIncome < 15000)]

In [32]:
emp['MonthlyIncome2'] = np.log1p(emp['MonthlyIncome'])

In [33]:
emp_income_fitler['MonthlyIncome3'] = np.log1p(emp_income_fitler['MonthlyIncome'])
emp_income_fitler['MonthlyIncome3']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emp_income_fitler['MonthlyIncome3'] = np.log1p(emp_income_fitler['MonthlyIncome'])


0       8.698514
1       8.543056
2       7.645398
3       7.975908
4       8.151622
          ...   
1465    7.852439
1466    9.209540
1467    8.723069
1468    8.592486
1469    8.390496
Name: MonthlyIncome3, Length: 1337, dtype: float64

In [34]:
JobRole_encoded_cols = [c for c in emp if c.startswith('JobRole_')]
Department_encoded_cols = [c for c in emp if c.startswith('Department_')]
EducationField_encoded_cols = [c for c in emp if c.startswith('EducationField_')]
BusinessTravel_encoded_cols = [c for c in emp if c.startswith('BusinessTravel_')]
MaritalStatus_encoded_cols = [c for c in emp if c.startswith('MaritalStatus_')]
JobRole_encoded_cols

['JobRole_Healthcare Representative',
 'JobRole_Human Resources',
 'JobRole_Laboratory Technician',
 'JobRole_Manager',
 'JobRole_Manufacturing Director',
 'JobRole_Research Director',
 'JobRole_Research Scientist',
 'JobRole_Sales Executive',
 'JobRole_Sales Representative']

In [35]:
cols1 = ['WorkLifeBalance',
         'JobSatisfaction',
         'JobInvolvement',
         'YearsAtCompany',
         'StockOptionLevel',
         'YearsWithCurrManager',
         'Age',
         'MonthlyIncome',
         'YearsInCurrentRole',
         'JobLevel',
         'TotalWorkingYears']

cols2 = ['DistanceFromHome',
        'NumCompaniesWorked',
        'PerformanceRating',
        'HourlyRate',
        'PercentSalaryHike',
        'Education',
        'YearsSinceLastPromotion',
        'RelationshipSatisfaction',
        'DailyRate',
        'TrainingTimesLastYear']

cols3=['OverTime','MaritalStatus_Single','YearsInCurrentRole', 'JobLevel','JobRole_Sales Representative',
       'TotalWorkingYears','MonthlyIncome','YearsWithCurrManager','StockOptionLevel']

cols4=['Age','YearsAtCompany','JobInvolvement','JobSatisfaction','MaritalStatus_Married','MaritalStatus_Divorced',
        'JobRole_Healthcare Representative', 'JobRole_Laboratory Technician','JobRole_Research Director',
           'Department_Research & Development','JobRole_Manufacturing Director','Department_Sales','JobRole_Manager',
       'EnvironmentSatisfaction','BusinessTravel_Non-Travel','DistanceFromHome','WorkLifeBalance','EducationField_Technical Degree',
       'NumCompaniesWorked','TrainingTimesLastYear','BusinessTravel_Travel_Rarely','EducationField_Marketing']


cols5=['BusinessTravel_Travel_Rarely', 'WorkLifeBalance', 'EducationField_Marketing', 
       'TrainingTimesLastYear', 'JobInvolvement', 'Department_Research & Development', 
       'EnvironmentSatisfaction', 'EducationField_Technical Degree', 'JobSatisfaction', 
       'Department_Sales', 'BusinessTravel_Non-Travel', 'MaritalStatus_Married', 'JobRole_Manager', 
       'JobRole_Manufacturing Director', 'JobRole_Research Director', 'MaritalStatus_Divorced', 
       'JobRole_Laboratory Technician', 'JobRole_Healthcare Representative', 'NumCompaniesWorked', 
       'JobLevel', 'StockOptionLevel', 'JobRole_Sales Representative', 'MaritalStatus_Single', 
       'DistanceFromHome', 'Age', 'OverTime', 'YearsWithCurrManager', 'YearsAtCompany', 'YearsInCurrentRole', 
       'TotalWorkingYears', 'MonthlyIncome']

cols6 = ['MonthlyIncome3','JobSatisfaction','EnvironmentSatisfaction','WorkLifeBalance', 'JobInvolvement',
           'TrainingTimesLastYear','NumCompaniesWorked','JobLevel', 'StockOptionLevel',
           'DistanceFromHome', 'YearsWithCurrManager', 'YearsAtCompany', 'YearsInCurrentRole', 
           'TotalWorkingYears','Age','OverTime','GenderMale']

# feature_cols = cols1 + cols2

filter_cols = ['HourlyRate','DailyRate','MonthlyRate']


target_col = 'Attrition'

# feature_cols = [c for c in emp.columns if c != target_col]

# feature_cols = [c for c in emp.columns if (c != target_col) & (c not in filter_cols)]

# feature_cols = cols3 + cols4

# feature_cols = cols5

feature_cols = cols6 + JobRole_encoded_cols + Department_encoded_cols + EducationField_encoded_cols + BusinessTravel_encoded_cols + MaritalStatus_encoded_cols



X = emp_income_fitler[feature_cols]
 
y = emp_income_fitler['Attrition']

# X = emp[feature_cols]

# y = emp['Attrition']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)


In [36]:
X_train.head()

Unnamed: 0,MonthlyIncome3,JobSatisfaction,EnvironmentSatisfaction,WorkLifeBalance,JobInvolvement,TrainingTimesLastYear,NumCompaniesWorked,JobLevel,StockOptionLevel,DistanceFromHome,YearsWithCurrManager,YearsAtCompany,YearsInCurrentRole,TotalWorkingYears,Age,OverTime,GenderMale,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
1232,8.836955,3,4,3,3,3,2,2,1,27,7,7,7,17,35,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0
1069,7.355002,3,1,1,2,2,1,1,1,1,0,1,0,1,28,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0
657,7.83716,4,1,3,2,5,6,1,3,7,3,4,3,8,29,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0
182,8.052296,2,2,2,3,5,1,1,0,20,2,4,3,4,41,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1
303,8.843615,4,2,2,4,3,4,2,1,7,7,8,7,10,31,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0


In [37]:
ss = StandardScaler()
Xs = ss.fit_transform(X)

Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs,y,test_size=0.2,random_state=42)

len(Xs)
# Counter(ys_test)

1337

In [38]:
from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(random_state = 42)
X_sm, y_sm = smote.fit_resample(Xs_train, ys_train)

from collections import Counter
Counter(y_sm)

Counter({0: 876, 1: 876})

In [39]:
from sklearn.model_selection import RepeatedStratifiedKFold

kf = KFold(n_splits=5, random_state=42, shuffle=True)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

def cross_val_metrics(model, X_train, X_test, y_train, y_test) :
    scores = ['accuracy', 'precision', 'recall', 'f1', 'average_precision','roc_auc']
    print('\n Model:', model)
    for sc in scores:
        scores = cross_val_score(model, X_train, y_train, cv = cv, scoring = sc)
        print('[%s] : %0.5f (+/- %0.5f)'%(sc, scores.mean(), scores.std()))

    model.fit( X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))


In [40]:
def plot_ROC_curve(model,X_test,y_test):

    # Generate the prediction values for each of the test observations using predict_proba() function rather than just predict
    preds = model.predict_proba(X_test)[:,1]

    # Store the false positive rate(fpr), true positive rate (tpr) in vectors for use in the graph
    fpr, tpr, _ = roc_curve(y_test, preds)

    # Store the Area Under the Curve (AUC) so we can annotate our graph with theis metric
    roc_auc = auc(fpr, tpr)

    # Plot the ROC Curve
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw = lw, label = 'ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color = 'navy', lw = lw, linestyle = '--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc = "lower right")
    plt.show()


In [41]:
def display_scores(model, X_test, y_test, y_pred):
    
     # predict probabilities
    pred_probs = model.predict_proba(X_test)
    # keep probabilities for the positive outcome only
    pred_probs = pred_probs[:, 1]
    
    print('Accuracy is: ',round(accuracy_score(y_test, y_pred),2))
    print('F1 score is: ',round(f1_score(y_test, y_pred),2))
    print('Ave PR score: ',round(average_precision_score(y_test, pred_probs),2))

    cm = confusion_matrix(y_test,y_pred)
#     sns.heatmap(cm,annot=True,fmt="d")
    
    sns.heatmap(cm/np.sum(cm),annot=True,fmt='.2%', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    
    print(classification_report(y_test,y_pred,target_names=('Stay','Leave')))
    
    plot_precision_recall_curve(model, X_test, y_test)
    
    plot_ROC_curve(model,X_test,y_test)

In [47]:
def cross_val_models(X_train, X_test, y_train, y_test,imbalancedData=False,hyperTune=False):
    
    if not hyperTune:
        if imbalancedData:
            # models with hyperparams for imbalanced dataset
            models = [
                      ('LogReg', LogisticRegression(solver='liblinear')), 
                      ('RF', RandomForestClassifier(class_weight='balanced')),
                      ('GB', GradientBoostingClassifier()),
                      ('XGB', xgb.XGBClassifier(scale_pos_weight=5.2)),
                      ('KNN', KNeighborsClassifier()),
                      ('NB', GaussianNB())
                     ] 
        else:
             models = [
                      ('LogReg', LogisticRegression()), 
                      ('RF', RandomForestClassifier()),
                      ('GB', GradientBoostingClassifier()),
                      ('XGB', xgb.XGBClassifier()),
                      ('KNN', KNeighborsClassifier()),
                      ('NB', GaussianNB())
                     ] 
            
    else: 
        XGB_clf = xgb.XGBClassifier(colsample_bytree=0.6,
                                     gamma=1,
                                     learning_rate= 0.1,
                                     max_depth=12,
                                     min_child_weight=1,
                                     n_estimators=200,
                                     subsample=1.0)

        RF_clf = RandomForestClassifier(max_depth=25,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         n_estimators=1200)

        GB_clf = GradientBoostingClassifier(random_state=42, 
                                            learning_rate=1, 
                                            max_depth=9, 
                                            n_estimators=50)
        
        log_model = LogisticRegression(solver='newton-cg',
                                       C=1.0,
                                       penalty='l2')
        
        models = [
                  ('LogReg', log_model), 
                  ('RF', RF_clf),
                  ('GB', GB_clf),
                  ('XGB', XGB_clf)
                ] 
            
    
    scoring = ['accuracy', 'precision', 'recall', 'f1', 'average_precision','roc_auc']
    train_score_dfs = []
    test_score_dfs = []
    test_score_dict = {}
    target_names = ['No Churn', 'Churn']
    for name, model in models:
        
        # GET TRAIN SCORES
        kfold = KFold(n_splits=5, shuffle=True, random_state=42)
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        cv_results = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring)
        df1 = pd.DataFrame(cv_results)
        df1['model'] = name
        train_score_dfs.append(df1)
    
        # GET TEST SCORES
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names))
        
        # predict probabilities
        pred_probs = model.predict_proba(X_test)
        # keep probabilities for the positive class only
        pred_probs = pred_probs[:, 1]
        
        test_score_dict = {'Accuracy': round(accuracy_score(y_test, y_pred),2),
                           'Precision': round(precision_score(y_test, y_pred),2),
                           'Recall': round(recall_score(y_test, y_pred),2),
                           'F1': round(f1_score(y_test, y_pred),2),
                           'Average PC': round(average_precision_score(y_test, pred_probs),2),
                           'ROC_AUC': round(roc_auc_score(y_test, y_pred),2)
                          }
        
        df2 = pd.DataFrame.from_dict(test_score_dict, orient='index').transpose()
        df2['Model'] = name
        test_score_dfs.append(df2)
    
    # combine all score sets into final df
    final_train_scores = pd.concat(train_score_dfs, ignore_index=True)
    final_test_scores = pd.concat(test_score_dfs, ignore_index=True)
    
    return final_train_scores, final_test_scores

In [42]:
# MonthlyIncome3 - scaler - no smote

XGB_clf = xgb.XGBClassifier(scale_pos_weight=5.2)
cross_val_metrics(XGB_clf,Xs_train,Xs_test,ys_train,ys_test) 

LG_clf = LogisticRegression()
cross_val_metrics(LG_clf,Xs_train,Xs_test,ys_train,ys_test)   

RF_clf = RandomForestClassifier(class_weight='balanced')
cross_val_metrics(RF_clf,Xs_train,Xs_test,ys_train,ys_test)   

GB_clf = GradientBoostingClassifier()
cross_val_metrics(GB_clf,Xs_train,Xs_test,ys_train,ys_test)   



 Model: XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)
[accuracy] : 0.84413 (+/- 0.02772)
[precision] : 0.63956 (+/- 0.14680)
[recall] : 0.33482 (+/- 0.10185)
[f1] : 0.43186 (+/- 0.11322)
[average_precision] : 0.55035 (+/- 0.08546)
[roc_auc] : 0.78587 (+/- 0.04717)
              precision    recall  f1-score   support

           0       0.91      0.95      0.93       229
           1       0.62      0.46      

  _warn_prf(average, modifier, msg_start, len(result))


[precision] : 0.80774 (+/- 0.23903)
[recall] : 0.15886 (+/- 0.07145)
[f1] : 0.23056 (+/- 0.10761)
[average_precision] : 0.53928 (+/- 0.07858)
[roc_auc] : 0.79560 (+/- 0.04951)
              precision    recall  f1-score   support

           0       0.89      1.00      0.94       229
           1       1.00      0.28      0.44        39

    accuracy                           0.90       268
   macro avg       0.95      0.64      0.69       268
weighted avg       0.91      0.90      0.87       268


 Model: GradientBoostingClassifier()
[accuracy] : 0.84723 (+/- 0.02639)
[precision] : 0.67288 (+/- 0.16456)
[recall] : 0.31886 (+/- 0.10959)
[f1] : 0.41893 (+/- 0.11749)
[average_precision] : 0.55147 (+/- 0.08329)
[roc_auc] : 0.78868 (+/- 0.05313)
              precision    recall  f1-score   support

           0       0.92      0.97      0.94       229
           1       0.71      0.51      0.60        39

    accuracy                           0.90       268
   macro avg       0.82      0

In [43]:
# MonthlyIncome3 - scaler - yes smote

XGB_clf = xgb.XGBClassifier(scale_pos_weight=5.2)
cross_val_metrics(XGB_clf,X_sm,Xs_test,y_sm,ys_test) 

LG_clf = LogisticRegression()
cross_val_metrics(LG_clf,X_sm,Xs_test,y_sm,ys_test) 

RF_clf = RandomForestClassifier(class_weight='balanced')
cross_val_metrics(RF_clf,X_sm,Xs_test,y_sm,ys_test) 

GB_clf = GradientBoostingClassifier()
cross_val_metrics(GB_clf,X_sm,Xs_test,y_sm,ys_test) 


 Model: XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)
[accuracy] : 0.91913 (+/- 0.01994)
[precision] : 0.94668 (+/- 0.02788)
[recall] : 0.88926 (+/- 0.03111)
[f1] : 0.91657 (+/- 0.02072)
[average_precision] : 0.97655 (+/- 0.00845)
[roc_auc] : 0.96907 (+/- 0.01196)
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       229
           1       0.59      0.51      

In [48]:
# MonthlyIncome3 - scaler - yes smote

cv_scores1, test_scores1 = cross_val_models(X_sm, Xs_test, y_sm, ys_test,imbalancedData=False,hyperTune=True)
test_scores1.sort_values(['ROC_AUC','Recall'],ascending=False)

LogReg
              precision    recall  f1-score   support

    No Churn       0.96      0.75      0.84       229
       Churn       0.36      0.82      0.50        39

    accuracy                           0.76       268
   macro avg       0.66      0.78      0.67       268
weighted avg       0.87      0.76      0.79       268



KeyboardInterrupt: 