In [401]:
# Library Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler,LabelEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_predict, GridSearchCV,cross_val_score,RandomizedSearchCV
from pandas.plotting import scatter_matrix
from sklearn.metrics import f1_score,precision_score,roc_auc_score,confusion_matrix,recall_score

from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier

In [None]:
csv_path =  '../datasets/public_dataset/emp_attrition.csv'
attrition = pd.read_csv(csv_path)

In [None]:
attrition.info()

In [None]:
attrition = attrition.drop(['Over18','BusinessTravel','DailyRate','DistanceFromHome','EducationField','EmployeeCount','EmployeeNumber','MonthlyRate','StandardHours','StockOptionLevel','Department','JobRole','OverTime','HourlyRate'],axis=1)

In [None]:
attrition.info()

In [None]:
categorical = ['Attrition','Gender','MaritalStatus']

for cat in categorical:
    print(attrition[cat].value_counts(),'\n')

In [None]:
%matplotlib inline
attrition.hist(bins=20, figsize=(20,20))
plt.show()

In [None]:
# Salary and Years at Company Categories

attrition['YearsAtCompany'].hist()

In [None]:
attrition['MonthlyIncome'].hist()

In [None]:
attrition['MonthlyIncome_cat'] = np.ceil(attrition['MonthlyIncome']/1500)
attrition['MonthlyIncome_cat'].where(attrition['MonthlyIncome_cat']<5,5.0,inplace = True )
attrition['MonthlyIncome_cat'].hist()

In [None]:
split = StratifiedShuffleSplit(n_splits = 1, test_size= 0.2, random_state=42)
for train_index, test_index in split.split(attrition,  attrition['MonthlyIncome_cat']):
    strat_train_set = attrition.loc[train_index]
    strat_test_set = attrition.loc[test_index]

In [None]:
strat_train_set['MonthlyIncome_cat'].value_counts()/len(strat_train_set)

In [None]:
attrition['MonthlyIncome_cat'].value_counts()/len(attrition)

In [None]:
for set_ in (strat_train_set,strat_test_set):
    set_.drop('MonthlyIncome_cat',axis=1,inplace=True)

In [None]:
attrition = strat_train_set.copy()

In [None]:
attrition = strat_train_set.drop('Attrition',axis=1)
attrition_labels = strat_train_set['Attrition'].copy()

In [None]:
# Find Columns with null values
attrition_incomplete_rows = attrition[attrition.isnull().any(axis=1)].head()
attrition_incomplete_rows.shape

In [None]:
attrition_num = attrition.drop(['Gender','MaritalStatus'], axis=1)

num_attribs = list(attrition_num)
cat_attribs = ['Gender','MaritalStatus']

full_pipeline = ColumnTransformer([
    ('num',StandardScaler(),num_attribs),
    ('cat',OneHotEncoder(),cat_attribs)
])


attrition_prepared = full_pipeline.fit_transform(attrition)

In [130]:
attrition_prepared


array([[-0.65817219,  0.06235435,  0.25242642, ...,  0.        ,
         0.        ,  1.        ],
       [-0.65817219,  1.04007059,  1.17147742, ...,  0.        ,
         1.        ,  0.        ],
       [-0.33199836,  1.04007059, -1.58567557, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-1.41924447, -0.91536189,  1.17147742, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.42907391,  0.06235435,  0.25242642, ...,  1.        ,
         0.        ,  0.        ],
       [-0.87562142,  0.06235435, -0.66662457, ...,  1.        ,
         0.        ,  0.        ]])

In [132]:
attrition_prepared_dataframe = pd.DataFrame(
    attrition_prepared,
    columns = list(attrition_num) + ['Female','Male','Divorced','Married','Single'],
    index = attrition_num.index
)

In [None]:
attrition_prepared_dataframe.to_csv('prepared.csv')

In [None]:
test_ = strat_test_set.copy()

In [None]:
test_ = strat_test_set.drop('Attrition', axis=1)

In [None]:
test_labels = strat_test_set['Attrition'].copy()

In [128]:
test_prepared = full_pipeline.transform(test_)

In [140]:
test_prepared_dataframe = pd.DataFrame(
    test_prepared,
    columns = list(attrition_num) + ['Female','Male','Divorced','Married','Single'],
    index = test_.index
)

In [164]:
test_prepared_dataframe.to_csv('test.csv')

In [165]:
# ============================================================================================Pre Processing Ends Here

In [208]:
rfc = RandomForestClassifier(max_features = 4, n_estimators= 10)

In [209]:
cross_val_score(rfc,attrition_prepared,attrition_labels,cv=10)

array([0.78151261, 0.84033613, 0.8487395 , 0.86324786, 0.84615385,
       0.88888889, 0.86324786, 0.84615385, 0.84615385, 0.85470085])

In [210]:
rfc.fit(attrition_prepared,attrition_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=4, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [219]:
rfc.predict(test_prepared[100:110])

array(['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No'],
      dtype=object)

In [487]:
f1_score(list(test_labels),rfc.predict(test_prepared),pos_label="Yes")

0.3055555555555555

In [None]:
# =============================================================================================RFC Ends Here

In [237]:
from sklearn.ensemble import GradientBoostingClassifier

In [426]:
gbc  = GradientBoostingClassifier()

In [427]:
gbc.fit(attrition_prepared,attrition_labels)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [437]:
parameters ={'learning_rate':[0.1,0.2,0.5,0.8], 'subsample':[0.5,1], 'max_depth':[1,2,3,4,5,10,15]}

gridSearch = RandomizedSearchCV(gbc,parameters,cv=10,scoring='roc_auc')
gridSearch.fit(attrition_prepared, le.transform(list(attrition_labels)))




['Yes',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'N

In [438]:
gridSearch.best_params_

{'subsample': 0.5, 'max_depth': 1, 'learning_rate': 0.2}

In [439]:
gbc_tuned=gridSearch.best_estimator_
gbc_tuned

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=1,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=0.5, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [440]:
print(gbc_tuned.predict(attrition_prepared[:10]))

[0 0 1 0 0 0 0 0 0 0]


In [441]:
le.transform(list(attrition_labels[:10]))

array([1, 0, 0, 1, 0, 1, 0, 0, 0, 0], dtype=int64)

In [442]:
cross_val_score(gbc_tuned,attrition_prepared,le.transform(list(attrition_labels)))



array([0.83673469, 0.84693878, 0.85969388])

In [488]:
f1_score(le.transform(list(test_labels)),gbc_tuned.predict(test_prepared))

0.3188405797101449

In [451]:
gbc_tuned.predict(test_prepared[:10])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [453]:
le.transform(list(test_labels[:10]))

array([0, 0, 0, 0, 0, 0, 1, 1, 0, 1], dtype=int64)

In [455]:
#==================================================================================================================gbc Ends here

In [458]:
from sklearn.svm import LinearSVC

In [472]:
lsvc = LinearSVC(random_state=133,max_iter =10000)

In [484]:
parameters = {'loss' : [ 'hinge','squared_hinge'], 'C':[0.5,1,5]}
randomSearch = RandomizedSearchCV(lsvc,parameters,scoring='f1')
randomSearch.fit(attrition_prepared,le.transform(list(attrition_labels)))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                       fit_intercept=True, intercept_scaling=1,
                                       loss='squared_hinge', max_iter=10000,
                                       multi_class='ovr', penalty='l2',
                                       random_state=133, tol=0.0001,
                                       verbose=0),
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'C': [0.5, 1, 5],
                                        'loss': ['hinge', 'squared_hinge']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='f1', verbose=0)

In [536]:
randomSearch.best_estimator_.feature_importances_

AttributeError: 'LinearSVC' object has no attribute 'feature_importances_'

In [485]:
lsvc_tuned = randomSearch.best_estimator_

In [489]:
lsvc_tuned.predict(test_prepared[:10])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [492]:
f1_score(le.transform(list(test_labels)),lsvc_tuned.predict(test_prepared))

0.22580645161290322

In [493]:
# ==============================================================================================================LSVC Ends Here

In [495]:
 from sklearn.neural_network import MLPClassifier

In [510]:
mlpc = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(500, 500), random_state=1)


In [514]:
mlpc.fit(attrition_prepared,list(attrition_labels))

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(500, 500), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [515]:
cross_val_score(mlpc,attrition_prepared,le.transform(list(attrition_labels)))



array([0.78571429, 0.76785714, 0.78826531])

In [516]:
mlpc.predict(test_prepared[:10])

array(['No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No'],
      dtype='<U3')

In [518]:
list(test_labels[:10])

['No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes']

In [534]:
print(recall_score(list(test_labels),mlpc.predict(test_prepared),pos_label="No"),'\n',
recall_score(list(test_labels),mlpc.predict(test_prepared),pos_label="Yes"))


0.9208333333333333 
 0.2222222222222222


In [525]:
attrition_labels.value_counts()

No     993
Yes    183
Name: Attrition, dtype: int64

In [526]:
test_labels.value_counts()

No     240
Yes     54
Name: Attrition, dtype: int64