In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled training set and the unlabelled test set from the
# working directory.
train_df = pd.read_csv(filepath_or_buffer="train.csv")
test_df = pd.read_csv(filepath_or_buffer="test.csv")

In [3]:
# Use employee_id as the row index in both frames so it is not treated as a
# feature column by the positional selection further down.
# NOTE: not idempotent - re-running this cell raises KeyError because the
# column has already been moved into the index.
train_df = train_df.set_index(keys="employee_id")
test_df = test_df.set_index(keys="employee_id")

In [4]:
# Average previous_year_rating over the training rows; the same value is used
# below to impute missing ratings in both the train and test frames.
mean = train_df.loc[:, "previous_year_rating"].mean()

In [5]:
# Fill missing test ratings with the training-set mean (never the test mean).
test_df["previous_year_rating"] = test_df["previous_year_rating"].fillna(value=mean)

In [6]:
# Fill missing training ratings with the mean computed before imputation.
train_df["previous_year_rating"] = train_df["previous_year_rating"].fillna(value=mean)

In [7]:
# Count training rows per education level, order the counts from largest to
# smallest, and take the label at the top as the imputation value.
education_counts = train_df.groupby('education')['education'].count()
most_frequent_category = education_counts.sort_values(ascending=False).index[0]

# Missing education values in the test frame get the training-set mode.
test_df['education'] = test_df['education'].fillna(value=most_frequent_category)

In [8]:
# Same imputation for the training frame, using the label derived above.
train_df['education'] = train_df['education'].fillna(value=most_frequent_category)

In [9]:
# Promotion rate (mean of is_promoted) per education level, as a plain dict.
# NOTE(review): prob_labels is not read by any later cell visible in this
# notebook; the odds-ratio dict built below is what gets mapped.
promotion_rate_by_education = train_df.groupby(['education'])['is_promoted'].mean()
prob_labels = promotion_rate_by_education.to_dict()

In [10]:
# One-column frame indexed by education level; the 'is_promoted' column holds
# the per-level promotion rate from the training data.
prob_df = train_df.groupby(['education'])['is_promoted'].mean().to_frame()

In [11]:
# Complement of the promotion rate: share of each level that was not promoted.
prob_df['Not promoted'] = 1 - prob_df['is_promoted']

In [12]:
# Odds of promotion per education level: P(promoted) / P(not promoted).
prob_df['ratio'] = prob_df['is_promoted'].div(prob_df['Not promoted'])

In [13]:
# Lookup table {education label -> promotion odds} used to encode the column.
odds_ratio_labels = prob_df.loc[:, 'ratio'].to_dict()

In [14]:
# Replace each education label with its training-set promotion odds.
# NOTE: any value that is not a key of odds_ratio_labels maps to NaN, so
# re-running this cell (when the column already holds floats) or a test-only
# label would silently produce missing values.
train_df['education'] = train_df['education'].map(odds_ratio_labels)
test_df['education'] = test_df['education'].map(odds_ratio_labels)

In [15]:
from sklearn.preprocessing import LabelEncoder
# A single encoder object is shared by all categorical columns: it is fitted
# on 'region' here and refitted on each other column in the cells that follow.
le = LabelEncoder()
le.fit(train_df.loc[:, 'region'])

LabelEncoder()

In [16]:
# Apply the region encoder (fitted on the training column) to both frames,
# test first and then train, overwriting the string column with integer codes.
for frame in (test_df, train_df):
    frame['region'] = le.transform(frame['region'])

In [17]:
# Refit the shared encoder on the training 'department' labels, then encode
# the column in place in both frames (test first, then train).
le.fit(train_df['department'])
for frame in (test_df, train_df):
    frame['department'] = le.transform(frame['department'])

In [18]:
# Refit the shared encoder on the training 'recruitment_channel' labels, then
# encode the column in place in both frames (test first, then train).
le.fit(train_df['recruitment_channel'])
for frame in (test_df, train_df):
    frame['recruitment_channel'] = le.transform(frame['recruitment_channel'])

In [19]:
# Refit the shared encoder on the training 'gender' labels, then encode the
# column in place in both frames (test first, then train).
le.fit(train_df['gender'])
for frame in (test_df, train_df):
    frame['gender'] = le.transform(frame['gender'])

In [20]:
# Feature matrix: the first twelve columns by position.
# Target vector: the last column by position.
# NOTE(review): presumably the last column is is_promoted and the twelve
# leading columns are all features - confirm against the CSV column order.
feature_positions = list(range(12))
X = train_df.iloc[:, feature_positions].values
y = train_df.iloc[:, -1].values

In [21]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [22]:
import matplotlib.pyplot as plt

In [23]:
from sklearn.preprocessing import StandardScaler
# Scaler instance shared by the next two cells: fitted on X_train, then reused
# to transform X_test with the same statistics.
sc = StandardScaler()

In [24]:
# Learn the scaling statistics from the training split only, then scale it.
# train_X is the scaled counterpart of the raw X_train.
train_X = sc.fit(X_train).transform(X_train)

In [25]:
# Scale the hold-out split with the statistics learned from X_train
# (transform only - the scaler is not refitted here).
test_X = sc.transform(X=X_test)

# Gradient Boost

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

In [27]:
# Gradient-boosted trees: 300 boosting stages, learning rate 0.30, fixed seed.
# NOTE(review): warm_start=True looks like a no-op here because the estimator
# is constructed fresh on every run of this cell - confirm before relying on it.
gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.30,
    warm_start=True,
    random_state=2019,
)
# Kept as the final expression so the fitted estimator's repr is displayed.
gb.fit(train_X, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.3, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=300,
                           n_iter_no_change=None, presort='auto',
                           random_state=2019, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0, warm_start=True)

In [28]:
from sklearn.metrics import (confusion_matrix, 
                             accuracy_score, 
                             precision_score, 
                             recall_score, 
                             f1_score)

In [29]:
# Accumulators for the hold-out metrics; each evaluation cell below appends
# one value per list (acc, precision, recall, f1).
# NOTE(review): `models` is never appended to in the cells visible here.
models, acc, precision, recall, f1 = [], [], [], [], []

In [30]:
# Evaluate the gradient-boosting model on the scaled hold-out split.
# The prediction is computed once and every metric is computed once, then
# reused for both the printed report and the accumulator lists (previously
# gb.predict ran nine times and each metric was calculated twice).
gb_pred = gb.predict(test_X)

gb_accuracy = accuracy_score(y_test, gb_pred)
gb_precision = precision_score(y_test, gb_pred)
gb_recall = recall_score(y_test, gb_pred)
gb_f1 = f1_score(y_test, gb_pred)

print('Confusion Matrix for gb: \n', confusion_matrix(y_test, gb_pred))
print('Accuracy for gb: \n', gb_accuracy)
print('Precision for gb: \n', gb_precision)
print('Recall for gb: \n', gb_recall)
print('f1_score for gb: \n', gb_f1)

# Record the scores for later comparison between models.
acc.append(gb_accuracy)
precision.append(gb_precision)
recall.append(gb_recall)
f1.append(gb_f1)

Confusion Matrix for gb: 
 [[10010    31]
 [  586   335]]
Accuracy for gb: 
 0.9437146506112023
Precision for gb: 
 0.9153005464480874
Recall for gb: 
 0.36373507057546145
f1_score for gb: 
 0.5205905205905206


In [31]:
# sub_X = test_df.iloc[:, [0,1,2,3,4,5,6,7,8,9,10,11]].values
# sub_X = sc.transform(sub_X)
# pred_y = gb.predict(sub_X)
# test_df = pd.read_csv("test.csv")
# submission = pd.DataFrame({'employee_id':test_df['employee_id'],'is_promoted':pred_y})
# filename = 'Hr analytics sol final.csv'
# submission.to_csv(filename,index=False)

In [32]:
import xgboost as xgb

In [39]:
# XGBoost classifier: 500 boosting rounds, learning rate 0.30, fixed seed.
xgbc = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.30,
    random_state=2019,
)

In [40]:
# Train on the scaled training split; left as the final expression so the
# fitted estimator's repr is displayed.
xgbc.fit(X=train_X, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=2019,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [41]:
# Evaluate the XGBoost model on the scaled hold-out split.
# The prediction is computed once and every metric is computed once, then
# reused for both the printed report and the accumulator lists (previously
# xgbc.predict ran nine times and each metric was calculated twice).
xgbc_pred = xgbc.predict(test_X)

xgbc_accuracy = accuracy_score(y_test, xgbc_pred)
xgbc_precision = precision_score(y_test, xgbc_pred)
xgbc_recall = recall_score(y_test, xgbc_pred)
xgbc_f1 = f1_score(y_test, xgbc_pred)

print('Confusion Matrix for xgbc: \n', confusion_matrix(y_test, xgbc_pred))
print('Accuracy for xgbc: \n', xgbc_accuracy)
print('Precision for xgbc: \n', xgbc_precision)
print('Recall for xgbc: \n', xgbc_recall)
print('f1_score for xgbc: \n', xgbc_f1)

# Record the scores for later comparison between models.
acc.append(xgbc_accuracy)
precision.append(xgbc_precision)
recall.append(xgbc_recall)
f1.append(xgbc_f1)

Confusion Matrix for xgbc: 
 [[10002    39]
 [  588   333]]
Accuracy for xgbc: 
 0.9428024083196497
Precision for xgbc: 
 0.8951612903225806
Recall for xgbc: 
 0.36156351791530944
f1_score for xgbc: 
 0.5150812064965197
