## Importing the Libraries

In [42]:
import numpy as np                  # Manupulation of Multidimensional array and matrix
import matplotlib.pyplot as plt     # For Visualisation
import pandas as pd                 # Loading Dataset and Manupulation
import seaborn as sns               # Plotting more attracting graphs and chart
from numpy import array 

from sklearn.preprocessing import MinMaxScaler # To Scale the data

#For Upsampling to reduce Imbalance in dataset
from sklearn.utils import resample

# For Splitting the Dataset into train-test and Performing Cross Validation Score
from sklearn.model_selection import train_test_split,cross_val_score, KFold

#For Performing the Logistic Regression
from sklearn.linear_model import LogisticRegression

#For Performing Random Forest Classifier 
from sklearn.ensemble import RandomForestClassifier

#For Performing Naive Bayes
from sklearn.naive_bayes import MultinomialNB

#For Performing KNN
from sklearn.neighbors import KNeighborsClassifier

#For Performing Support Vector Machine
from sklearn.svm import SVC,LinearSVC

#For Performing Decision Tree
from sklearn.tree import DecisionTreeClassifier

#For Performing Voting Classifier Algo
from sklearn.ensemble import VotingClassifier


#For getting the Confusion Matrix,Accuracy of the Model,Loss etc.
from sklearn.metrics import confusion_matrix,mean_squared_error,accuracy_score,classification_report
from sklearn.metrics import precision_score,recall_score,roc_auc_score,f1_score,cohen_kappa_score

## For Performing HyperParameter Tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

## To save the model
import pickle

## Creating the Application to Solve the Problem

In [2]:
class Analytics_Hack_Data:
    """Pre-processing, training and evaluation helpers for the employee data.

    The instance wraps a single DataFrame (``self.df``). ``rescale_the_data``
    encodes and scales it, ``Split_the_data`` produces a train/test split, and
    the remaining methods each fit one scikit-learn model, print its scores
    and (for most of them) pickle the fitted model into the working directory.
    """

    def __init__(self,df):
        """Store the DataFrame that the pre-processing methods operate on."""
        self.df = df

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _report_scores(model, X_train, X_test, y_train, y_test):
        """Print ``model.score`` on the test split, then on the train split."""
        print("score on test: " + str(model.score(X_test, y_test)))
        print("score on train: "+ str(model.score(X_train, y_train)))

    @staticmethod
    def _save_model(model, filename):
        """Pickle ``model`` to ``filename``, closing the file handle afterwards."""
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

    # ------------------------------------------------------------ preprocessing

    def standardize_data(self,sdata):
        """Scale every column of ``sdata`` into [0, 1] with a freshly fitted MinMaxScaler.

        Returns the array produced by ``fit_transform``. The scaler itself is
        not kept, so the same scaling cannot be re-applied to other data later.
        """
        scaler = MinMaxScaler(feature_range=(0, 1))
        return scaler.fit_transform(sdata)

    def rescale_the_data(self,ImpCol):
        """Encode the categorical columns of ``self.df`` and return scaled features and target.

        Parameters
        ----------
        ImpCol : list of str
            Names of the feature columns to keep.

        Returns
        -------
        (rescaledX, Y)
            ``rescaledX`` is the MinMax-scaled feature array for ``ImpCol``;
            ``Y`` is the last column of ``self.df``.

        Notes
        -----
        * ``self.df`` is modified in place, so calling this twice on the same
          object re-encodes already encoded columns.
        * The target is taken positionally (last column) and the features are
          selected from all columns except that last one.
        """
        # Joining date -> number of days since 2010-01-01.
        self.df['Dateofjoining'] = (
            pd.to_datetime(self.df['Dateofjoining'])
            .sub(pd.Timestamp('2010-01-01'))
            .dt.days
        )

        # Assign the result back instead of a chained ``inplace=True`` replace,
        # which is deprecated and has no effect under pandas copy-on-write.
        self.df['Gender'] = self.df['Gender'].replace({'Female': 0, 'Male': 1})
        change = {
            'Master': 0,
            "College": 1,
            "Bachelor": 2,
        }
        self.df['Education_Level'] = self.df['Education_Level'].replace(change)

        # City names -> integer category codes.
        self.df["City"] = self.df["City"].astype('category').cat.codes

        X = self.df.iloc[:, :-1][ImpCol]   # every column but the last, then the chosen features
        Y = self.df.iloc[:, -1]            # last column is the target

        return self.standardize_data(X), Y

    def Split_the_data(self,X,Y):
        """Split ``X``/``Y`` 70/30 into train and test parts (``random_state=42``).

        Returns ``X_train, X_test, y_train, y_test``.
        """
        return train_test_split(X, Y, train_size=0.7, random_state=42)

    # ------------------------------------------------------------------- models

    def random_forest(self,X_train, X_test, y_train, y_test):
        """Fit a 300-tree, depth-3 random forest, print scores, save to 'random_forest.sav'."""
        rf = RandomForestClassifier(n_estimators=300, max_depth=3)
        rf.fit(X_train, y_train)
        self._report_scores(rf, X_train, X_test, y_train, y_test)
        self._save_model(rf, 'random_forest.sav')

    def Naive_Bayes(self,X_train, X_test, y_train, y_test):
        """Fit a multinomial naive Bayes model, print scores, save to 'NaiveBayes.sav'."""
        mnb = MultinomialNB().fit(X_train, y_train)
        self._report_scores(mnb, X_train, X_test, y_train, y_test)
        self._save_model(mnb, 'NaiveBayes.sav')

    def Logistic_Regression(self,X_train, X_test, y_train, y_test):
        """Fit a logistic regression, print scores, save to 'Logistic.sav'."""
        # Only the arguments worth tuning are spelled out; every value the
        # original passed was the library default, and ``multi_class`` is
        # deprecated in recent scikit-learn releases.
        log = LogisticRegression(C=1.0, tol=1e-4, solver='lbfgs', max_iter=100)
        log.fit(X_train, y_train)
        self._report_scores(log, X_train, X_test, y_train, y_test)
        self._save_model(log, 'Logistic.sav')

    def K_neigbours(self,X_train, X_test, y_train, y_test):
        """Fit a brute-force k-nearest-neighbours classifier and print scores (not saved)."""
        knn = KNeighborsClassifier(algorithm='brute', n_jobs=-1)
        knn.fit(X_train, y_train)
        self._report_scores(knn, X_train, X_test, y_train, y_test)

    def support_vector(self,X_train, X_test, y_train, y_test):
        """Fit a linear SVM (C=0.0001), save to 'Support_Vector.sav', print scores."""
        svm = LinearSVC(C=0.0001)
        svm.fit(X_train, y_train)
        self._save_model(svm, 'Support_Vector.sav')
        self._report_scores(svm, X_train, X_test, y_train, y_test)

    def decision_tree(self,X_train, X_test, y_train, y_test):
        """Fit a depth-3 decision tree and print scores (not saved)."""
        clf = DecisionTreeClassifier(min_samples_split=10, max_depth=3)
        clf.fit(X_train, y_train)
        self._report_scores(clf, X_train, X_test, y_train, y_test)

    def voting_classifier(self,X_train, X_test, y_train, y_test):
        """Fit a voting ensemble of NB, logistic regression, random forest and linear SVM.

        The fitted ensemble is saved to 'voting_class.sav' and its scores are
        printed.
        """
        # The ensemble fits its members itself, so they are passed unfitted.
        estimators = [
            ('mnb', MultinomialNB()),
            ('lr', LogisticRegression(max_iter=5000)),
            ('rf', RandomForestClassifier(n_estimators=30, max_depth=3)),
            ('svm', LinearSVC(max_iter=5000)),
        ]
        evc = VotingClassifier(estimators=estimators)
        evc.fit(X_train, y_train)

        # Save the ensemble itself (previously the stand-alone naive Bayes
        # model was written to this file by mistake).
        self._save_model(evc, 'voting_class.sav')
        self._report_scores(evc, X_train, X_test, y_train, y_test)

    def grid_Serch(self,X_train, X_test, y_train, y_test):
        """Grid-search random-forest hyper-parameters with 4-fold CV.

        Prints the refit search object's scores and saves the whole search
        object to 'grid_search.sav'.
        """
        rf = RandomForestClassifier(n_estimators=300, max_depth=3)
        params = {
            'max_depth': [2,3,5,10,20],
            'min_samples_leaf': [5,10,20,50,100,200],
            'n_estimators': [10,25,30,50,100,200]
        }
        grid_search = GridSearchCV(estimator=rf,
                                   param_grid=params,
                                   cv=4,
                                   n_jobs=-1, verbose=1, scoring="accuracy")
        grid_search.fit(X_train, y_train)

        # Report the held-out score as well; the test split was previously
        # accepted but never used.
        self._report_scores(grid_search, X_train, X_test, y_train, y_test)
        self._save_model(grid_search, 'grid_search.sav')

    # --------------------------------------------------------------- evaluation

    def accuracy_result(self,y_test, y_pred_test):
        """Print a confusion matrix and a set of classification metrics.

        Parameters
        ----------
        y_test : array-like
            True labels.
        y_pred_test : array-like
            Predicted labels.

        The confusion matrix is indexed as a 2x2 matrix (row = true class,
        column = predicted class), so two classes are expected.
        """
        # Use the imported sklearn functions directly and keep the matrix in a
        # differently named local so the imported ``confusion_matrix`` is not
        # shadowed.
        cm = confusion_matrix(y_test, y_pred_test)
        print('\n CONFUSION MATRIX:\n ', cm,'\n')
        TN = cm[0, 0]
        FP = cm[0, 1]

        false_positive_rate = round(FP / float(TN + FP),3)
        print('FPR: ', false_positive_rate)
        print('TPR/ RECALL/ SENSTIVITY: ', round(recall_score(y_test, y_pred_test), 3))
        print('PRECISION:' ,round(precision_score(y_test, y_pred_test), 3))
        specificity = round(TN / (TN + FP),3)
        print('SPECIFICITY: ',specificity)
        print('ACCURACY: ', np.round(accuracy_score(y_test, y_pred_test),3))
        print('ROC AUC: ', np.round(roc_auc_score(y_test, y_pred_test),3))
        print('Cohens kappa: ',np.round(cohen_kappa_score(y_test, y_pred_test),3))
        print('F1 score: ', np.round(f1_score(y_test, y_pred_test),3))
        print('\n CLASSIFICATION REPORT: \n',classification_report(y_test,y_pred_test))
        
    

In [18]:
data = pd.read_csv("New_Train.csv") ## Read New_Train.csv (the file convert_new_Train writes) into a DataFrame

In [19]:
data["EmployeeLeaveOrNot"].value_counts()  # how many rows carry each target label

1    1616
0     765
Name: EmployeeLeaveOrNot, dtype: int64

In [20]:
def upsampling_of_data(df):
    """Return ``df`` with its class-0 rows resampled to half of ``len(df)``.

    Rows with ``EmployeeLeaveOrNot == 1`` are kept as they are. Rows with
    ``EmployeeLeaveOrNot == 0`` are drawn with replacement
    (``random_state=123``) until there are ``int(len(df) * 0.5)`` of them.
    The two parts are concatenated, class-1 rows first, and the original
    index labels are kept.
    """
    data_major = df[df.EmployeeLeaveOrNot == 1]
    data_minor = df[df.EmployeeLeaveOrNot == 0]

    # Size the sample from the frame that was passed in; the previous version
    # read the notebook-level ``data`` variable and ignored ``df`` here.
    n_minor_target = int(len(df) * 0.5)

    data_minority_upsampled = resample(data_minor,
                                       replace=True,              # sample with replacement
                                       n_samples=n_minor_target,
                                       random_state=123)          # reproducible results

    # Combine the untouched class-1 rows with the resampled class-0 rows
    return pd.concat([data_major, data_minority_upsampled])

In [21]:
# Rebalance the classes, then show the resulting label distribution
sampled_data = upsampling_of_data(data)
print(sampled_data["EmployeeLeaveOrNot"].value_counts())

1    1616
0    1190
Name: EmployeeLeaveOrNot, dtype: int64


In [22]:
# Wrap the upsampled frame in the helper class
obj1 = Analytics_Hack_Data(sampled_data)

# Feature columns passed to the models
ImpCol = [
    'Age',
    'Gender',
    'City',
    'Education_Level',
    'Salary',
    'Dateofjoining',
    'Joining Designation',
    'Designation',
    'Total Business Value',
    'Quarterly Rating',
]

# Encode + scale the features, then split into train and test parts
x, y = obj1.rescale_the_data(ImpCol)
print(y)
X_train, X_test, y_train, y_test = obj1.Split_the_data(x, y)

# Train one model; each call prints its scores and saves the fitted model.
# Uncomment a different line to train another algorithm instead.

#obj1.random_forest(X_train, X_test, y_train, y_test)
#obj1.Logistic_Regression(X_train, X_test, y_train, y_test)
#obj1.Naive_Bayes(X_train, X_test, y_train, y_test)
#obj1.voting_classifier(X_train, X_test, y_train, y_test)
#obj1.support_vector(X_train, X_test, y_train, y_test)
obj1.grid_Serch(X_train, X_test, y_train, y_test)

  return self.partial_fit(X, y)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0       1
2       1
3       1
5       1
7       1
       ..
2067    0
970     0
2233    0
2042    0
203     0
Name: EmployeeLeaveOrNot, Length: 2806, dtype: int64
Fitting 4 folds for each of 180 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   16.0s


score on train: 0.9241344195519349


[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:   25.2s finished


## Converting the Raw Data into the New Train File

In [7]:
## converting the data to new train.
def convert_new_Train(data, out_path="New_Train.csv"):
    """Collapse ``data`` to one row per ``Emp_ID`` and write the result as CSV.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Emp_ID``, ``EmployeeLeaveOrNot``, ``City``, ``Gender``,
        ``Education_Level`` and ``Dateofjoining``.
    out_path : str, optional
        Destination file; defaults to "New_Train.csv".

    Numeric columns are averaged per employee and rounded to two decimals.
    ``EmployeeLeaveOrNot`` is the ceiling of the per-employee mean. The four
    categorical/date columns take their first value within each group and are
    appended after the numeric columns. Nothing is returned.
    """
    gk = data.groupby("Emp_ID")
    temp = gk.first()

    # Average only the numeric columns: newer pandas raises on string columns
    # instead of silently dropping them.
    means = gk.mean(numeric_only=True)
    mgk = means.round(2)

    # Take the ceiling of the *unrounded* mean so that a small positive mean
    # (below 0.005) is not rounded down to 0 first.
    mgk['EmployeeLeaveOrNot'] = np.ceil(means['EmployeeLeaveOrNot'])

    # Re-attach the columns the mean could not handle, using each group's first value
    for column in ("City", "Gender", "Education_Level", "Dateofjoining"):
        mgk[column] = temp[column]

    mgk.to_csv(out_path)

In [None]:
## NOTE(review): this rebinds `data`, which an earlier cell bound to New_Train.csv — confirm later cells expect the raw file
data = pd.read_csv("Train.csv") ## Read the raw Train.csv file
convert_new_Train(data) ## Group the rows by Emp_ID and write the aggregated table to New_Train.csv

In [None]:
# `df` is not defined until a later cell, so inspect the frame loaded just above
data.shape ## Checking the shape of data

# Generating the Submission .csv file

In [23]:
Test = pd.read_csv("Test.csv")  # employees that need a prediction

# For every Emp_ID in the test file, take the first matching row of `data`.
# The rows are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
matched_rows = [
    data.loc[data['Emp_ID'] == emp_id].head(1)
    for emp_id in Test["Emp_ID"]
]
matched_rows = [row for row in matched_rows if not row.empty]  # IDs without a match are skipped
temp = pd.concat(matched_rows, ignore_index=True) if matched_rows else pd.DataFrame()

df = temp  # same object as `temp`; the encoding below modifies both names

## Apply the same encoding steps that were used on the training frame
# NOTE(review): the City codes and the MinMax ranges are derived from these
# test rows alone, so they may differ from the encoding the model was trained
# on — confirm, or reuse the training-time encoder/scaler.
df['Dateofjoining'] = pd.to_datetime(df['Dateofjoining']).sub(pd.Timestamp('2010-01-01')).dt.days
# Assign the result back instead of a chained inplace replace (deprecated in pandas)
df['Gender'] = df['Gender'].replace({'Female': 0, 'Male': 1})
change = {
    'Master': 0,
    "College": 1,
    "Bachelor": 2,
}
df['Education_Level'] = df['Education_Level'].replace(change)
df["City"] = df["City"].astype('category').cat.codes
df = df[ImpCol]  # keep only the model's feature columns

# Scale every feature into [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled = scaler.fit_transform(df)

  return self.partial_fit(X, y)


## Loading the model and predicting the test data

In [24]:
# Pick which saved model to use (uncomment exactly one)
# filename = 'random_forest.sav'
# filename = 'Logistic.sav'
# filename = 'voting_class.sav'
# filename = "Support_Vector.sav"
filename = 'grid_search.sav'

# Load the model from disk; the `with` block closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# model files that this notebook produced.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

y_pred = loaded_model.predict(rescaled)

In [25]:
# Display the labels predicted by the loaded model
y_pred

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [26]:
# Read temp.csv (the submission data) and attach the model's predictions as "Target"
tt = pd.read_csv("temp.csv")
tt = tt.assign(Target=y_pred)

In [27]:
## Write the submission file. NOTE(review): the index is written as a column too (no index=False) — confirm this is wanted
tt.to_csv("subb.csv") ## Convert into .csv file so that we can submit