# Classification

In [37]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [38]:
try:
    df = pd.read_csv("cleandata.csv")
except FileNotFoundError:
    print("No such file exists on your current working directory")

In [39]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157.0,3.0,0,1,0,sales,low
1,0.8,0.86,5,262.0,6.0,0,1,0,sales,medium
2,0.11,0.88,7,272.0,4.0,0,1,0,sales,medium
3,0.72,0.87,5,223.0,5.0,0,1,0,sales,low
4,0.37,0.52,2,200.51,3.38,0,1,0,sales,low


In [40]:
# Converting the string values into catagorical values.
df['salary'] = df['salary'].astype('category').cat.codes
df['department'] = df['department'].astype('category').cat.codes

In [41]:
#Separating features and lable.
X =df.drop(['left'], axis=1) #X is the feature data
y = df['left']       #y is the target data

In [42]:
#Seeing which feature impacts the most
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train,y_train)
features = pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False)
features

satisfaction_level       0.361064
number_project           0.184815
time_spend_company       0.157006
average_montly_hours     0.146170
last_evaluation          0.122722
department               0.014615
salary                   0.007981
work_accident            0.004936
promotion_last_5years    0.000691
dtype: float64

In [43]:
#Taking account of only top 6 features that played in role whether an employee stayed or left and converting them to numpy array.
X= np.array(df.drop(['left','promotion_last_5years', 'work_accident', 'salary'], axis=1))
y = np.array(df['left'])  

In [44]:
# Feature scaling
scaler = StandardScaler()
scaler.fit(X)

X= scaler.transform(X)
X

array([[-1.02590309, -1.10702002, -1.54204807, -0.90455753, -0.28192698,
         0.38960187],
       [ 0.7104694 ,  0.85003893,  1.02178113,  1.27827112,  1.94353528,
         0.38960187],
       [-2.14214254,  0.96864856,  2.73100059,  1.48615957,  0.45989377,
         0.38960187],
       ...,
       [ 1.08254922,  0.96864856,  1.02178113,  0.57145041,  1.20171452,
         0.73527238],
       [ 0.54510059,  0.67212448,  1.87639086,  1.9227253 ,  1.94353528,
         0.73527238],
       [-1.06724529, -1.4035441 , -1.54204807, -0.842191  , -0.28192698,
         0.73527238]])

In [53]:
def classification(target, label,test):
    """Classification using RandomForestClassifier"""
    for value in test:
        print("The precision score, recall score, confusion matrix and f1 score for {:.0f} - {:.0f} training-test data set are:" .format(100-100*value, 100*value),"\n")
        
        #Split the data into train and test set.
        X_train, X_test, y_train, y_test = train_test_split(target, label, test_size=value, random_state=42)
        
        #Shuffling the data.
        shuffle_index = np.random.permutation(len(X_train))
        X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
        
        #Classifier
        #clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
        #clf = GaussianNB()
        clf = RandomForestClassifier(n_estimators=100)
        
        #clf = DecisionTreeClassifier()
        
        # 5 fold cross validation
        prediction=cross_val_predict(clf,X_train,y_train,cv=5)
        
        #Accuracy on training set. If satisfied, fit the classifier and predict on test set.
        print("Accuracy score of training set:{:.3f}".format( accuracy_score(y_train, prediction)))
        
        clf.fit(X_train, y_train)
        pred= clf.predict(X_test)
        
        #Accuracy on test set
      
        print("Accuracy score of test set: {:.3f} ".format( accuracy_score(y_test, pred)))
        
        
        #Calculate and print various measures on test set
        print("\n Test set: \n")
        print("Precision : {:.3f}" .format((precision_score(y_test,pred))))
        print("Recall: {:.3f}" .format(recall_score(y_test, pred)))
        print("F1 Score: {:.3f}".format(f1_score(y_test, pred, average='micro')))
        print("Confusion matrix:")
        print(confusion_matrix(y_test, pred), "\n")
        
    

In [54]:
#Invoking the function.

a=[0.15,0.25,0.35]
classification(X,y,a)

The precision score, recall score, confusion matrix and f1 score for 85 - 15 training-test data set are: 

Accuracy score of training set:0.983
Accuracy score of test set: 0.982 

 Test set: 

Precision : 0.996
Recall: 0.897
F1 Score: 0.982
Confusion matrix:
[[1528    1]
 [  32  278]] 

The precision score, recall score, confusion matrix and f1 score for 75 - 25 training-test data set are: 

Accuracy score of training set:0.983
Accuracy score of test set: 0.982 

 Test set: 

Precision : 0.996
Recall: 0.902
F1 Score: 0.982
Confusion matrix:
[[2532    2]
 [  52  479]] 

The precision score, recall score, confusion matrix and f1 score for 65 - 35 training-test data set are: 

Accuracy score of training set:0.983
Accuracy score of test set: 0.983 

 Test set: 

Precision : 0.991
Recall: 0.913
F1 Score: 0.983
Confusion matrix:
[[3540    6]
 [  65  680]] 

