In [None]:
#importing library
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
#loading the dataset from a CSV file; the relative path below must exist in the
#environment the notebook runs in
df=pd.read_csv('../input/human-resources-data-set/HRDataset_v14.csv')
#showing the dataset (last expression of the cell, so the notebook renders it)
df

In [None]:
#getting information about the dataset (pandas prints a per-column summary
#of the frame loaded above)
df.info()

In [None]:
#columns of the raw data that hold exactly two distinct values (NaN counts as
#a value, matching len(series.unique())); uses `df` because no `x` has been
#defined at this point of a top-to-bottom run
df.loc[:,df.nunique(dropna=False)==2]

In [None]:
#inspect the raw PerformanceScore column of the loaded frame
df['PerformanceScore']

In [None]:
#encode function
def encode_dates(df,column_with_prefixes):
    """Replace date columns with numeric year/month/day feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's object is not modified.
    column_with_prefixes : iterable of (column, prefix) pairs
        ``column`` names a column that ``pd.to_datetime`` can parse; ``prefix``
        is used to name the new ``<prefix>_year``, ``<prefix>_month`` and
        ``<prefix>_day`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three new columns appended for every pair and
        the original date column dropped.
    """
    #copying the dataframe
    df=df.copy()
    #looping through column list
    for column,prefix in column_with_prefixes:
        #converting date column to datetime
        dates=pd.to_datetime(df[column])
        #the vectorised .dt accessor avoids a Python call per row and, unlike
        #Series.apply, still yields numeric columns when the frame is empty
        df[prefix+'_year']=dates.dt.year
        df[prefix+'_month']=dates.dt.month
        df[prefix+'_day']=dates.dt.day
        #dropping the original column
        df=df.drop(column,axis=1)
    #returning the dataframe
    return df
        

In [None]:
#creating a function
def ordinal_encode(df,column_with_orderings):
    """Replace ordered categorical values with their integer rank.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's object is not modified.
    column_with_orderings : iterable of (column, ordering) pairs
        ``ordering`` lists the allowed values of ``column`` from lowest to
        highest; each value is replaced by its position in that list.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with every listed column replaced by integer ranks.

    Raises
    ------
    ValueError
        If a column contains a value (including a missing value) that is not
        present in its ordering.
    """
    #copying the dataframe
    df=df.copy()
    #looping through the column
    for column,ordering in column_with_orderings:
        #build the value -> rank lookup once instead of searching the list for
        #every row; setdefault keeps the first position, like list.index
        ranks={}
        for position,value in enumerate(ordering):
            ranks.setdefault(value,position)
        #fail loudly, naming the column, rather than silently producing NaN
        unknown=[value for value in df[column].unique() if value not in ranks]
        if unknown:
            raise ValueError(
                'column {!r} contains values missing from its ordering: {!r}'.format(column,unknown))
        df[column]=df[column].map(ranks)
    #returning the dataframe
    return df
        

In [None]:
def onehot_encode(df,column_with_prefixes):
    """One-hot encode the listed columns.

    ``column_with_prefixes`` is an iterable of (column, prefix) pairs. For each
    pair the indicator columns produced by ``pd.get_dummies`` (named with
    ``prefix``) are appended to the right of the frame and the source column
    is removed. The input frame is left untouched; a new frame is returned.
    """
    encoded=df.copy()
    for source_column,dummy_prefix in column_with_prefixes:
        indicators=pd.get_dummies(encoded[source_column],prefix=dummy_prefix)
        # append first, then drop, so the indicators end up after the existing columns
        encoded=pd.concat([encoded,indicators],axis=1).drop(columns=source_column)
    return encoded

In [None]:
#preview the one-hot encoding of the Zip column; uses `df` because no `x`
#has been defined at this point of a top-to-bottom run
pd.get_dummies(df['Zip'],prefix='ZIP')

In [None]:
#Preprocessing the dataset
def preprocess_inputs(df,scaler,train_size=0.7,random_state=1):
    """Turn the raw frame into scaled train/test features and targets.

    Steps, in order: drop the unused columns, expand the three date columns
    into year/month/day, ordinal-encode ``PerformanceScore``, one-hot encode
    the nominal columns, separate the ``Termd`` target, split into train and
    test sets, and scale the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing every column referenced below. It is copied, so
        the caller's frame is not modified.
    scaler : object with ``fit`` and ``transform`` methods
        Fitted in place on the training features only, then applied to both
        splits.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int, default 1
        Forwarded to ``train_test_split``.

    Returns
    -------
    x_train, x_test : pandas.DataFrame
        Scaled features, keeping the column names and row index of each split.
    y_train, y_test : pandas.Series
        The matching ``Termd`` values.
    """
    #copying the data
    df=df.copy()
    #dropping unneeded columns
    #NOTE(review): presumably these are identifiers, duplicates of other columns,
    #or fields that reveal the termination outcome - confirm against the data
    df=df.drop(['DeptID','MaritalStatusID','Sex','EmpStatusID','EmploymentStatus','Employee_Name','PositionID','EmpID','DateofTermination','TermReason',"ManagerID"],axis=1)
    #Encode dates
    date_columns=[('DOB','DOB'),('DateofHire','DOH'),('LastPerformanceReview_Date','PRD')]
    df=encode_dates(df,column_with_prefixes=date_columns)
    #ordinal encoding
    ordinal_columns=[
        ('PerformanceScore',['PIP','Needs Improvement','Fully Meets','Exceeds'])]
    df=ordinal_encode(df,ordinal_columns)
    #onehot_encode
    nominal_columns=[('Position','PS'),
     ('State','ST'),
     ('MaritalDesc','MD'),
     ('CitizenDesc','CD'),
     ('HispanicLatino','HL'),
     ('RaceDesc','RD'),
     ('Department','DE'),
     ('ManagerName','MN'),
     ('RecruitmentSource','RS')]
    df=onehot_encode(df,column_with_prefixes=nominal_columns)

    #Split df into x and y
    y=df['Termd'].copy()
    x=df.drop('Termd',axis=1)
    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=train_size,random_state=random_state)
    #scaling the dataset: fit on the training split only so that no statistics
    #of the test split influence the transformation
    scaler.fit(x_train)
    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns,index=x_train.index)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns,index=x_test.index)
    return x_train,x_test,y_train,y_test

In [None]:
#number of distinct values (NaN included) in each object-dtype column of the
#raw data; uses `df` because no `x` has been defined at this point of a
#top-to-bottom run
df.select_dtypes('object').nunique(dropna=False).to_dict()

In [None]:
#raw PerformanceScore values before ordinal encoding; uses `df` because no
#`x` has been defined at this point of a top-to-bottom run
df['PerformanceScore']

In [None]:
# Build the scaled train/test splits, then report the shape of each piece
# in the order: x_train, x_test, y_train, y_test.
x_train, x_test, y_train, y_test = preprocess_inputs(df, StandardScaler())
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

In [None]:
#display the scaled training features returned by preprocess_inputs
x_train

In [None]:
#no `x` has been defined at this point of a top-to-bottom run, so show the
#first rows of the raw frame instead of raising NameError
df.head()

In [None]:
#preview a binary encoding of Sex (1 for male, 0 otherwise); stripping
#whitespace makes the comparison work whether the raw value is 'M' or 'M '.
#Uses `df` because no `x` has been defined at this point of a top-to-bottom run
df['Sex'].str.strip().eq('M').astype(int)

In [None]:
#distinct EmploymentStatus values in the raw data; uses `df` because no `x`
#has been defined at this point of a top-to-bottom run
df['EmploymentStatus'].unique()

In [None]:
#distinct PerformanceScore values in the raw data (these must all appear in
#the ordering passed to ordinal_encode); uses `df` because no `x` has been
#defined at this point of a top-to-bottom run
df['PerformanceScore'].unique()

In [None]:
#preview DateofHire parsed as datetimes (the result is displayed, not stored)
pd.to_datetime(df['DateofHire'])

In [None]:
#checking the number of unique values (NaN included) in each column of the
#raw data; uses `df` because no `x` has been defined at this point of a
#top-to-bottom run
df.nunique(dropna=False).to_dict()

In [None]:
#preprocess_inputs requires a scaler and returns four objects, so unpack them
#and rebuild the complete preprocessed feature matrix in original row order
x_train,x_test,y_train,y_test=preprocess_inputs(df,StandardScaler())
x=pd.concat([x_train,x_test]).sort_index()
x

In [None]:
#checking for missing values per column of the raw data; uses `df` so the
#cell does not depend on an `x` left over from an earlier interactive session
df.isna().sum()

In [None]:
#training
#candidate classifiers keyed by display name; hyperparameters are the library
#defaults, except that the estimators with randomised fitting are seeded so
#that repeated runs give the same scores
model={
    'K-Nearest Neighbors':KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine':SVC(),
    'Decision Tree': DecisionTreeClassifier(random_state=1),
    'Neural Network':MLPClassifier(random_state=1)
}

In [None]:
#fit every candidate on the training split; the loop variable is called `clf`
#so it does not rebind `model`, which must remain the dictionary of estimators
#for any later cell that iterates over it
for name,clf in model.items():
    clf.fit(x_train,y_train)
    print(name+' trained')

In [None]:
#Results
#mean accuracy of each fitted classifier on the held-out test split, printed
#next to its name; the loop variable is `clf` so the `model` dictionary is not
#overwritten and the cell can be run more than once
for name,clf in model.items():
    print(name+': '+str(clf.score(x_test,y_test)))