  ANN PRACTICE ON ATTRITION DATA

In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [3]:
dataset.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


CLEANING THE DATASET

In [4]:
#removing standard hours and employeeID + attrition as first row
dataset = dataset[['Attrition',
                   'Age',
                   'BusinessTravel',
                   'DailyRate',
                   'Department',
                   'DistanceFromHome',
                   'Education',
                   'EducationField',
                   'EmployeeCount',
                   'EnvironmentSatisfaction',
                   'Gender',
                   'HourlyRate',
                   'JobInvolvement',
                   'JobLevel',
                   'JobRole',
                   'JobSatisfaction',
                   'MaritalStatus',
                   'MonthlyIncome',
                   'MonthlyRate',
                   'NumCompaniesWorked',
                   'OverTime',
                   'PercentSalaryHike',
                   'PerformanceRating',
                   'RelationshipSatisfaction',
                   'StockOptionLevel',
                   'TotalWorkingYears',
                   'TrainingTimesLastYear',
                   'WorkLifeBalance',
                   'YearsAtCompany',
                   'YearsInCurrentRole',
                   'YearsSinceLastPromotion',
                   'YearsWithCurrManager']]

Adding some more derived information

Playing with the data we can add some more information to help the network.

 providing some ratios and extra information can help the network to converge faster.

In [5]:
dataset['JobInvolment_On_Salary']= dataset['JobInvolvement'] / dataset['MonthlyIncome'] * 1000
dataset['MarriedAndBad_Worklife_Balance'] = np.where(dataset['MaritalStatus']=='Married',
                                               dataset['WorkLifeBalance']-2,
                                               dataset['WorkLifeBalance']+1)
dataset['DistanceFromHome_rootedTo_JobSatisfaction'] = dataset['DistanceFromHome']**(1/dataset['JobSatisfaction'])
dataset['TotalJobSatisfaction'] = dataset['EnvironmentSatisfaction'] + dataset['JobSatisfaction'] + dataset['RelationshipSatisfaction']
dataset['OldLowEmployeeTendToStay'] = dataset['YearsAtCompany'] / dataset['JobLevel']
dataset['Mothers'] = np.where((dataset['Gender']=='Female') & (dataset['Age']>=36), 1,0)
dataset['Rate'] = dataset['DailyRate'] * 20 + dataset['HourlyRate'] * 8 * 20 + dataset['MonthlyRate']
dataset['RateExtended'] = dataset['Rate'] * (8 - dataset['JobSatisfaction'] - dataset['EnvironmentSatisfaction'])

Separating the data from labels

In [6]:
X = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

Encoding the data and the labels

Neural network only understand numbers. We need to transform the columns with strings into numbers.

here ill create categories.

ex: Gender: Male as 0 and Female as 1

In [11]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer

# Define the categorical features to be encoded
categorical_features = [1, 4, 8, 19, 28]  # Replace with your actual column indices

# Create a ColumnTransformer to apply OneHotEncoder to specific columns
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_features)],
    remainder='passthrough'  # Keep other columns unchanged
)

# Fit and transform the data
X = ct.fit_transform(X)#.toarray() # .toarray() might be needed depending on the desired output format

In [13]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_3 = LabelEncoder()
X[:, 3] = labelencoder_X_3.fit_transform(X[:, 3])
labelencoder_X_6= LabelEncoder()
X[:, 6] = labelencoder_X_6.fit_transform(X[:, 6])
labelencoder_X_9= LabelEncoder()
X[:, 9] = labelencoder_X_9.fit_transform(X[:, 9])
labelencoder_X_13= LabelEncoder()
X[:, 13] = labelencoder_X_13.fit_transform(X[:, 13])
labelencoder_X_15= LabelEncoder()
X[:, 15] = labelencoder_X_15.fit_transform(X[:, 15])
labelencoder_X_19= LabelEncoder()
X[:, 19] = labelencoder_X_19.fit_transform(X[:, 19])
X = X.astype(float)
labelencoder_y= LabelEncoder()
y = labelencoder_y.fit_transform(y)

Dummy variable and dummy trap

In [15]:
# Instead of using categorical_features, use handle_unknown='ignore' and specify the categories manually
# For onehotencoder1
from sklearn.preprocessing import OneHotEncoder
onehotencoder1 = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse_output=False) # sparse=False for toarray() compatibility

# Assuming X[:, 1] contains the feature to be encoded:
encoded_feature1 = onehotencoder1.fit_transform(X[:, 1].reshape(-1, 1)) # Reshape to 2D array

# Remove the original feature and add the encoded features
X = np.concatenate((X[:, :1], encoded_feature1, X[:, 2:]), axis=1)
X = X[:,1:]

# Repeat this process for other one-hot encoders (onehotencoder3, onehotencoder6, etc.)
# Make sure to adjust the column indices accordingly.


# For onehotencoder3
onehotencoder3 = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse_output=False)
encoded_feature3 = onehotencoder3.fit_transform(X[:, 4].reshape(-1, 1)) # Reshape to 2D array
X = np.concatenate((X[:, :4], encoded_feature3, X[:, 5:]), axis=1)
X = X[:,1:]

# For onehotencoder6
onehotencoder6 = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse_output=False)
encoded_feature6 = onehotencoder6.fit_transform(X[:, 8].reshape(-1, 1)) # Reshape to 2D array
X = np.concatenate((X[:, :8], encoded_feature6, X[:, 9:]), axis=1)
X = X[:,1:]

# For onehotencoder13
onehotencoder13 = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse_output=False)
encoded_feature13 = onehotencoder13.fit_transform(X[:, 19].reshape(-1, 1)) # Reshape to 2D array
X = np.concatenate((X[:, :19], encoded_feature13, X[:, 20:]), axis=1)
X = X[:,1:]

# For onehotencoder15
onehotencoder15 = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse_output=False)
encoded_feature15 = onehotencoder15.fit_transform(X[:, 28].reshape(-1, 1)) # Reshape to 2D array
X = np.concatenate((X[:, :28], encoded_feature15, X[:, 29:]), axis=1)
X = X[:,1:]

Splitting the dataset into training and testing set


In [16]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

Scaling the features

In [17]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Hyperparameter tunning

In [18]:
dropout = 0.1
epochs = 100
batch_size = 30
optimizer = 'adam'
k = 20

Training the neural network using k-fold cross validation

In [20]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [35]:
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense, Dropout
import numpy as np

# Define the function to build the classifier
def build_classifier(input_dim, dropout_rate, optimizer):
    model = Sequential()
    model.add(Dense(16, kernel_initializer="truncated_normal", activation='relu', input_shape=(input_dim,)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, kernel_initializer="truncated_normal", activation='sigmoid'))  # Output layer
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Cross-validation
def evaluate_model(X_train, y_train, batch_size, epochs, dropout_rate, optimizer, n_splits=30):
    accuracies = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        # Build and train the model
        model = build_classifier(X_train.shape[1], dropout_rate, optimizer)
        model.fit(X_fold_train, y_fold_train, batch_size=batch_size, epochs=epochs, verbose=0)

        # Evaluate the model
        _, accuracy = model.evaluate(X_fold_val, y_fold_val, verbose=0)
        accuracies.append(accuracy)

    return np.array(accuracies)

# Parameters
dropout_rate = 0.5
optimizer = 'adam'
batch_size = 32
epochs = 50

# Get cross-validated accuracies
accuracies = evaluate_model(X_train, y_train, batch_size, epochs, dropout_rate, optimizer)
max_accuracy = accuracies.max()

print(f"Max accuracy: {max_accuracy}")


Max accuracy: 0.9487179517745972
