In [2]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw employee-attrition table from a CSV file in the working directory.
# Every later cell reads from or rebinds this `df`.
df = pd.read_csv("EmployeeAttrition.csv")

In [4]:
# Remove the 'EmployeeNumber' and 'Over18' columns in one non-in-place call.
# (Presumably an identifier and a constant flag with no predictive value — TODO confirm.)
df = df.drop(columns=['EmployeeNumber', 'Over18'])

In [5]:
from sklearn.preprocessing import OrdinalEncoder

# --- Label the categorical data -------------------------------------------

# Nominal (unordered) variables: plain pandas categoricals.
nominal_cols = [
    'Attrition',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
]
df = df.astype({name: 'category' for name in nominal_cols})

# BusinessTravel is stored as text. Encode it as 0 / 1 / 2 following the
# explicit order given here (Non-Travel < Travel_Rarely < Travel_Frequently).
travel_encoder = OrdinalEncoder(
    categories=[['Non-Travel', 'Travel_Rarely', 'Travel_Frequently']]
)
df[['BusinessTravel']] = travel_encoder.fit_transform(df[['BusinessTravel']])

# Ordinal variables: ordered categoricals with their full list of levels.
# The encoder above returns floats; they match the integer levels by value.
# Any value not listed in a column's levels would become NaN after the cast.
ordinal_levels = {
    'BusinessTravel': [0, 1, 2],
    'Education': [1, 2, 3, 4, 5],
    'EnvironmentSatisfaction': [1, 2, 3, 4],
    'JobInvolvement': [1, 2, 3, 4],
    'JobLevel': [1, 2, 3, 4, 5],
    'JobSatisfaction': [1, 2, 3, 4],
    'PerformanceRating': [1, 2, 3, 4],
    'RelationshipSatisfaction': [1, 2, 3, 4],
    'StockOptionLevel': [0, 1, 2, 3],
    'WorkLifeBalance': [1, 2, 3, 4],
}
df = df.astype({
    name: pd.CategoricalDtype(categories=levels, ordered=True)
    for name, levels in ordinal_levels.items()
})

In [6]:
# Show each column's non-null count and dtype to check the categorical casts above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Age                       1470 non-null   int64   
 1   Attrition                 1470 non-null   category
 2   BusinessTravel            1470 non-null   category
 3   DailyRate                 1470 non-null   int64   
 4   Department                1470 non-null   category
 5   DistanceFromHome          1470 non-null   int64   
 6   Education                 1470 non-null   category
 7   EducationField            1470 non-null   category
 8   EmployeeCount             1470 non-null   int64   
 9   EnvironmentSatisfaction   1470 non-null   category
 10  Gender                    1470 non-null   category
 11  HourlyRate                1470 non-null   int64   
 12  JobInvolvement            1470 non-null   category
 13  JobLevel                  1470 non-null   catego

### Save the cleaned data to a new CSV

In [7]:
# Write the cleaned frame to disk (before one-hot encoding, which happens in a later cell).
# NOTE(review): with pandas' default `index=True` the row index is written as an
# extra unnamed first column, and CSV does not keep the categorical dtypes set above.
# Consider `index=False` if no consumer of this file expects that column — confirm.
df.to_csv('IBMdataCleaning.csv')

In [7]:
# One-hot encode the nominal columns. drop_first=True omits the first level of
# each column, so a column with k levels yields k-1 indicator columns.
# 'Attrition' is included here; it yields the `Attrition_Yes` column that the
# modelling cell uses as the target.
one_hot_cols = [
    'Attrition',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
]
df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

In [8]:
# Re-inspect columns and dtypes now that the nominal columns are indicator columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 46 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   Age                                1470 non-null   int64   
 1   BusinessTravel                     1470 non-null   category
 2   DailyRate                          1470 non-null   int64   
 3   DistanceFromHome                   1470 non-null   int64   
 4   Education                          1470 non-null   category
 5   EmployeeCount                      1470 non-null   int64   
 6   EnvironmentSatisfaction            1470 non-null   category
 7   HourlyRate                         1470 non-null   int64   
 8   JobInvolvement                     1470 non-null   category
 9   JobLevel                           1470 non-null   category
 10  JobSatisfaction                    1470 non-null   category
 11  MonthlyIncome                      1470 non

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every column except the target indicator; the target is `Attrition_Yes`.
X = df.drop('Attrition_Yes', axis=1)
y = df['Attrition_Yes']

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the proportion of positive/negative labels the same in the
# training and test sets. Without it, a random split of a skewed target (the
# oversampling cells below suggest the classes are imbalanced) can leave the
# test set with an unrepresentative share of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2407,
    shuffle=True,
    stratify=y,
)

# Standardize the features. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics leak into training.
# Both results are NumPy arrays (the column names are not carried over).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
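# From here on, four variables are available to train/test your model:
# X_train - standardized training features
# y_train - training labels (Attrition_Yes)
# X_test  - standardized test features (scaled with the training-set statistics)
# y_test  - test labels (Attrition_Yes)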

In [12]:
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE using a fixed seed. Only the training split is resampled;
# X_test / y_test are not touched, so evaluation stays on real rows.
# NOTE(review): SMOTE creates synthetic rows by interpolating between samples, so
# the one-hot and ordinal-coded features in X_train can take in-between values in
# the synthetic rows. SMOTENC is the categorical-aware variant — confirm this is acceptable.
oversampler = SMOTE(random_state=2407)
X_smote_train, y_smote_train = oversampler.fit_resample(X_train, y_train)

In [13]:
from imblearn.over_sampling import ADASYN
# Alternative oversampling with ADASYN using the same seed, again on the training
# split only. The result is stored separately from the SMOTE version.
# Note: this rebinds `oversampler`, so the SMOTE instance created earlier is no
# longer reachable under that name.
oversampler = ADASYN(random_state=2407)
X_adasyn_train, y_adasyn_train = oversampler.fit_resample(X_train, y_train)

In [None]:
# If you're using SMOTE, you should have 4 variables to use to train/test your model
# X_smote_train
# y_smote_train
# X_test
# y_test

# If you're using ADASYN, you should have 4 variables to use to train/test your model
# X_adasyn_train
# y_adasyn_train
# X_test
# y_test