In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("IBM_Attrition_v3.csv")
df.head()

Unnamed: 0,Age,JobSatisfaction,MaritalStatus,MonthlyIncome,YearsAtCompany,Attrition
0,36,3,Married,7094.0,7,No
1,34,3,Divorced,2351.0,2,Yes
2,30,3,Married,5294.0,7,No
3,41,1,Married,16595.0,18,No
4,57,4,Divorced,14118.0,1,No


In [3]:
df.isna().sum(axis=0)

Age                 0
JobSatisfaction     0
MaritalStatus       0
MonthlyIncome      13
YearsAtCompany      0
Attrition           0
dtype: int64

In [4]:
df = df.dropna()
df.isna().sum(axis=0)

Age                0
JobSatisfaction    0
MaritalStatus      0
MonthlyIncome      0
YearsAtCompany     0
Attrition          0
dtype: int64

In [5]:
income_bins = [0, 2900, 5000, 8500, max(df['MonthlyIncome'])+10]
income_labels = ['income1', 'income2', 'income3', 'income4']

df['MonthlyIncomeBinned'] = pd.cut(df['MonthlyIncome'], bins=income_bins, labels=income_labels)
df.head()

Unnamed: 0,Age,JobSatisfaction,MaritalStatus,MonthlyIncome,YearsAtCompany,Attrition,MonthlyIncomeBinned
0,36,3,Married,7094.0,7,No,income3
1,34,3,Divorced,2351.0,2,Yes,income1
2,30,3,Married,5294.0,7,No,income3
3,41,1,Married,16595.0,18,No,income4
4,57,4,Divorced,14118.0,1,No,income4


In [6]:
years_bins = [-1, 6, max(df['YearsAtCompany'])+2]
years_labels = ['not-senior', 'senior']

df['YearsAtCompanyBinned'] = pd.cut(df['YearsAtCompany'], bins=years_bins, labels=years_labels)
df.head()

Unnamed: 0,Age,JobSatisfaction,MaritalStatus,MonthlyIncome,YearsAtCompany,Attrition,MonthlyIncomeBinned,YearsAtCompanyBinned
0,36,3,Married,7094.0,7,No,income3,senior
1,34,3,Divorced,2351.0,2,Yes,income1,not-senior
2,30,3,Married,5294.0,7,No,income3,senior
3,41,1,Married,16595.0,18,No,income4,senior
4,57,4,Divorced,14118.0,1,No,income4,not-senior


In [7]:
age_bins = [0, 37, 100]
age_labels = ['young', 'mature']

df['AgeBinned'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)
df.head()

Unnamed: 0,Age,JobSatisfaction,MaritalStatus,MonthlyIncome,YearsAtCompany,Attrition,MonthlyIncomeBinned,YearsAtCompanyBinned,AgeBinned
0,36,3,Married,7094.0,7,No,income3,senior,young
1,34,3,Divorced,2351.0,2,Yes,income1,not-senior,young
2,30,3,Married,5294.0,7,No,income3,senior,young
3,41,1,Married,16595.0,18,No,income4,senior,mature
4,57,4,Divorced,14118.0,1,No,income4,not-senior,mature


In [8]:
df = df.drop(columns=['Age', 'MonthlyIncome', 'YearsAtCompany'])
df.head()

Unnamed: 0,JobSatisfaction,MaritalStatus,Attrition,MonthlyIncomeBinned,YearsAtCompanyBinned,AgeBinned
0,3,Married,No,income3,senior,young
1,3,Divorced,Yes,income1,not-senior,young
2,3,Married,No,income3,senior,young
3,1,Married,No,income4,senior,mature
4,4,Divorced,No,income4,not-senior,mature


In [9]:
test_data = df.loc[::4]
train_data = df.drop(test_data.index, axis=0)

In [10]:
X_train = train_data.loc[:, df.columns!='Attrition']
y_train = train_data.loc[:, 'Attrition']
X_test = test_data.loc[:, df.columns!='Attrition']
y_test = test_data.loc[:, 'Attrition']

In [13]:
rf_clf = Pipeline(
    [('preprocessing', ColumnTransformer(
        [('cat', OrdinalEncoder(), ['MaritalStatus', 'MonthlyIncomeBinned', 'YearsAtCompanyBinned', 'AgeBinned'])]
    )),
    ('classifier', RandomForestClassifier())
    ]
)

rf_clf.fit(X_train, y_train)

rf_pred = rf_clf.predict(X_test)
print(f"Random Forest classifier accuracy score: {round(accuracy_score(y_test, rf_pred)*100, 2)}%")


Random Forest classifier accuracy score: 82.65%


In [14]:
dt_clf = Pipeline(
    [('preprocessing', ColumnTransformer(
        [('cat', OrdinalEncoder(), ['MaritalStatus', 'MonthlyIncomeBinned', 'YearsAtCompanyBinned', 'AgeBinned'])]
    )),
    ('classifier', DecisionTreeClassifier())
    ]
)

dt_clf.fit(X_train, y_train)

dt_pred = dt_clf.predict(X_test)
print(f"Decision tree classifier accuracy score: {round(accuracy_score(y_test, dt_pred)*100, 2)}%")


Decision tree classifier accuracy score: 81.23%
