In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
import zipfile

# Location of the downloaded competition archive.
zip_file_path = 'dataset/playground-series-s3e3.zip'

# Folder that receives the unpacked files.
extracted_dir = 'dataset/'

# Unpack every member of the archive; the context manager closes the file handle.
with zipfile.ZipFile(file=zip_file_path, mode="r") as zip_ref:
    zip_ref.extractall(path=extracted_dir)

print("All Zip Files have been Extracted")

All Zip Files have been Extracted


In [4]:
# Read the training and test splits in one pass, then preview the training frame.
df, test = [pd.read_csv(csv_path) for csv_path in ("dataset/train.csv", "dataset/test.csv")]
df.head()

Unnamed: 0,id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0,36,Travel_Frequently,599,Research & Development,24,3,Medical,1,4,...,80,1,10,2,3,10,0,7,8,0
1,1,35,Travel_Rarely,921,Sales,8,3,Other,1,1,...,80,1,4,3,3,4,2,0,3,0
2,2,32,Travel_Rarely,718,Sales,26,3,Marketing,1,3,...,80,2,4,3,3,3,2,1,2,0
3,3,38,Travel_Rarely,1488,Research & Development,2,3,Medical,1,3,...,80,0,15,1,1,6,0,0,2,0
4,4,50,Travel_Rarely,1017,Research & Development,5,4,Medical,1,2,...,80,0,31,0,3,31,14,4,10,1


In [5]:
from sklearn.preprocessing import LabelEncoder
# Replace every object-typed column with integer codes, in both `df` and `test`.
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == "O":
        if col in test.columns:
            # Learn the classes from train AND test together: an encoder fitted on
            # train alone raises ValueError in transform() as soon as the test
            # frame contains a category that never occurs in training. When test
            # holds no unseen category the learned classes (and therefore the
            # codes) are the same as with a train-only fit.
            le.fit(pd.concat([df[col], test[col]], ignore_index=True))
            test[col] = le.transform(test[col])
        else:
            # Column is absent from the test frame: encode the training side only
            # instead of failing with a KeyError.
            le.fit(df[col])
        df[col] = le.transform(df[col])

In [6]:
# Feature columns: everything in the training frame except the row id and the label.
x_cols = [name for name in df.columns if name != "id" and name != "Attrition"]
# Target kept as a one-element list, so df[y_cols] yields a one-column DataFrame.
y_cols = ["Attrition"]

In [109]:
# Naive oversampling: append three extra copies of every Attrition == 1 row.
# NOTE(review): the copies are created before any train/validation split. The
# frame built here feeds Xd/yd, which go to GridSearchCV and cross_validation,
# so identical rows can land in both the training and the validation part of a
# fold. The CV scores computed on this frame are therefore likely optimistic --
# consider oversampling inside each training fold instead.
att = df.loc[df["Attrition"] == 1]
dff = pd.concat([df] + [att] * 3, axis=0)

# Shuffle all rows reproducibly and give the result a fresh 0..n-1 index.
shuffled_df = dff.sample(frac=1.0, random_state=42).reset_index(drop=True)



In [87]:
# Feature matrix and one-column target frame taken from the original training data.
X, y = df[x_cols], df[y_cols]

In [110]:
# Features / target from the oversampled, shuffled frame, plus the test features
# restricted to the same columns (and column order) as the training features.
Xd, yd = shuffled_df[x_cols], shuffled_df[y_cols]
xtest = test[x_cols]

In [89]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn import tree

In [90]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def _flatten_target(target):
    """Return a single-column 2-D target as a 1-D array; leave anything else as is."""
    values = np.asarray(target)
    if values.ndim == 2 and values.shape[1] == 1:
        return values.ravel()
    return target


def cross_validation(X,y,estimator,n_splits, xtest=None):
    """Run stratified K-fold CV, print mean metrics and optionally average test predictions.

    Parameters
    ----------
    X : pandas object supporting ``.iloc`` row selection
        Feature matrix.
    y : pandas object supporting ``.iloc`` row selection
        Target. A one-column DataFrame is flattened to 1-D before it is handed
        to the estimator and the metric functions.
    estimator : object with ``fit``, ``predict`` and ``predict_proba``
        Refitted in place on every fold, so after the call it holds the model
        trained on the last fold.
    n_splits : int
        Number of stratified folds (shuffled with ``random_state=42``).
    xtest : optional
        If given, the positive-class probability (column 1 of ``predict_proba``)
        is predicted by every fold model and averaged over the folds.

    Returns
    -------
    numpy.ndarray or None
        Fold-averaged positive-class probabilities for ``xtest``, or ``None``
        when ``xtest`` is not supplied.
    """
    skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

    acc_scr = []
    f1_scr = []
    auc_scr = []

    # Accumulator for the fold-averaged test probabilities; stays None when no
    # test set is supplied.
    preds = np.zeros(len(xtest)) if xtest is not None else None

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        # Flatten one-column targets so fit() and the metrics receive the 1-D
        # shape they expect instead of a column vector.
        y_train = _flatten_target(y.iloc[train_index])
        y_test = _flatten_target(y.iloc[test_index])

        estimator.fit(X_train, y_train)

        y_preds = estimator.predict(X_test)
        val_proba = estimator.predict_proba(X_test)[:, 1]

        if preds is not None:
            # Each fold contributes 1/n_splits of the final averaged prediction.
            preds += estimator.predict_proba(xtest)[:, 1] / n_splits

        acc_scr.append(accuracy_score(y_test, y_preds))
        f1_scr.append(f1_score(y_test, y_preds))
        auc_scr.append(roc_auc_score(y_test, val_proba))

    avg_acc = round(np.mean(acc_scr), 4)
    avg_f1 = round(np.mean(f1_scr), 4)
    avg_roc = round(np.mean(auc_scr), 4)

    print(f"Average accuracy score of {n_splits} is : {avg_acc}")
    print(f"Average f1 score of {n_splits} is       : {avg_f1}")
    print(f"Average roc score of {n_splits} is      : {avg_roc}")

    return preds

In [91]:
import warnings
# Silence every warning for the rest of the session. NOTE(review): this also
# hides sklearn/pandas warnings raised by the modelling cells below.
warnings.filterwarnings(action="ignore")

In [92]:
# Wide RandomForest search space. NOTE(review): in the cells visible here,
# `param_grid` is reassigned to a much smaller grid before any GridSearchCV
# object is built, so this definition appears to be unused.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_samples=[100, 500, len(X)],
    criterion=['gini', 'entropy'],
    min_samples_split=[50, 100, 200, 300, 500],
    max_leaf_nodes=[300, 500, 800, 1000],
)

In [111]:
# Single-candidate grid: 1000 gini trees.
param_grid = dict(n_estimators=[1000], criterion=['gini'])

# Base estimator shared by both search objects.
rf = RandomForestClassifier(random_state=42)

# Two separate, identically configured searches scored by ROC AUC.
grid_rf, grid_rf_d = [
    GridSearchCV(estimator=rf, param_grid=param_grid, verbose=True, scoring='roc_auc')
    for _ in range(2)
]

In [112]:
%%time
grid_rf.fit(Xd,yd)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
CPU times: total: 2.53 s
Wall time: 55.8 s


In [113]:
# Parameter combination selected by the fitted grid search.
grid_rf.best_params_

{'criterion': 'gini', 'n_estimators': 1000}

In [114]:
# Mean cross-validated score (scoring='roc_auc') of the selected parameters.
# NOTE(review): computed on Xd/yd, which contain duplicated positive rows, so
# this figure is likely optimistic.
print(grid_rf.best_score_)

0.9983496621621623


In [115]:
# Take the refitted winner of the grid search and re-score it with the local
# 5-fold routine (no test set, so nothing is returned or displayed).
best_esti = grid_rf.best_estimator_
cross_validation(X=Xd, y=yd, estimator=best_esti, n_splits=5, xtest=None)

Average accuracy score of 5 is : 0.9829
Average f1 score of 5 is       : 0.9761
Average roc score of 5 is      : 0.9969


In [122]:
# Same 5-fold run, this time also collecting fold-averaged test probabilities.
val = cross_validation(X=Xd, y=yd, estimator=best_esti, n_splits=5, xtest=xtest)

Average accuracy score of 5 is : 0.9829
Average f1 score of 5 is       : 0.9761
Average roc score of 5 is      : 0.9969


In [123]:
# Fill the sample submission's Attrition column with the averaged probabilities
# and write the file. Assumes the sample submission rows are in the same order
# as the test rows -- TODO confirm.
samp_sub = pd.read_csv("dataset/sample_submission.csv").assign(Attrition=val)
samp_sub.to_csv(path_or_buf="est_sub_rf_d_cv.csv", index=False)

In [124]:
# Feature importances as percentages (one decimal), in x_cols order. best_esti
# was refitted inside cross_validation, so these come from the last fold's model.
(best_esti.feature_importances_ * 100).round(1)

array([6.1, 1.3, 4.8, 1.1, 4. , 2.1, 2.2, 0. , 2.6, 0.8, 5.2, 2.7, 2.4,
       3. , 2.3, 2.6, 8. , 5.2, 3.3, 0. , 3. , 3.8, 0.5, 2.9, 0. , 7.3,
       4.6, 2.3, 1.8, 4.6, 3.6, 2.3, 3.5])