In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
import zipfile

# Archive to unpack and the directory its members are written into.
zip_file_path = "dataset/playground-series-s3e3.zip"
extracted_path = "dataset/"

# Open the archive read-only and extract every member; the context manager
# closes the file handle once extraction is finished.
with zipfile.ZipFile(file=zip_file_path, mode="r") as archive:
    archive.extractall(path=extracted_path)

print("Zip file Extracted")

In [2]:
# Read the training and test CSV files from the dataset/ folder.
df = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/test.csv")
# Last expression of the cell: shows the first rows of the training frame.
df.head()

Unnamed: 0,id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0,36,Travel_Frequently,599,Research & Development,24,3,Medical,1,4,...,80,1,10,2,3,10,0,7,8,0
1,1,35,Travel_Rarely,921,Sales,8,3,Other,1,1,...,80,1,4,3,3,4,2,0,3,0
2,2,32,Travel_Rarely,718,Sales,26,3,Marketing,1,3,...,80,2,4,3,3,3,2,1,2,0
3,3,38,Travel_Rarely,1488,Research & Development,2,3,Medical,1,3,...,80,0,15,1,1,6,0,0,2,0
4,4,50,Travel_Rarely,1017,Research & Development,5,4,Medical,1,2,...,80,0,31,0,3,31,14,4,10,1


In [3]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode every object-dtype (string) column of the training frame and
# apply the same mapping to the matching column of the test frame.
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == "O":
        if col in test.columns:
            # Fit on the values of both frames so that a category present only
            # in the test set does not make ``transform`` raise ValueError.
            # LabelEncoder assigns codes by sorted unique value, so whenever the
            # test categories are a subset of the training ones the codes are
            # the same as when fitting on the training column alone.
            le.fit(pd.concat([df[col], test[col]], ignore_index=True))
            test[col] = le.transform(test[col])
        else:
            # Column exists only in the training frame: encode it on its own
            # instead of failing with a KeyError on the test frame.
            le.fit(df[col])
        df[col] = le.transform(df[col])

In [4]:
# Target column, kept as a one-element list so indexing with it yields a DataFrame.
y_cols = ["Attrition"]

# Feature columns: everything in the training frame except the row id and the target.
x_cols = [name for name in df.columns if name != "id" and name != "Attrition"]

In [5]:
# Oversample the positive class: the full training frame plus three extra
# copies of every row whose Attrition is 1.
att = df[df["Attrition"] == 1]
dff = pd.concat([df] + [att] * 3, axis=0)

# Shuffle all rows with a fixed seed, then renumber the index from zero.
# NOTE(review): the duplicated rows are created before any train/validation
# split, so the cross-validation run on this frame later in the notebook can
# place copies of the same row in both the training and the validation fold.
# Scores obtained on this frame are therefore likely optimistic -- confirm.
shuffled_df = dff.sample(frac=1.0, random_state=42).reset_index(drop=True)


In [6]:
# Feature matrix and single-column target frame taken from the original
# (non-oversampled) training data.
X, y = df[x_cols], df[y_cols]

In [7]:
# Features and target from the oversampled, shuffled training frame.
Xd, yd = shuffled_df[x_cols], shuffled_df[y_cols]

# Test-set features, restricted to the same columns used for training.
xtest = test[x_cols]

In [8]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def cross_validation(X,y,estimator,n_splits, xtest=None):
    """Score ``estimator`` with stratified K-fold CV and optionally predict ``xtest``.

    For every fold a fresh copy of ``estimator`` is fitted on the training part
    and evaluated on the held-out part. The mean accuracy, F1 and ROC AUC over
    all folds are printed, rounded to four decimals.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame or pandas.Series
        Binary target aligned with ``X``; also indexed with ``.iloc``.
    estimator : object
        Scikit-learn style classifier providing ``fit``, ``predict`` and
        ``predict_proba``. It is cloned for each fold, so the object passed in
        is never refitted or otherwise modified.
    n_splits : int
        Number of stratified folds.
    xtest : pandas.DataFrame, optional
        Extra feature matrix to predict. When given, each fold's model predicts
        it and the fold predictions are averaged.

    Returns
    -------
    numpy.ndarray or None
        Fold-averaged probability of the second class (column 1 of
        ``predict_proba``) for every row of ``xtest``; ``None`` when ``xtest``
        is not supplied.
    """
    # Local import: ``clone`` is only needed here and keeps this cell self-contained.
    from sklearn.base import clone

    skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

    acc_scr = []
    f1_scr = []
    auc_scr = []

    # Accumulator for the fold-averaged test predictions (only when requested).
    preds = np.zeros(len(xtest)) if xtest is not None else None

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit an unfitted copy with identical hyper-parameters so the caller's
        # estimator (e.g. a model already trained on the full data) is not
        # overwritten by a model trained on a single fold.
        model = clone(estimator)
        # Flatten a single-column target frame to 1-D, the shape ``fit`` expects.
        model.fit(X_train, np.ravel(y_train))

        y_preds = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        if xtest is not None:
            # Each fold contributes 1 / n_splits of the final averaged prediction.
            preds += model.predict_proba(xtest)[:, 1] / n_splits

        acc_scr.append(accuracy_score(y_test, y_preds))
        f1_scr.append(f1_score(y_test, y_preds))
        auc_scr.append(roc_auc_score(y_test, y_proba))

    avg_acc = round(np.mean(acc_scr), 4)
    avg_f1 = round(np.mean(f1_scr), 4)
    avg_roc = round(np.mean(auc_scr), 4)

    print(f"Average accuracy score of {n_splits} is : {avg_acc}")
    print(f"Average f1 score of {n_splits} is       : {avg_f1}")
    print(f"Average roc score of {n_splits} is      : {avg_roc}")

    return preds

In [9]:
import warnings
warnings.filterwarnings("ignore")

In [10]:
# Search space shared by all four searches (a single candidate).
param_grid = {
    "n_estimators": [1000],
    "learning_rate": [0.1],
}

# Base learners that AdaBoost will boost.
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=1000, random_state=42)

# One AdaBoost model per base learner.
ada_dt = AdaBoostClassifier(estimator=dt, random_state=42)
ada_rf = AdaBoostClassifier(estimator=rf, random_state=42)

# Settings common to every grid search: 5-fold CV scored by ROC AUC.
search_options = dict(
    param_grid=param_grid,
    cv=5,
    verbose=True,
    scoring="roc_auc",
)

# Two searches per model; the "_d" variants are the ones fitted on the
# oversampled data in the following cell.
grid_ada_dt = GridSearchCV(estimator=ada_dt, **search_options)
grid_ada_dt_d = GridSearchCV(estimator=ada_dt, **search_options)

grid_ada_rf = GridSearchCV(estimator=ada_rf, **search_options)
grid_ada_rf_d = GridSearchCV(estimator=ada_rf, **search_options)

In [11]:
# Run the four searches: the plain variants on the original data (X, y) and
# the "_d" variants on the oversampled, shuffled data (Xd, yd).
grid_ada_dt.fit(X,y)
grid_ada_dt_d.fit(Xd,yd)
grid_ada_rf.fit(X,y)
# Last expression of the cell, so its return value is what the notebook displays.
grid_ada_rf_d.fit(Xd,yd)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [12]:
# Report the winning parameter combination of each search, in the order
# dt, dt (oversampled), rf, rf (oversampled).
for search in (grid_ada_dt, grid_ada_dt_d, grid_ada_rf, grid_ada_rf_d):
    print(search.best_params_)

{'learning_rate': 0.1, 'n_estimators': 1000}
{'learning_rate': 0.1, 'n_estimators': 1000}
{'learning_rate': 0.1, 'n_estimators': 1000}
{'learning_rate': 0.1, 'n_estimators': 1000}


In [13]:
# Report the best mean cross-validated score of each search, in the order
# dt, dt (oversampled), rf, rf (oversampled).
for search in (grid_ada_dt, grid_ada_dt_d, grid_ada_rf, grid_ada_rf_d):
    print(search.best_score_)

0.5973889143380668
0.9330554283096657
0.831727840128264
0.998466187585891


In [14]:
# Pull the refitted best model out of each completed search.
best_ada_dt, best_ada_dt_d, best_ada_rf, best_ada_rf_d = (
    search.best_estimator_
    for search in (grid_ada_dt, grid_ada_dt_d, grid_ada_rf, grid_ada_rf_d)
)

In [15]:
# Each best model paired with the data set its search was fitted on.
evaluation_runs = (
    (X, y, best_ada_dt),
    (Xd, yd, best_ada_dt_d),
    (X, y, best_ada_rf),
    (Xd, yd, best_ada_rf_d),
)

# Evaluate the models one after another with 5-fold CV, printing a ruler
# between consecutive reports.
for position, (features, target, model) in enumerate(evaluation_runs):
    if position > 0:
        print("-"*80)
    cross_validation(features, target, model, 5, None)


Average accuracy score of 5 is : 0.8151
Average f1 score of 5 is       : 0.2639
Average roc score of 5 is      : 0.5838
--------------------------------------------------------------------------------
Average accuracy score of 5 is : 0.9249
Average f1 score of 5 is       : 0.9033
Average roc score of 5 is      : 0.941
--------------------------------------------------------------------------------
Average accuracy score of 5 is : 0.8831
Average f1 score of 5 is       : 0.0988
Average roc score of 5 is      : 0.8243
--------------------------------------------------------------------------------
Average accuracy score of 5 is : 0.982
Average f1 score of 5 is       : 0.9749
Average roc score of 5 is      : 0.9969


In [16]:
# Re-run 5-fold CV for the random-forest AdaBoost model on the original data,
# this time also collecting the fold-averaged predictions for the test features.
val = cross_validation(X, y, best_ada_rf, n_splits=5, xtest=xtest)

Average accuracy score of 5 is : 0.8831
Average f1 score of 5 is       : 0.0988
Average roc score of 5 is      : 0.8243


In [17]:
# Load the submission template and set its Attrition column to the
# fold-averaged test predictions.
# NOTE(review): assumes the template rows are in the same order as test.csv -- confirm.
samp_sub = pd.read_csv("dataset/sample_submission.csv").assign(Attrition=val)

# Write the submission file without the index column.
samp_sub.to_csv("est_sub_ada_rf.csv", index=False)