<a href="https://colab.research.google.com/github/sumedhakoranga/employee_future_prediction/blob/main/xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset link: https://www.kaggle.com/tejashvi14/employee-future-prediction

# Uploading dataset

In [None]:
from google.colab import files

uploaded = files.upload()

Saving Employee.csv to Employee.csv


# Initialization

In [None]:
import pandas as pd
import numpy as np

from itertools import product

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv('Employee.csv')

X = df.drop(['LeaveOrNot'], axis=1)
y = df['LeaveOrNot']

# Preparing data

In [None]:
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=42)

In [None]:
numerical = ['Age']
categorical = ['Education', 'JoiningYear', 'City', 'PaymentTier', 'Gender', 'EverBenched', 'ExperienceInCurrentDomain']

# Creating a Pipeline

In [None]:
def create_new_pipeline(params):
    numerical_transformer = SimpleImputer(strategy='median')

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoding', OneHotEncoder(drop='first'))
    ])

    preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical),
        ('categorical', categorical_transformer, categorical)
    ])

    model = XGBClassifier(
        n_jobs=-1,
        random_state=42,
        **params
    )

    pipeline = Pipeline(
    steps=[
           ('preprocessing', preprocessor),
           ('model', model)
          ]
    )

    return pipeline

# Hyperparameter Tuning

In [None]:
search_space = {
    'n_estimators': np.linspace(10, 700, num=7).astype('int'),
    'max_depth': np.linspace(1, 10, num=5).astype('int'),
    'learning_rate': np.logspace(-3, 1, num=9),
    'reg_alpha': np.logspace(-1, 1, num=5),
    'reg_lambda': np.logspace(-1, 1, num=5)
}

In [None]:
max_score = 0
best_params = {}

for val in product(*search_space.values()):
    params = {}
    for i, param in enumerate(search_space.keys()):
        params[param] = val[i]
    print(params)

    pipeline = create_new_pipeline(params)

    pipeline.fit(X_train, y_train)

    score = pipeline.score(X_val, y_val)
    if score > max_score:
        max_score = score
        best_params = params
    print(f'Score: {score}\tBest score: {max_score}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'n_estimators': 470, 'max_depth': 7, 'learning_rate': 10.0, 'reg_alpha': 0.1, 'reg_lambda': 0.1}
Score: 0.3125671321160043	Best score: 0.8592910848549946
{'n_estimators': 470, 'max_depth': 7, 'learning_rate': 10.0, 'reg_alpha': 0.1, 'reg_lambda': 0.31622776601683794}
Score: 0.3125671321160043	Best score: 0.8592910848549946
{'n_estimators': 470, 'max_depth': 7, 'learning_rate': 10.0, 'reg_alpha': 0.1, 'reg_lambda': 1.0}
Score: 0.6154672395273899	Best score: 0.8592910848549946
{'n_estimators': 470, 'max_depth': 7, 'learning_rate': 10.0, 'reg_alpha': 0.1, 'reg_lambda': 3.1622776601683795}
Score: 0.6842105263157895	Best score: 0.8592910848549946
{'n_estimators': 470, 'max_depth': 7, 'learning_rate': 10.0, 'reg_alpha': 0.1, 'reg_lambda': 10.0}
Score: 0.3125671321160043	Best score: 0.8592910848549946
{'n_estimators': 470, 'max_depth': 7, 'learning_rate': 10.0, 'reg_alpha': 0.31622776601683794, 'reg_lambda': 0.1}
Score: 0.31256

In [None]:
best_params

{'learning_rate': 1.0,
 'max_depth': 5,
 'n_estimators': 10,
 'reg_alpha': 0.31622776601683794,
 'reg_lambda': 0.31622776601683794}

In [None]:
max_score

0.8592910848549946

# Training

In [None]:
pipeline = create_new_pipeline(best_params)

In [None]:
pipeline.fit(X_full_train, y_full_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('numerical',
                                                  SimpleImputer(strategy='median'),
                                                  ['Age']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoding',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['Education', 'JoiningYear',
                                                   'City', 'PaymentTier',
                                                   'Gender', 'EverBenched',
                                                   'ExperienceInCurrentDomain'])])),
               

# Validation

In [None]:
pipeline.score(X_full_train, y_full_train)

0.8737238044062332