<a href="https://colab.research.google.com/github/sumedhakoranga/employee_future_prediction/blob/main/logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset link: https://www.kaggle.com/tejashvi14/employee-future-prediction

# Uploading dataset

In [None]:
from pathlib import Path

# Only open Colab's interactive file picker when the dataset is not already in
# the working directory. This keeps "Restart & Run All" from blocking on a
# manual upload every time and lets the notebook run outside Colab once
# Employee.csv has been placed next to it.
if Path('Employee.csv').exists():
    uploaded = {}
else:
    from google.colab import files

    uploaded = files.upload()

Saving Employee.csv to Employee.csv


# Initialization

In [None]:
import warnings
from itertools import product

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore")

In [None]:
# Read the raw employee records, then separate the target column
# ('LeaveOrNot') from the remaining feature columns.
df = pd.read_csv('Employee.csv')

y = df['LeaveOrNot']
X = df.drop(columns=['LeaveOrNot'])

# Preparing data

In [None]:
# Two-stage split: first hold out 20% of all rows as the test set, then take
# 25% of the remaining 80% as the validation set (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train,
    y_full_train,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Column routed through the numerical branch of the preprocessing step.
numerical = ['Age']

# Columns routed through the categorical branch (imputed, then one-hot encoded).
categorical = [
    'Education',
    'JoiningYear',
    'City',
    'PaymentTier',
    'Gender',
    'EverBenched',
    'ExperienceInCurrentDomain',
]

# Creating a Pipeline

In [None]:
def create_new_pipeline(params):
    """Build an unfitted preprocessing + elastic-net logistic regression pipeline.

    Parameters
    ----------
    params : dict
        Extra keyword arguments forwarded to ``LogisticRegression``. The
        hyperparameter search in this notebook passes ``C`` and ``l1_ratio``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A pipeline with two steps: ``'preprocessing'`` (a ``ColumnTransformer``
        over the module-level ``numerical`` and ``categorical`` column lists)
        and ``'model'`` (the logistic regression estimator).
    """
    # Median-impute, then standardize. The saga solver is only guaranteed to
    # converge quickly when features are on a similar scale, and the one-hot
    # columns produced below are all 0/1.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Fill gaps with the most frequent category, then one-hot encode while
    # dropping the first level of each feature.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoding', OneHotEncoder(drop='first'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', numerical_transformer, numerical),
            ('categorical', categorical_transformer, categorical)
        ]
    )

    # Elastic-net regularization is only supported by the saga solver.
    model = LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        random_state=42,
        **params
    )

    pipeline = Pipeline(
        steps=[
            ('preprocessing', preprocessor),
            ('model', model)
        ]
    )

    return pipeline

# Hyperparameter Tuning

In [None]:
# Hyperparameter grid: every combination of C and l1_ratio is tried.
# l1_ratio covers 10 evenly spaced values from 0 to 1 inclusive.
search_space = dict(
    C=[0.1, 0.5, 1, 5, 10],
    l1_ratio=np.linspace(0, 1, num=10),
)

In [None]:
# Exhaustive grid search: fit a fresh pipeline on the training split for every
# combination in search_space and keep the one scoring highest on validation.
best_params = {}
max_score = 0

for val in product(*search_space.values()):
    # Pair each hyperparameter name with its value for this grid point.
    params = dict(zip(search_space, val))
    print(params)

    pipeline = create_new_pipeline(params)
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_val, y_val)

    # Strict '>' keeps the earliest combination when scores tie.
    if score > max_score:
        max_score, best_params = score, params
    print(f'Score: {score}\tBest score: {max_score}')

{'C': 0.1, 'l1_ratio': 0.0}
Score: 0.8088077336197637	Best score: 0.8088077336197637
{'C': 0.1, 'l1_ratio': 0.1111111111111111}
Score: 0.8066595059076263	Best score: 0.8088077336197637
{'C': 0.1, 'l1_ratio': 0.2222222222222222}
Score: 0.8088077336197637	Best score: 0.8088077336197637
{'C': 0.1, 'l1_ratio': 0.3333333333333333}
Score: 0.807733619763695	Best score: 0.8088077336197637
{'C': 0.1, 'l1_ratio': 0.4444444444444444}
Score: 0.8088077336197637	Best score: 0.8088077336197637
{'C': 0.1, 'l1_ratio': 0.5555555555555556}
Score: 0.8088077336197637	Best score: 0.8088077336197637
{'C': 0.1, 'l1_ratio': 0.6666666666666666}
Score: 0.807733619763695	Best score: 0.8088077336197637
{'C': 0.1, 'l1_ratio': 0.7777777777777777}
Score: 0.8088077336197637	Best score: 0.8088077336197637
{'C': 0.1, 'l1_ratio': 0.8888888888888888}
Score: 0.8066595059076263	Best score: 0.8088077336197637
{'C': 0.1, 'l1_ratio': 1.0}
Score: 0.807733619763695	Best score: 0.8088077336197637
{'C': 0.5, 'l1_ratio': 0.0}
Score

In [None]:
# Hyperparameter combination that achieved the highest validation score in the search loop.
best_params

{'C': 0.5, 'l1_ratio': 0.0}

In [None]:
# Highest validation score (pipeline.score on X_val / y_val) seen during the search.
max_score

0.8120300751879699

# Training

In [None]:
# Build a fresh, unfitted pipeline configured with the best hyperparameters found above.
pipeline = create_new_pipeline(best_params)

In [None]:
# Refit on the combined train + validation rows (everything except the test split).
pipeline.fit(X_full_train, y_full_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('numerical',
                                                  SimpleImputer(strategy='median'),
                                                  ['Age']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoding',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['Education', 'JoiningYear',
                                                   'City', 'PaymentTier',
                                                   'Gender', 'EverBenched',
                                                   'ExperienceInCurrentDomain'])])),
               

# Validation

In [None]:
# The pipeline was fitted on X_full_train, so its score there is not an
# estimate of generalization. Report accuracy on the untouched test split too;
# the gap between the two numbers indicates how much the model overfits.
{
    'full_train_accuracy': pipeline.score(X_full_train, y_full_train),
    'test_accuracy': pipeline.score(X_test, y_test),
}

0.8011821601289629