In [15]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score,classification_report
import matplotlib.pyplot as plt

In [16]:
data = pd.read_csv("modified_employee_turnover.csv")

In [17]:
if data.isnull().sum().sum() > 0:
    print("Missing Values")
    exit()

In [18]:
print("Class Distribution:\n", data['Employee_Turnover'].value_counts(normalize=True))

Class Distribution:
 Employee_Turnover
0    0.502222
1    0.497778
Name: proportion, dtype: float64


In [19]:
X = data.drop('Employee_Turnover', axis=1)
y = data['Employee_Turnover']

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,stratify=y, random_state=42)

In [21]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [22]:
C_values = np.logspace(-3,1,20)
l1_ratios = [0.1,.5,.7,.9,.95,.99,1.0]
kf = KFold(n_splits=10, shuffle=True, random_state=42)

best_f1 = 0
best_params = {}

for C in C_values:
    for l1_ratio in l1_ratios:
        f1_scores = []
        for train_idx, val_idx in kf.split(X_train_scaled):
            X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

            model = LogisticRegression(penalty='elasticnet', C=C, l1_ratio=l1_ratio, solver='saga', max_iter=1000,random_state=42, class_weight='balanced')
            model.fit(X_tr, y_tr)
            y_pred = model.predict(X_val)
            f1_scores.append(f1_score(y_val, y_pred))

        mean_f1 = np.mean(f1_scores)
        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_params = {'C': C, 'l1_ratio':l1_ratio}

print(f"Best Parameters: C={best_params['C']}, l1_ratio={best_params['l1_ratio']}")
print(f"Best Cross-Validated F1-Score: {best_f1:.4f}")

Best Parameters: C=0.20691380811147903, l1_ratio=0.7
Best Cross-Validated F1-Score: 0.8511


In [23]:
final_model = LogisticRegression(penalty='elasticnet', C= best_params['C'], l1_ratio=best_params['l1_ratio'], solver='saga', max_iter=1000, random_state=42, class_weight='balanced')
final_model.fit(X_train_scaled, y_train)

In [24]:
y_train_pred = final_model.predict(X_train_scaled)
print(f"Accuracy : {accuracy_score(y_train, y_train_pred):.4f}")
print(f"F1-Score: {f1_score(y_train, y_train_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_train, y_train_pred))

Accuracy : 0.8540
F1-Score: 0.8538

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85       475
           1       0.85      0.86      0.85       470

    accuracy                           0.85       945
   macro avg       0.85      0.85      0.85       945
weighted avg       0.85      0.85      0.85       945



In [25]:
y_test_pred = final_model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"F1-Score: {f1_score(y_test, y_test_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Accuracy: 0.8963
F1-Score: 0.8960

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.90      0.90       203
           1       0.90      0.90      0.90       202

    accuracy                           0.90       405
   macro avg       0.90      0.90      0.90       405
weighted avg       0.90      0.90      0.90       405



In [26]:
test_results = X_test.copy()
test_results['Actual_Turnover'] = y_test
test_results['Predicted_Turnover'] = y_test_pred
test_results['Turnover_Probability'] = final_model.predict_proba(X_test_scaled)[:,1]
test_results['Index'] = X_test.index

In [27]:
leavers = test_results[test_results['Predicted_Turnover'] == 1]

In [28]:
if not leavers.empty:
    columns_to_show = ['Index', 'Actual_Turnover', 'Predicted_Turnover', 'Turnover_Probability',
                       'Job_Satisfaction', 'Years_At_Company', 'Monthly_Income', 'Department']
    print(leavers[columns_to_show].sort_values(by='Turnover_Probability', ascending=False))
    print(f"\nTotal number of employees predicted to leave: {len(leavers)}")
else:
    print("No employees are predicted to leave.")

      Index  Actual_Turnover  Predicted_Turnover  Turnover_Probability  \
506     506                1                   1              0.999754   
554     554                1                   1              0.999442   
1079   1079                1                   1              0.999434   
669     669                1                   1              0.999081   
106     106                1                   1              0.999058   
...     ...              ...                 ...                   ...   
943     943                0                   1              0.539778   
789     789                0                   1              0.539616   
48       48                0                   1              0.534209   
276     276                0                   1              0.508051   
440     440                1                   1              0.502295   

      Job_Satisfaction  Years_At_Company  Monthly_Income  Department  
506           0.953170          0.908092