In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [76]:
df=pd.read_csv('employee_data.csv')

In [77]:
df.head()

Unnamed: 0,EmployeeID,Productivity,Years_with_Company,Last_Performance_Score,Age,Department,Education_Level,Turnover
0,1,64.14,10,1,49,IT,1,0
1,2,59.94,14,5,23,IT,4,0
2,3,69.3,11,20,27,Sales,3,0
3,4,65.05,18,8,30,Finance,4,0
4,5,64.73,12,43,54,Marketing,1,0


In [78]:
df.describe()

Unnamed: 0,EmployeeID,Productivity,Years_with_Company,Last_Performance_Score,Age,Education_Level,Turnover
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,50.5,73.493,10.33,47.27,41.69,2.53,0.23
std,29.011492,15.034512,5.713806,27.479232,12.087806,1.17598,0.422953
min,1.0,32.78,1.0,1.0,22.0,1.0,0.0
25%,25.75,62.7225,5.0,27.5,32.0,1.75,0.0
50%,50.5,73.045,10.0,47.0,42.0,2.0,0.0
75%,75.25,83.68,15.0,70.5,51.25,4.0,0.0
max,100.0,107.73,20.0,98.0,64.0,4.0,1.0


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   EmployeeID              100 non-null    int64  
 1   Productivity            100 non-null    float64
 2   Years_with_Company      100 non-null    int64  
 3   Last_Performance_Score  100 non-null    int64  
 4   Age                     100 non-null    int64  
 5   Department              100 non-null    object 
 6   Education_Level         100 non-null    int64  
 7   Turnover                100 non-null    int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 6.4+ KB


In [80]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import statsmodels.api as sm

In [81]:
df=pd.get_dummies(df, columns=['Department'], drop_first=True)

In [82]:
df['Department_HR'] = df['Department_HR'].astype(int)
df['Department_IT'] = df['Department_IT'].astype(int)
df['Department_Marketing'] = df['Department_Marketing'].astype(int)
df['Department_Sales'] = df['Department_Sales'].astype(int)

In [83]:
df.head()

Unnamed: 0,EmployeeID,Productivity,Years_with_Company,Last_Performance_Score,Age,Education_Level,Turnover,Department_HR,Department_IT,Department_Marketing,Department_Sales
0,1,64.14,10,1,49,1,0,0,1,0,0
1,2,59.94,14,5,23,4,0,0,1,0,0
2,3,69.3,11,20,27,3,0,0,0,0,1
3,4,65.05,18,8,30,4,0,0,0,0,0
4,5,64.73,12,43,54,1,0,0,0,1,0


In [84]:
X = df.drop(columns=['Turnover'])
y = df['Turnover']

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [86]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

In [87]:
Y_pred=log_reg.predict(X_test)

In [88]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [89]:
round(accuracy_score(y_test,Y_pred)*100)

73

In [90]:
confusion_matrix(y_test,Y_pred)

array([[21,  3],
       [ 5,  1]], dtype=int64)

In [91]:
# Adding a constant to the model
X_sm = sm.add_constant(X)

logit_model = sm.Logit(y, X_sm)
result = logit_model.fit()

# Printing the summary of the model
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.494070
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               Turnover   No. Observations:                  100
Model:                          Logit   Df Residuals:                       89
Method:                           MLE   Df Model:                           10
Date:                Fri, 26 Jul 2024   Pseudo R-squ.:                 0.08383
Time:                        13:08:29   Log-Likelihood:                -49.407
converged:                       True   LL-Null:                       -53.928
Covariance Type:            nonrobust   LLR p-value:                    0.5282
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     -2.3279      2.230     -1.044      0.297      -6.698      

In [101]:
report = classification_report(y_test, Y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.88      0.84        24
           1       0.25      0.17      0.20         6

    accuracy                           0.73        30
   macro avg       0.53      0.52      0.52        30
weighted avg       0.70      0.73      0.71        30



In [92]:
df['Turnover_Probability'] = log_reg.predict_proba(X)[:, 1]

In [93]:
df.head()

Unnamed: 0,EmployeeID,Productivity,Years_with_Company,Last_Performance_Score,Age,Education_Level,Turnover,Department_HR,Department_IT,Department_Marketing,Department_Sales,Turnover_Probability
0,1,64.14,10,1,49,1,0,0,1,0,0,0.521574
1,2,59.94,14,5,23,4,0,0,1,0,0,0.16258
2,3,69.3,11,20,27,3,0,0,0,0,1,0.11938
3,4,65.05,18,8,30,4,0,0,0,0,0,0.141946
4,5,64.73,12,43,54,1,0,0,0,1,0,0.706252


In [94]:
#none of the factors are statistically significant, meaning we don't have strong evidence that any of these factors are good predictors of turnover based on this model.

In [102]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [103]:
A = df.drop(columns=['Productivity'])
B = df['Productivity']

In [104]:
A_train, A_test, B_train, B_test = train_test_split(A, B, test_size=0.3, random_state=42)

In [105]:
lin_reg = LinearRegression()
lin_reg.fit(A_train, B_train)

In [106]:
B_pred = lin_reg.predict(A_test)

In [107]:
mse = mean_squared_error(B_test, B_pred)
r2 = r2_score(B_test, B_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 190.24009936403064
R-squared: 0.22983805124351142


In [111]:
df['Predicted_Productivity'] = lin_reg.predict(A)

In [112]:
df.head()

Unnamed: 0,EmployeeID,Productivity,Years_with_Company,Last_Performance_Score,Age,Education_Level,Turnover,Department_HR,Department_IT,Department_Marketing,Department_Sales,Turnover_Probability,Predicted_Productivity
0,1,64.14,10,1,49,1,0,0,1,0,0,0.521574,59.181268
1,2,59.94,14,5,23,4,0,0,1,0,0,0.16258,72.23658
2,3,69.3,11,20,27,3,0,0,0,0,1,0.11938,71.592918
3,4,65.05,18,8,30,4,0,0,0,0,0,0.141946,70.707362
4,5,64.73,12,43,54,1,0,0,0,1,0,0.706252,46.346494


In [113]:
average_productivity = df['Predicted_Productivity'].mean()

print("Average Predicted Productivity:", average_productivity)

Average Predicted Productivity: 73.0216087796796


Logistic Regression Model
Model Summary:

The logistic regression model was used to predict employee turnover.
The model did not identify any statistically significant predictors of turnover (all p-values > 0.05).
Pseudo R-squared value is 0.08383, indicating the model explains about 8.38% of the variance in turnover, which is relatively low.
Classification accuracy is 73%, but the model's performance for predicting the minority class (turnover) is poor, with low precision, recall, and F1-score.

Linear Regression Model
Model Summary:

The linear regression model was used to predict employee productivity.
Mean Squared Error (MSE): 190.24
R-squared: 0.23, indicating that about 23% of the variance in productivity is explained by the model.
The average predicted productivity for the entire employee base is approximately 73.02.