In [1]:
# Goal: predict employee satisfaction (a regression task) with decision trees, bagging, random forests,
# gradient boosting (including scikit-learn's histogram-based variant) and XGBoost.

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/employees-satisfaction-analysis/Employee Attrition.csv
/kaggle/input/employees-satisfaction-analysis/Questions.txt


In [3]:
# Read the employee CSV and discard the 'Emp ID' column in a single chained
# expression (presumably a row identifier with no predictive value - confirm).
dataset = (
    pd.read_csv('/kaggle/input/employees-satisfaction-analysis/Employee Attrition.csv')
    .drop(columns='Emp ID')
)

In [4]:
# Preview the first rows to sanity-check the load and the column names.
dataset.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
0,0.38,0.53,2.0,157.0,3.0,0.0,0.0,sales,low
1,0.8,0.86,5.0,262.0,6.0,0.0,0.0,sales,medium
2,0.11,0.88,7.0,272.0,4.0,0.0,0.0,sales,medium
3,0.72,0.87,5.0,223.0,5.0,0.0,0.0,sales,low
4,0.37,0.52,2.0,159.0,3.0,0.0,0.0,sales,low


In [5]:
# Column dtypes and non-null counts; a gap between the number of entries and
# the non-null counts means some rows contain missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15787 entries, 0 to 15786
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  float64
 3   average_montly_hours   14999 non-null  float64
 4   time_spend_company     14999 non-null  float64
 5   Work_accident          14999 non-null  float64
 6   promotion_last_5years  14999 non-null  float64
 7   dept                   14999 non-null  object 
 8   salary                 14999 non-null  object 
dtypes: float64(7), object(2)
memory usage: 1.1+ MB


In [6]:
# Show every row that has at least one missing value.
dataset[dataset.isnull().any(axis=1)]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
12783,,,,,,,,,
12784,,,,,,,,,
12785,,,,,,,,,
12786,,,,,,,,,
12787,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
13566,,,,,,,,,
13567,,,,,,,,,
13568,,,,,,,,,
13569,,,,,,,,,


In [7]:
# Keep only complete rows: any row with a missing value in any column is removed.
dataset = dataset.dropna(axis=0, how='any')

In [8]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [9]:
# Separate the regression target from the predictors, then hold out 25% of the
# rows as a test split (fixed seed so the split is reproducible).
target_column = 'satisfaction_level'
features = dataset.drop(columns=target_column)
target = dataset[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [11]:
categorical_columns = ['dept', 'salary']

# One-hot encode the two string columns and pass every other column through
# unchanged.
#  * handle_unknown='ignore' encodes a category that was absent when the
#    encoder was fitted as all zeros instead of raising at transform time.
#  * sparse_threshold=0 makes the transformer always return a dense array
#    rather than leaving the dense-vs-sparse decision to the data's density.
column_transformer_pipeline = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)
pipeline = Pipeline(steps=[('column_transformer', column_transformer_pipeline)])

# Fit on the training split only; the test split is transformed later with the
# categories learned here.
preprocessor = pipeline.fit(X=X_train)

In [12]:
# Training features before encoding (still a DataFrame with the string columns).
X_train.head()

Unnamed: 0,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
15268,0.86,6.0,139.0,6.0,0.0,0.0,technical,medium
1934,0.93,4.0,225.0,5.0,0.0,0.0,sales,medium
7900,0.71,5.0,243.0,3.0,0.0,0.0,technical,medium
2952,0.62,4.0,217.0,2.0,0.0,0.0,support,medium
4367,0.53,3.0,211.0,4.0,1.0,0.0,sales,low


In [13]:
# Matching target values for the training rows.
y_train.head()

15268    0.38
1934     0.78
7900     0.56
2952     0.96
4367     0.83
Name: satisfaction_level, dtype: float64

In [14]:
# Encode both splits with the preprocessor fitted on the training data.
# NOTE(review): this rebinds X_train / X_test from DataFrames to the encoded
# arrays, so the cell is not meant to be re-run without re-running the split.
X_train, X_test = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

In [15]:
# Inspect the encoded training matrix produced by the preprocessor.
X_train

array([[0., 0., 0., ..., 6., 0., 0.],
       [0., 0., 0., ..., 5., 0., 0.],
       [0., 0., 0., ..., 3., 0., 0.],
       ...,
       [0., 0., 0., ..., 2., 0., 0.],
       [0., 0., 0., ..., 5., 0., 0.],
       [0., 0., 0., ..., 4., 0., 0.]])

# Decision Trees

In [16]:
from sklearn.tree import DecisionTreeRegressor

In [17]:
# Baseline model: a single regression tree. Only the split criterion, the
# splitter strategy and the seed are set explicitly.
dtr = DecisionTreeRegressor(
    random_state=42,
    splitter='best',
    criterion='squared_error',
)

In [18]:
# 10-fold cross-validation of the decision tree on the training split.
# Both metrics are scored from a single round of fits; running the CV once per
# metric would refit the same model on the same folds a second time.
cv_scores = cross_validate(dtr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10, error_score='raise')
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']

# Report mean +/- standard deviation across the folds.
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")

R^2: 0.13 +/- 0.075
-ve RMSE: -0.231 +/- 0.009


In [19]:
# Refit the tree on the whole training split and score it on the held-out test split.
dtr.fit(X=X_train, y=y_train)
y_pred = dtr.predict(X_test)
# Built-in round() works whether r2_score returns a NumPy scalar or a plain
# Python float; a plain float has no .round() method.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.119


# Bagging

In [20]:
from sklearn.ensemble import BaggingRegressor

In [21]:
# Bagging ensemble: 100 copies of the decision tree above, each trained on a
# bootstrap sample of the training rows; no out-of-bag scoring.
bgr = BaggingRegressor(
    estimator=dtr,
    n_estimators=100,
    bootstrap=True,
    oob_score=False,
    random_state=42,
    n_jobs=-1,
)

In [22]:
# 10-fold cross-validation of the bagging ensemble on the training split.
# Both metrics are scored from a single round of fits; running the CV once per
# metric would refit the same ensemble on the same folds a second time.
cv_scores = cross_validate(bgr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10, n_jobs=-1)
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']

# Report mean +/- standard deviation across the folds.
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")

R^2: 0.514 +/- 0.032
-ve RMSE: -0.173 +/- 0.005


In [23]:
# Refit the bagging ensemble on the whole training split and score it on the
# held-out test split.
bagging_regressor = bgr.fit(X=X_train, y=y_train)
y_pred = bagging_regressor.predict(X_test)
# Built-in round() works whether r2_score returns a NumPy scalar or a plain
# Python float; a plain float has no .round() method.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.496


# Random Forests

In [24]:
from sklearn.ensemble import RandomForestRegressor

In [25]:
# Random forest: 100 bootstrap-trained trees with max_features='sqrt' limiting
# the features considered at each split; no out-of-bag scoring.
rfr = RandomForestRegressor(
    n_estimators=100,
    max_features='sqrt',
    criterion='squared_error',
    bootstrap=True,
    oob_score=False,
    random_state=42,
    n_jobs=-1,
)

In [26]:
# 10-fold cross-validation of the random forest on the training split.
# Both metrics are scored from a single round of fits; running the CV once per
# metric would refit the same forest on the same folds a second time.
cv_scores = cross_validate(rfr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10, n_jobs=-1)
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']

# Report mean +/- standard deviation across the folds.
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")

R^2: 0.513 +/- 0.027
-ve RMSE: -0.173 +/- 0.004


In [27]:
# Refit the random forest on the whole training split and score it on the
# held-out test split.
randomforest_regressor = rfr.fit(X=X_train, y=y_train)
y_pred = randomforest_regressor.predict(X_test)
# Built-in round() works whether r2_score returns a NumPy scalar or a plain
# Python float; a plain float has no .round() method.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.497


# Gradient Boosting

In [28]:
from sklearn.ensemble import GradientBoostingRegressor

In [29]:
# Gradient boosting with squared-error loss: many boosting stages (2000)
# combined with a very small learning rate (0.001).
gbr = GradientBoostingRegressor(
    n_estimators=2000,
    learning_rate=0.001,
    loss='squared_error',
    random_state=42,
)

In [30]:
# 10-fold cross-validation of the gradient boosting model on the training split.
# Both metrics are scored from a single round of fits; running the CV once per
# metric would refit the same 2000-stage model on the same folds a second time.
cv_scores = cross_validate(gbr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10, n_jobs=-1)
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']

# Report mean +/- standard deviation across the folds.
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")

R^2: 0.407 +/- 0.024
-ve RMSE: -0.191 +/- 0.004


In [31]:
# Refit the gradient boosting model on the whole training split and score it
# on the held-out test split.
gradboost_regressor = gbr.fit(X=X_train, y=y_train)
y_pred = gradboost_regressor.predict(X_test)
# Built-in round() works whether r2_score returns a NumPy scalar or a plain
# Python float; a plain float has no .round() method.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.387


# Histogram-Based GBR

In [32]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [33]:
# Histogram-based gradient boosting with squared-error loss: up to 1000
# boosting iterations at a learning rate of 0.02.
hgbr = HistGradientBoostingRegressor(
    max_iter=1000,
    learning_rate=2e-2,
    loss='squared_error',
    random_state=42,
)

In [34]:
# 10-fold cross-validation of the histogram-based model on the training split.
# Both metrics are scored from a single round of fits; running the CV once per
# metric would refit the same model on the same folds a second time.
cv_scores = cross_validate(hgbr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10, n_jobs=-1)
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']

# Report mean +/- standard deviation across the folds.
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")

R^2: 0.469 +/- 0.028
-ve RMSE: -0.181 +/- 0.005


In [35]:
# Refit the histogram-based model on the whole training split and score it on
# the held-out test split.
hgradboost_regressor = hgbr.fit(X=X_train, y=y_train)
y_pred = hgradboost_regressor.predict(X_test)
# Built-in round() works whether r2_score returns a NumPy scalar or a plain
# Python float; a plain float has no .round() method.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.452


# XGBoost

In [36]:
from xgboost import XGBRegressor

In [37]:
# XGBoost regressor: 1000 boosting rounds with a small step size.
# The step size is passed as `learning_rate`, the parameter the scikit-learn
# wrapper declares, instead of the native alias `eta`, which the wrapper only
# accepts through **kwargs.
xgbr = XGBRegressor(n_estimators=1000, learning_rate=0.003, random_state=42)

In [38]:
# 10-fold cross-validation of the XGBoost model on the training split.
# Both metrics are scored from a single round of fits; running the CV once per
# metric would refit the same model on the same folds a second time.
cv_scores = cross_validate(xgbr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10)
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']

# Report mean +/- standard deviation across the folds.
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")

R^2: 0.46 +/- 0.028
-ve RMSE: -0.182 +/- 0.005


In [39]:
# Refit the XGBoost model on the whole training split and score it on the
# held-out test split.
xgboost_regressor = xgbr.fit(X=X_train, y=y_train)
y_pred = xgboost_regressor.predict(X_test)
# Built-in round() works whether r2_score returns a NumPy scalar or a plain
# Python float; a plain float has no .round() method.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.44
