# Import libraries

In [90]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder

# Load and check dataset

In [92]:
df_data = pd.read_csv('dataset/HR_comma_sep.csv')
df_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [94]:
print('Summary train data')
print('*'*50)

print(f'Shape: {df_data.shape}')
print('*'*50)

print(f'Data description: \n {df_data.describe()}')
print('*'*50)


Summary train data
**************************************************
Shape: (14999, 10)
**************************************************
Data description: 
        satisfaction_level  last_evaluation  number_project  \
count        14999.000000     14999.000000    14999.000000   
mean             0.612834         0.716102        3.803054   
std              0.248631         0.171169        1.232592   
min              0.090000         0.360000        2.000000   
25%              0.440000         0.560000        3.000000   
50%              0.640000         0.720000        4.000000   
75%              0.820000         0.870000        5.000000   
max              1.000000         1.000000        7.000000   

       average_montly_hours  time_spend_company  Work_accident          left  \
count          14999.000000        14999.000000   14999.000000  14999.000000   
mean             201.050337            3.498233       0.144610      0.238083   
std               49.943099            1.

In [97]:
print(f'Data information: {df_data.info()}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
Data information: None


In [101]:
print('--- Check feature by feature ---')
print('*'*50)

for fe in df_data.columns:
    print('-'*20)
    print(df_data[fe].value_counts())

--- Check feature by feature ---
**************************************************
--------------------
0.10    358
0.11    335
0.74    257
0.77    252
0.84    247
       ... 
0.25     34
0.28     31
0.27     30
0.26     30
0.12     30
Name: satisfaction_level, Length: 92, dtype: int64
--------------------
0.55    358
0.50    353
0.54    350
0.51    345
0.57    333
       ... 
0.39     52
0.43     50
0.38     50
0.44     44
0.36     22
Name: last_evaluation, Length: 65, dtype: int64
--------------------
4    4365
3    4055
5    2761
2    2388
6    1174
7     256
Name: number_project, dtype: int64
--------------------
135    153
156    153
149    148
151    147
160    136
      ... 
297      7
288      6
299      6
96       6
303      6
Name: average_montly_hours, Length: 215, dtype: int64
--------------------
3     6443
2     3244
4     2557
5     1473
6      718
10     214
7      188
8      162
Name: time_spend_company, dtype: int64
--------------------
0    12830
1     2169
Name: Wo

# Explore Data Analysis

## Heatmap

In [104]:
# create a copy of original dataset
df_corr = df_data.copy()

In [108]:
# encode categorical features
encoder = OrdinalEncoder()
df_corr[['salary', 'Department']] = encoder.fit_transform(df_corr[['salary', 'Department']])

In [113]:
# create corrrelation matrix
df_corr = df_corr.corr()
df_corr_round = df_corr.round(3)

In [115]:
# draw headmap
fig = ff.create_annotated_heatmap(
            z=df_corr_round.to_numpy(),
            x=df_corr.columns.tolist(),
            y=df_corr.columns.tolist(),
            zmax=1, zmin=-1,
            showscale=True,
            hoverongaps=True,
            colorscale='Viridis',
            annotation_text=df_corr_round.to_numpy()
            )

fig.update_layout(
    margin = dict(t=10,r=10,b=10,l=10),
    showlegend = False,
    width = 800, height = 600
)

## Influence of number of working year to left/stay company decision

In [120]:
# create dataframe with number of working years in company and left information
df_tsc = df_data[['time_spend_company', 'left']]
df_tsc = df_tsc.groupby('time_spend_company')['left'].value_counts()
df_tsc = df_tsc.unstack()
df_tsc.fillna(0)

left,0,1
time_spend_company,Unnamed: 1_level_1,Unnamed: 2_level_1
2,3191.0,53.0
3,4857.0,1586.0
4,1667.0,890.0
5,640.0,833.0
6,509.0,209.0
7,188.0,0.0
8,162.0,0.0
10,214.0,0.0


In [126]:
fig = go.Figure(data=[
    go.Bar(name='Stay', x=df_tsc.index, y=df_tsc[0], text=df_tsc[0], textposition='auto'),
    go.Bar(name='Left', x=df_tsc.index, y=df_tsc[1], text=df_tsc[1], textposition='auto'),
])
# Change the bar mode
fig.update_layout(barmode='group',
                title="Statistic number of working years of employees",
                xaxis_title="Number of working years",
                yaxis_title="Number of employees",
                legend_title="Employee Type")
fig.show()

## Influence of monthly working hours to left/stay company decision

In [60]:
# create dataframe with average monthly working hours and left information
df_mh = df_data[['average_montly_hours', 'left']]

# draw violin chart
fig = px.violin(
                df_mh, 
                y="average_montly_hours", 
                color='left', 
                points='all', 
                box=True,
                title='Influence of monthly working hours to left/stay company decision: 0 - Stay, 1 - Left',
                labels={'average_montly_hours': 'Average Monthly Working Hours', 'left':'Left/Stay'})
fig.show()

## Compare percentage of leave the company between departments

In [192]:
# create dataframe with department and left information
df_dpm = df_data[['Department', 'left']]
df_dpm = df_dpm.groupby('Department')['left'].value_counts()
df_dpm = df_dpm.unstack()
df_dpm.columns = ['Stay', 'Left']
df_dpm['Percent'] = round((df_dpm['Left'] / df_dpm['Stay'])*100, 2)
df_dpm.sort_values('Percent', ascending=False, inplace=True)
df_dpm.style.background_gradient(cmap='plasma_r')

Unnamed: 0_level_0,Stay,Left,Percent
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hr,524,215,41.03
accounting,563,204,36.23
technical,2023,697,34.45
support,1674,555,33.15
sales,3126,1014,32.44
marketing,655,203,30.99
IT,954,273,28.62
product_mng,704,198,28.12
RandD,666,121,18.17
management,539,91,16.88


In [199]:
# draw bar chart
fig = px.bar(df_dpm, y='Percent', text='Percent', color='Percent')
fig.show()

## Influence of number of working projects current to left/stay company decision

In [206]:
# create dataframe with number of projects and left information
df_project = df_data[['number_project', 'left']]
df_project = df_project.groupby('number_project')['left'].value_counts()
df_project = df_project.unstack()
# df_project

In [213]:
# draw bar chart
fig = go.Figure(data=[
    go.Bar(name='Stay', x=df_project.index, y=df_project[0], text=df_project[0], textposition='auto'),
    go.Bar(name='Left', x=df_project.index, y=df_project[1], text=df_project[1], textposition='auto'),
])
fig.update_layout(barmode='group',
                title="Statistic number of projects of employees",
                xaxis_title="Number of projects",
                yaxis_title="Number of employees",
                legend_title="Employee Type")
fig.show()

## Influence of last evaluation to left/stay company decision

In [220]:
# create dataframe with last evaluation and left information
df_le = df_data[['last_evaluation', 'left']]

# draw violin chart
fig = px.violin(df_le, y="last_evaluation", color='left', points='all', box=True)
fig.show()

## Influence of salary level current to left/stay company decision

In [227]:
# create dataframe with salary and left information
df_salary = df_data[['salary', 'left']]
df_salary = df_salary.groupby('salary')['left'].value_counts()
df_salary = df_salary.unstack()
# df_salary

In [235]:
# draw bar chart
salary_level = ['High', 'Low', 'Medium']
fig = go.Figure(data=[
    go.Bar(name='Stay', x=salary_level, y=df_salary[0], text=df_salary[0], textposition='auto'),
    go.Bar(name='Left', x=salary_level, y=df_salary[1], text=df_salary[1], textposition='auto'),
])
fig.update_layout(barmode='group',
                title="Statistic salary of employees",
                xaxis_title="Salary Level",
                yaxis_title="Number of employees",
                legend_title="Employee Type")
fig.show()

## Influence of promotion last 5 years to left/stay company decision

In [252]:
df_pl5 = df_data[['promotion_last_5years', 'left']]
df_pl5 = df_pl5.groupby('promotion_last_5years')['left'].value_counts()
df_pl5 = df_pl5.unstack()
df_pl5.style.background_gradient(cmap='plasma_r')

left,0,1
promotion_last_5years,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11128,3552
1,300,19


In [261]:
# draw bar chart
fig = go.Figure(data=[
    go.Bar(name='Stay', x=df_pl5.index, y=df_pl5[0], text=df_pl5[0], textposition='auto'),
    go.Bar(name='Left', x=df_pl5.index, y=df_pl5[1], text=df_pl5[1], textposition='auto'),
])
fig.update_layout(barmode='group',
                title="Statistic promotion last 5 years of employees",
                xaxis_title="Promotion last 5 years",
                yaxis_title="Number of employees",
                legend_title="Employee Type")
fig.show()

## Influence of working accident to left/stay company decision

In [270]:
df_wa = df_data[['Work_accident', 'left']]
df_wa = df_wa.groupby('Work_accident')['left'].value_counts()
df_wa = df_wa.unstack()
df_wa.style.background_gradient(cmap='plasma_r')

left,0,1
Work_accident,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9428,3402
1,2000,169


In [280]:
fig = go.Figure(data=[
    go.Bar(name='Stay', x=df_wa.index, y=df_wa[0], text=df_wa[0], textposition='auto'),
    go.Bar(name='Left', x=df_wa.index, y=df_wa[1], text=df_wa[1], textposition='auto'),
])
# Change the bar mode
fig.update_layout(barmode='group',
                title="Statistic working accident of employees",
                xaxis_title="Woking Accident",
                yaxis_title="Number of employees",
                legend_title="Employee Type")
fig.show()

## Influence of sastisfaction level to left/stay company decision

In [290]:
# create dataframe with satisfaction level and left information
df_sl = df_data[['satisfaction_level', 'left']]

# draw violin chart
fig = px.violin(df_sl, y='satisfaction_level', color='left', points='all', box=True)
fig.show()

## Cluster employee with satisfaction level and last evaluation

In [None]:
# draw scatter chart with satisfaction level and last evaluation information of people who leave company
fig = px.scatter(
                    df_data, 
                    x=df_data['satisfaction_level'][df_data['left'] == 1], 
                    y=df_data['last_evaluation'][df_data['left'] == 1], 
                    width=1000,
                    height=800,
                    title='Employees who left',
                    labels={'x':'Satisfaction level', 'y':'Last Evaluation'},
                    # color=df_data['Work_accident'][df_data['left']==1],
                    # symbol=df_data['Work_accident'][df_data['left']==1]
                )
fig.show()

In [None]:
# draw scatter chart with satisfaction level and last evaluation information of people who stay company
fig = px.scatter(
                    df_data, 
                    x=df_data['satisfaction_level'][df_data['left'] == 0], 
                    y=df_data['last_evaluation'][df_data['left'] == 0], 
                    width=1000,
                    height=800,
                    title='Employees who left',
                    labels={'x':'Satisfaction level', 'y':'Last Evaluation'}
                )
fig.show()

In [None]:
# create dataframe with satisfaction level and last evaluation information
df_kmeans = df_data[df_data.left == 1].drop(['number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'left', 'promotion_last_5years', 'Department', 'salary'], axis=1)

# cluster people who leave company base on satisfaction level and last evaluation information
kmeans = KMeans(n_clusters=3, random_state=10).fit(df_kmeans)
# print(kmeans.cluster_centers_)
# print(kmeans.labels_)

# add cluster ID into original dataset
df_left = df_data[df_data.left == 1]
df_left['label'] = kmeans.labels_
df_left['label'] = df_left['label'].astype(str)

Unnamed: 0,satisfaction_level,last_evaluation
0,0.38,0.53
1,0.8,0.86
2,0.11,0.88
3,0.72,0.87
4,0.37,0.52


In [None]:
# draw scatter chart with clusters
fig = px.scatter(
                    df_left, 
                    x='satisfaction_level', 
                    y='last_evaluation', 
                    width=1000,
                    height=800,
                    color='label',
                    color_discrete_sequence=["red", "blue", "green"],
                    symbol='label',
                    title='Employees who left',
                    labels={'satisfaction_level':'Satisfaction level', 'last_evaluation':'Last Evaluation', 'label':'Cluster'}
                )
fig.show()

## Influence of average monthly working hours to people in 3 clusters

In [None]:
# draw displot chart
winner_month_hour = df_left[df_left.label == '0'].average_montly_hours
bad_match_month_hour = df_left[df_left.label == '1'].average_montly_hours
frustrated_month_hour = df_left[df_left.label == '2'].average_montly_hours

hist_data = [winner_month_hour, bad_match_month_hour, frustrated_month_hour]
group_labels = ['Winner', 'Bad Match', 'Frustrated']

fig = ff.create_distplot(hist_data, group_labels, bin_size=2.0, show_curve=True, show_hist=False)
fig.update_layout(title_text='Leavers: Hours per month distribution')
fig.show()
