### **Import Libraries & Read Data**

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# which also hides deprecation notices; consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

In [34]:
# load the HR employee-attrition CSV into a DataFrame
# (the path is relative, so the notebook must be run from the directory that contains `data/`)
employee_df = pd.read_csv('data/WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [35]:
# preview the first rows to sanity-check that the file parsed as expected
employee_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [36]:
# column names, non-null counts and dtypes
employee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [37]:
# number of missing values per column
employee_df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

<br/>

### **Overall Attrition Overview**

In [38]:
# headline numbers: how many employees left vs. stayed, and the share that left (%)
attrition_counts = employee_df['Attrition'].value_counts()
attrition_rate = attrition_counts['Yes'] / len(employee_df) * 100

# one print call, one line per metric
print(
    f"Total Employees: {len(employee_df)}",
    f"Employees Left: {attrition_counts['Yes']}",
    f"Employees Stayed: {attrition_counts['No']}",
    f"Attrition Rate: {attrition_rate:.2f}%",
    sep='\n',
)

Total Employees: 1470
Employees Left: 237
Employees Stayed: 1233
Attrition Rate: 16.12%


In [39]:
# shared palette: every figure below looks its colours up here by semantic role
colors = dict(
    primary='#3873E5',
    danger='#ff1e0e',
    success='#00cc44',
    warning='#ffaa00',
    info='#00b4d8',
    dark='#2d3436',
)

In [40]:
# donut chart of stayed vs. left, with the overall attrition rate written in the hole
attrition_pie = go.Pie(
    labels=['Stayed', 'Left'],
    values=[attrition_counts['No'], attrition_counts['Yes']],
    hole=0.4,
    marker={'colors': [colors['success'], colors['danger']]},
    textfont={'size': 14},
    textposition='outside',
    textinfo='label+percent+value',
)
fig_attrition = go.Figure(data=[attrition_pie])

# the annotation sits at the centre of the paper, i.e. inside the donut hole
center_label = {
    'text': f'{attrition_rate:.1f}%<br>Attrition',
    'x': 0.5,
    'y': 0.5,
    'font_size': 20,
    'showarrow': False,
}
fig_attrition.update_layout(
    title='Overall Employee Attrition Rate',
    height=400,
    annotations=[center_label],
)
fig_attrition.show()

<br/>

### **Attrition by Department**

In [41]:
# head-count per (department, attrition status), pivoted so each status becomes a column
dept_status_sizes = employee_df.groupby(['Department', 'Attrition']).size()
dept_attrition = dept_status_sizes.unstack(fill_value=0)

# Total must be added before the rate column so the row sum only covers the status counts
dept_attrition['Total'] = dept_attrition.sum(axis=1)
dept_attrition['Attrition_Rate'] = dept_attrition['Yes'].div(dept_attrition['Total']).mul(100)
print(dept_attrition)

Attrition                No  Yes  Total  Attrition_Rate
Department                                             
Human Resources          51   12     63       19.047619
Research & Development  828  133    961       13.839750
Sales                   354   92    446       20.627803


In [42]:
# grouped bars: stayed vs. left head-count within each department
dept_counts_long = (
    employee_df
    .groupby(['Department', 'Attrition'])
    .size()
    .reset_index(name='Count')
)

fig_dept = px.bar(
    dept_counts_long,
    x='Department',
    y='Count',
    color='Attrition',
    text='Count',
    barmode='group',
    color_discrete_map={'Yes': colors['danger'], 'No': colors['success']},
    title='Attrition by Department',
)
# print each bar's count just above it
fig_dept.update_traces(texttemplate='%{text}', textposition='outside')
fig_dept.update_layout(height=400)
fig_dept.show()

<br/>

### **Age Distribution**

In [43]:
# overlaid age histograms for leavers vs. stayers, with a box plot in the margin
attrition_palette = {'Yes': colors['danger'], 'No': colors['success']}

fig_age = px.histogram(
    employee_df,
    x='Age',
    color='Attrition',
    color_discrete_map=attrition_palette,
    nbins=20,
    opacity=0.7,  # semi-transparent so both overlaid series stay visible
    marginal='box',
    title='Age Distribution by Attrition Status',
)
fig_age.update_layout(height=500, barmode='overlay')
fig_age.show()

In [44]:
# mean / median / standard deviation of age for each attrition status
age_by_attrition = employee_df.groupby('Attrition')['Age']
age_stats = age_by_attrition.agg(['mean', 'median', 'std'])
print("Age Statistics by Attrition:", age_stats, sep='\n')

Age Statistics by Attrition:
                mean  median      std
Attrition                            
No         37.561233    36.0  8.88836
Yes        33.607595    32.0  9.68935


<br/>

### **Salary Analysis**

In [45]:
# box plot of monthly income per attrition status, with every employee drawn as a point
income_palette = {'Yes': colors['danger'], 'No': colors['success']}

fig_salary = px.box(
    employee_df,
    x='Attrition',
    y='MonthlyIncome',
    color='Attrition',
    color_discrete_map=income_palette,
    points='all',
    title='Monthly Income Distribution by Attrition Status',
)
# the x axis already names the groups, so the legend would be redundant
fig_salary.update_layout(height=500, showlegend=False)
fig_salary.show()

In [46]:
# mean / median / standard deviation of monthly income for each attrition status
income_by_attrition = employee_df.groupby('Attrition')['MonthlyIncome']
salary_stats = income_by_attrition.agg(['mean', 'median', 'std'])
print("Salary Statistics by Attrition:", salary_stats, sep='\n')

Salary Statistics by Attrition:
                  mean  median          std
Attrition                                  
No         6832.739659  5204.0  4818.208001
Yes        4787.092827  3202.0  3640.210367


<br/>

### **Satisfaction Metrics Analysis**

In [47]:
# Rating columns compared between leavers and stayers; each gets its own subplot.
satisfaction_cols = ['JobSatisfaction', 'EnvironmentSatisfaction', 
                    'RelationshipSatisfaction', 'WorkLifeBalance']

fig_satisfaction = make_subplots(
    rows=2, cols=2,
    subplot_titles=satisfaction_cols,
    vertical_spacing=0.15,
    horizontal_spacing=0.1
)

# (row, col) grid cell for each entry of satisfaction_cols, in the same order
positions = [(1,1), (1,2), (2,1), (2,2)]

for col, pos in zip(satisfaction_cols, positions):
    for attrition in ['Yes', 'No']:
        # number of employees at each rating level, ordered by level
        data = employee_df[employee_df['Attrition'] == attrition][col].value_counts().sort_index()
        color = colors['danger'] if attrition == 'Yes' else colors['success']
        
        fig_satisfaction.add_trace(
            go.Bar(
                x=data.index,
                y=data.values,
                name=f'Attrition: {attrition}',
                marker_color=color,
                # Tie the same-status bars of all four subplots to one legend entry.
                # Without this, only the first subplot's traces appear in the legend,
                # so clicking an entry toggled that subplot alone.
                legendgroup=attrition,
                showlegend=(pos == (1,1))  # Only show legend for first subplot
            ),
            row=pos[0], col=pos[1]
        )

fig_satisfaction.update_layout(
    title='Satisfaction Metrics by Attrition Status',
    height=600,
    barmode='group'
)
fig_satisfaction.show()

<br/>

### **Years at Company**

In [48]:
# overlaid tenure histograms for leavers vs. stayers, with a violin plot in the margin
fig_years = px.histogram(
    employee_df,
    x='YearsAtCompany',
    color='Attrition',
    title='Employee Tenure Distribution',
    nbins=20,
    color_discrete_map={'Yes': colors['danger'], 'No': colors['success']},
    marginal='violin',
    # With barmode='overlay' the bars are drawn on top of each other; at full opacity
    # one series hides the other. Match the age histogram's opacity so both stay visible.
    opacity=0.7
)
fig_years.update_layout(height=500, barmode='overlay')
fig_years.show()

<br/>

### **Overtime Analysis**

In [49]:
# row-normalised cross-tab: within each overtime group, the % who stayed vs. left
overtime_attrition = pd.crosstab(
    index=employee_df['OverTime'],
    columns=employee_df['Attrition'],
    normalize='index',
).mul(100)
print("Attrition Rate by Overtime Status:", overtime_attrition, sep='\n')

Attrition Rate by Overtime Status:
Attrition         No        Yes
OverTime                       
No         89.563567  10.436433
Yes        69.471154  30.528846


In [50]:
# grouped bars: stayed vs. left head-count for employees with and without overtime
overtime_counts_long = (
    employee_df
    .groupby(['OverTime', 'Attrition'])
    .size()
    .reset_index(name='Count')
)

fig_overtime = px.bar(
    overtime_counts_long,
    x='OverTime',
    y='Count',
    color='Attrition',
    text='Count',
    barmode='group',
    color_discrete_map={'Yes': colors['danger'], 'No': colors['success']},
    title='Impact of Overtime on Attrition',
)
# print each bar's count just above it
fig_overtime.update_traces(texttemplate='%{text}', textposition='outside')
fig_overtime.show()

<br/>

### **Job Role Analysis**

In [52]:
# per-role head-count by attrition status, plus total and attrition rate (%),
# ordered from the highest-attrition role to the lowest
role_attrition = (
    employee_df
    .groupby(['JobRole', 'Attrition'])
    .size()
    .unstack(fill_value=0)
    .assign(
        # assign() evaluates keywords in order, so Total only sums the status counts
        Total=lambda counts: counts.sum(axis=1),
        Attrition_Rate=lambda counts: (counts.get('Yes', 0) / counts['Total']) * 100,
    )
    .sort_values('Attrition_Rate', ascending=False)
)

In [53]:
# horizontal bars of attrition rate per job role, shaded green -> yellow -> red by rate
role_rates = role_attrition.reset_index()  # bring JobRole back as a regular column

fig_role = px.bar(
    role_rates,
    x='Attrition_Rate',
    y='JobRole',
    orientation='h',
    color='Attrition_Rate',
    color_continuous_scale=['green', 'yellow', 'red'],
    text='Attrition_Rate',
    title='Attrition Rate by Job Role',
)
# label each bar with its rate, rounded to one decimal place
fig_role.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig_role.update_layout(height=600, showlegend=False)
fig_role.show()

<br/>

### **Distance from Home**

In [54]:
# box plot of commute distance for leavers vs. stayers
distance_palette = {'Yes': colors['danger'], 'No': colors['success']}

fig_distance = px.box(
    employee_df,
    x='Attrition',
    y='DistanceFromHome',
    color='Attrition',
    color_discrete_map=distance_palette,
    title='Distance from Home by Attrition Status',
)
# the x axis already names the groups, so the legend would be redundant
fig_distance.update_layout(height=400, showlegend=False)
fig_distance.show()

<br/>

### **Correlation Map**

In [55]:
# work on a copy that carries a 0/1 version of Attrition, so it can enter the correlation matrix
employee_df_corr = employee_df.assign(
    Attrition_Binary=employee_df['Attrition'].eq('Yes').astype(int)
)

In [56]:
# numeric columns included in the correlation heatmap (the binary attrition flag comes first)
corr_features = [
    'Attrition_Binary',
    'Age',
    'MonthlyIncome',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'JobSatisfaction',
    'EnvironmentSatisfaction',
    'WorkLifeBalance',
    'DistanceFromHome',
    'NumCompaniesWorked',
    'TotalWorkingYears',
]

# pairwise correlations between the selected columns
correlation_matrix = employee_df_corr.loc[:, corr_features].corr()

# heatmap with each coefficient printed to two decimals
fig_corr = px.imshow(
    correlation_matrix,
    title='Feature Correlation Matrix',
    color_continuous_scale='RdBu_r',
    text_auto='.2f',
    aspect='auto',
)
fig_corr.update_layout(height=700)
fig_corr.show()

<br/>

### **Key Insights**

In [57]:
def _attrition_rate_pct(mask):
    """Return the percentage of the employees selected by ``mask`` whose Attrition is 'Yes'.

    Args:
        mask: boolean Series aligned with ``employee_df`` that selects the subgroup.

    Returns:
        The 'Yes' share of the subgroup in percent, or 0 when the subgroup contains
        no 'Yes' rows (which includes an empty subgroup).
    """
    shares = employee_df.loc[mask, 'Attrition'].value_counts(normalize=True)
    # .get() rather than ['Yes'] so a subgroup with no leavers yields 0 instead of a KeyError
    return shares.get('Yes', 0) * 100


# Attrition rate (%) within each candidate high-risk subgroup.
# NOTE(review): the commute label says km, but the unit of DistanceFromHome is not
# shown anywhere in this notebook; confirm against the dataset documentation.
risk_factors = {
    'Overtime Workers': _attrition_rate_pct(employee_df['OverTime'] == 'Yes'),
    'Low Job Satisfaction': _attrition_rate_pct(employee_df['JobSatisfaction'] == 1),
    'Long Commute (>20km)': _attrition_rate_pct(employee_df['DistanceFromHome'] > 20),
    'New Employees (<2 years)': _attrition_rate_pct(employee_df['YearsAtCompany'] < 2),
}

In [58]:
# list the subgroups from highest to lowest attrition rate
ranked_factors = sorted(risk_factors.items(), key=lambda item: item[1], reverse=True)

print("\nHigh Risk Groups (Attrition Rate):")
for factor, rate in ranked_factors:
    print(f"  {factor}: {rate:.1f}%")


High Risk Groups (Attrition Rate):
  New Employees (<2 years): 34.9%
  Overtime Workers: 30.5%
  Low Job Satisfaction: 22.8%
  Long Commute (>20km): 22.1%


In [59]:
def _risk_color(rate_pct):
    """Pick the bar colour for an attrition rate (%): above 20 danger, above 15 warning, else info."""
    if rate_pct > 20:
        return colors['danger']
    if rate_pct > 15:
        return colors['warning']
    return colors['info']


# horizontal bar per risk group, coloured by severity and labelled with its rate
risk_labels = list(risk_factors.keys())
risk_rates = list(risk_factors.values())

fig_risk = go.Figure(go.Bar(
    x=risk_rates,
    y=risk_labels,
    orientation='h',
    marker_color=[_risk_color(rate_pct) for rate_pct in risk_rates],
    text=[f'{rate_pct:.1f}%' for rate_pct in risk_rates],
    textposition='outside',
))

fig_risk.update_layout(
    title='High Risk Groups - Attrition Rates',
    xaxis_title='Attrition Rate (%)',
    height=400,
)
fig_risk.show()