# Employee Attrition Prediction - Exploratory Data Analysis
---
**Objective:** Analyze HR employee data to understand factors affecting employee attrition

---

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Settings
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("✅ Libraries imported successfully!")

## 2. Load Dataset

In [None]:
# Read the HR employee CSV from a relative path into the working DataFrame
df = pd.read_csv('../data/hr_employee_data.csv')

# Report load status and dimensions with a single print call (one item per line)
print(
    "Dataset loaded successfully!",
    f"Shape: {df.shape}",
    "\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell: rendered as the preview table
df.head()

## 3. Dataset Overview

In [None]:
# Heading and a 60-character rule, followed by the DataFrame.info() report
print("Dataset Information:", "=" * 60, sep="\n")
df.info()

In [None]:
# Statistical summary
# describe() is called with its default arguments; as the last expression in the
# cell, its result is what the notebook renders as this cell's output.
df.describe()

In [None]:
# Per-column count of null entries
missing_values = df.isna().sum()

print("Missing Values:", "=" * 60, sep="\n")
# Counts are non-negative, so "any non-zero count" is the same as "total is non-zero"
if missing_values.any():
    # List only the columns that actually contain nulls
    print(missing_values[missing_values > 0])
else:
    print("✅ No missing values found!")

## 4. Target Variable Analysis

In [None]:
# Attrition distribution: print the label counts and overall rate, then draw a
# count plot (left panel) and a pie chart (right panel) of the target variable.
print("Attrition Distribution:")
print("="*60)
attrition_counts = df['Attrition'].value_counts()
print(attrition_counts)
# Mean of the boolean mask equals count('Yes') / len(df), and unlike
# attrition_counts['Yes'] it does not raise KeyError when no row is labelled 'Yes'.
attrition_rate = (df['Attrition'] == 'Yes').mean() * 100
print(f"\nAttrition Rate: {attrition_rate:.2f}%")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
sns.countplot(data=df, x='Attrition', ax=axes[0], palette='Set2')
axes[0].set_title('Attrition Count Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Attrition', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
for container in axes[0].containers:
    axes[0].bar_label(container)

# Pie chart
# value_counts() orders labels by frequency, so colours are looked up by label
# rather than by position; this keeps 'No' green and 'Yes' pink (the same
# convention as the percentage bar charts) whichever class is the majority.
label_colors = {'No': '#90EE90', 'Yes': '#FFB6C1'}
colors = [label_colors.get(label, '#D3D3D3') for label in attrition_counts.index]
axes[1].pie(attrition_counts, labels=attrition_counts.index, autopct='%1.1f%%',
            startangle=90, colors=colors, textprops={'fontsize': 12})
axes[1].set_title('Attrition Percentage Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Attrition Analysis by Key Factors

### 5.1 Job Satisfaction vs Attrition

In [None]:
# Job satisfaction vs attrition: raw counts (left panel) and the percentage
# split within each satisfaction level (right panel).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
count_ax, rate_ax = axes
title_style = {'fontsize': 14, 'fontweight': 'bold'}
label_style = {'fontsize': 12}

# Left panel: employees per satisfaction level, split by attrition
sns.countplot(data=df, x='JobSatisfaction', hue='Attrition', ax=count_ax, palette='Set1')
count_ax.set_title('Job Satisfaction vs Attrition', **title_style)
count_ax.set_xlabel('Job Satisfaction Level (1=Low, 4=High)', **label_style)
count_ax.set_ylabel('Count', **label_style)
count_ax.legend(title='Attrition')

# Right panel: row-normalised crosstab scaled to percent
job_sat_attrition = pd.crosstab(
    df['JobSatisfaction'], df['Attrition'], normalize='index'
).mul(100)
job_sat_attrition.plot.bar(ax=rate_ax, color=['#90EE90', '#FFB6C1'])
rate_ax.set_title('Attrition Rate by Job Satisfaction', **title_style)
rate_ax.set_xlabel('Job Satisfaction Level', **label_style)
rate_ax.set_ylabel('Percentage (%)', **label_style)
rate_ax.legend(title='Attrition')
# Keep the category tick labels horizontal
rate_ax.set_xticklabels(rate_ax.get_xticklabels(), rotation=0)

fig.tight_layout()
plt.show()

# Percentage of leavers per satisfaction level, highest first
yes_rate_ranked = job_sat_attrition['Yes'].sort_values(ascending=False)
print("\nAttrition Rate by Job Satisfaction:")
print(yes_rate_ranked)

### 5.2 Monthly Income vs Attrition

In [None]:
# Monthly Income analysis: box plot per attrition group (left panel), overlaid
# histograms of the two groups (right panel), then per-group summary statistics.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Box plot
sns.boxplot(data=df, x='Attrition', y='MonthlyIncome', ax=axes[0], palette='Set2')
axes[0].set_title('Monthly Income vs Attrition', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Attrition', fontsize=12)
axes[0].set_ylabel('Monthly Income ($)', fontsize=12)

# Histogram
# Both groups are drawn on the SAME bin edges, computed once over the whole
# column. Passing bins=30 to each call separately would give every group its own
# range and bin width, making the overlaid bar heights incomparable.
income = df['MonthlyIncome']
income_bins = np.histogram_bin_edges(income.dropna(), bins=30)
income[df['Attrition'] == 'Yes'].hist(bins=income_bins, alpha=0.5, label='Left', ax=axes[1], color='red')
income[df['Attrition'] == 'No'].hist(bins=income_bins, alpha=0.5, label='Stayed', ax=axes[1], color='green')
axes[1].set_title('Monthly Income Distribution by Attrition', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Monthly Income ($)', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].legend()

plt.tight_layout()
plt.show()

# Statistics
print("\nMonthly Income Statistics by Attrition:")
print("="*60)
print(df.groupby('Attrition')['MonthlyIncome'].describe())

### 5.3 Years at Company vs Attrition

In [None]:
# Years at Company analysis: box plot per attrition group (left panel), overlaid
# histograms of the two groups (right panel), then per-group summary statistics.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Box plot
sns.boxplot(data=df, x='Attrition', y='YearsAtCompany', ax=axes[0], palette='coolwarm')
axes[0].set_title('Years at Company vs Attrition', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Attrition', fontsize=12)
axes[0].set_ylabel('Years at Company', fontsize=12)

# Distribution plot
# Both groups are drawn on the SAME bin edges, computed once over the whole
# column. Passing bins=20 to each call separately would give every group its own
# range and bin width, making the overlaid bar heights incomparable.
tenure = df['YearsAtCompany']
tenure_bins = np.histogram_bin_edges(tenure.dropna(), bins=20)
tenure[df['Attrition'] == 'Yes'].hist(bins=tenure_bins, alpha=0.5, label='Left', ax=axes[1], color='red')
tenure[df['Attrition'] == 'No'].hist(bins=tenure_bins, alpha=0.5, label='Stayed', ax=axes[1], color='green')
axes[1].set_title('Years at Company Distribution', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Years at Company', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].legend()

plt.tight_layout()
plt.show()

# Statistics
print("\nYears at Company Statistics by Attrition:")
print("="*60)
print(df.groupby('Attrition')['YearsAtCompany'].describe())

### 5.4 Overtime vs Attrition

In [None]:
# Overtime vs attrition: raw counts (left panel) and the percentage split
# within each overtime group (right panel).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
count_ax, rate_ax = axes
title_style = {'fontsize': 14, 'fontweight': 'bold'}
label_style = {'fontsize': 12}

# Left panel: employees per overtime group, split by attrition
sns.countplot(data=df, x='OverTime', hue='Attrition', ax=count_ax, palette='Set3')
count_ax.set_title('Overtime vs Attrition', **title_style)
count_ax.set_xlabel('Overtime', **label_style)
count_ax.set_ylabel('Count', **label_style)
count_ax.legend(title='Attrition')

# Right panel: row-normalised crosstab scaled to percent
overtime_attrition = pd.crosstab(
    df['OverTime'], df['Attrition'], normalize='index'
).mul(100)
overtime_attrition.plot.bar(ax=rate_ax, color=['#90EE90', '#FFB6C1'])
rate_ax.set_title('Attrition Rate by Overtime', **title_style)
rate_ax.set_xlabel('Overtime', **label_style)
rate_ax.set_ylabel('Percentage (%)', **label_style)
rate_ax.legend(title='Attrition')
# Keep the category tick labels horizontal
rate_ax.set_xticklabels(rate_ax.get_xticklabels(), rotation=0)

fig.tight_layout()
plt.show()

# Percentage of leavers in each overtime group
overtime_yes_rate = overtime_attrition['Yes']
print("\nAttrition Rate by Overtime:")
print(overtime_yes_rate)

## 6. Additional Insights

### 6.1 Correlation Heatmap

In [None]:
# Correlation heatmap of the numeric features plus the binary-encoded target.

# Select numerical columns for correlation. 'number' matches every numeric dtype,
# not only int64/float64, so e.g. int32 or float32 columns are not silently dropped.
numerical_cols = df.select_dtypes(include='number').columns.tolist()

# Create a copy and encode Attrition (1 for 'Yes', 0 otherwise) so the heatmap
# shows how each feature correlates with the target. The guard leaves an
# already-numeric Attrition column untouched.
df_corr = df[numerical_cols].copy()
if 'Attrition' not in df_corr.columns:
    df_corr['Attrition'] = (df['Attrition'] == 'Yes').astype(int)

# Calculate correlation matrix
correlation_matrix = df_corr.corr()

# Plot heatmap
plt.figure(figsize=(16, 12))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0,
            linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Correlation Heatmap of Numerical Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

### 6.2 Age vs Attrition

In [None]:
# Age analysis: box plot per attrition group (left panel) and overlaid
# histograms of the two groups (right panel).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Box plot
sns.boxplot(data=df, x='Attrition', y='Age', ax=axes[0], palette='pastel')
axes[0].set_title('Age vs Attrition', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Attrition', fontsize=12)
axes[0].set_ylabel('Age', fontsize=12)

# Histogram
# Both groups are drawn on the SAME bin edges, computed once over the whole
# column. Passing bins=20 to each call separately would give every group its own
# range and bin width, making the overlaid bar heights incomparable.
age = df['Age']
age_bins = np.histogram_bin_edges(age.dropna(), bins=20)
age[df['Attrition'] == 'Yes'].hist(bins=age_bins, alpha=0.5, label='Left', ax=axes[1], color='red')
age[df['Attrition'] == 'No'].hist(bins=age_bins, alpha=0.5, label='Stayed', ax=axes[1], color='green')
axes[1].set_title('Age Distribution by Attrition', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Age', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].legend()

plt.tight_layout()
plt.show()

### 6.3 Department vs Attrition

In [None]:
# Department vs attrition: raw counts (left panel) and the percentage split
# within each department (right panel).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
count_ax, rate_ax = axes
title_style = {'fontsize': 14, 'fontweight': 'bold'}
label_style = {'fontsize': 12}

# Left panel: employees per department, split by attrition
sns.countplot(data=df, x='Department', hue='Attrition', ax=count_ax, palette='muted')
count_ax.set_title('Department vs Attrition', **title_style)
count_ax.set_xlabel('Department', **label_style)
count_ax.set_ylabel('Count', **label_style)
count_ax.legend(title='Attrition')
# Slight tilt for the department tick labels
count_ax.tick_params(axis='x', rotation=15)

# Right panel: row-normalised crosstab scaled to percent
dept_attrition = pd.crosstab(
    df['Department'], df['Attrition'], normalize='index'
).mul(100)
dept_attrition.plot.bar(ax=rate_ax, color=['#90EE90', '#FFB6C1'])
rate_ax.set_title('Attrition Rate by Department', **title_style)
rate_ax.set_xlabel('Department', **label_style)
rate_ax.set_ylabel('Percentage (%)', **label_style)
rate_ax.legend(title='Attrition')
rate_ax.set_xticklabels(rate_ax.get_xticklabels(), rotation=15)

fig.tight_layout()
plt.show()

# Percentage of leavers per department, highest first
dept_yes_ranked = dept_attrition['Yes'].sort_values(ascending=False)
print("\nAttrition Rate by Department:")
print(dept_yes_ranked)

## 7. Key Findings Summary

In [None]:
# Key findings summary: recompute the headline numbers from df and print them.
print("="*60)
print("KEY FINDINGS FROM EXPLORATORY DATA ANALYSIS")
print("="*60)

# Boolean row masks reused by every section below (avoids chained indexing and
# repeating the same comparison for each statistic).
left_mask = df['Attrition'] == 'Yes'
stayed_mask = df['Attrition'] == 'No'

print("\n1. ATTRITION OVERVIEW:")
# Mean of a boolean mask is the fraction of True rows, i.e. count / len(df)
print(f"   - Overall attrition rate: {left_mask.mean() * 100:.2f}%")
print(f"   - Employees who left: {left_mask.sum()}")
print(f"   - Employees who stayed: {stayed_mask.sum()}")

print("\n2. JOB SATISFACTION:")
satisfaction_attrition = df.loc[left_mask, 'JobSatisfaction'].mean()
satisfaction_stayed = df.loc[stayed_mask, 'JobSatisfaction'].mean()
# Print the headline only when the data supports it, and show both group means
# so the reader can check the comparison.
if satisfaction_attrition < satisfaction_stayed:
    print("   - Lower job satisfaction correlates with higher attrition")
else:
    print("   - Employees who left did not report lower average job satisfaction")
print(f"   - Average satisfaction (left): {satisfaction_attrition:.2f}")
print(f"   - Average satisfaction (stayed): {satisfaction_stayed:.2f}")

print("\n3. MONTHLY INCOME:")
income_left = df.loc[left_mask, 'MonthlyIncome'].mean()
income_stayed = df.loc[stayed_mask, 'MonthlyIncome'].mean()
print(f"   - Average income (left): ${income_left:.2f}")
print(f"   - Average income (stayed): ${income_stayed:.2f}")
print(f"   - Difference: ${income_stayed - income_left:.2f}")

print("\n4. YEARS AT COMPANY:")
years_left = df.loc[left_mask, 'YearsAtCompany'].mean()
years_stayed = df.loc[stayed_mask, 'YearsAtCompany'].mean()
print(f"   - Average tenure (left): {years_left:.2f} years")
print(f"   - Average tenure (stayed): {years_stayed:.2f} years")

print("\n5. OVERTIME:")
# Share of leavers within each overtime group. Taking the mean of the mask
# yields NaN for an empty group instead of raising ZeroDivisionError.
overtime_yes_attrition = left_mask[df['OverTime'] == 'Yes'].mean() * 100
overtime_no_attrition = left_mask[df['OverTime'] == 'No'].mean() * 100
print(f"   - Attrition rate with overtime: {overtime_yes_attrition:.2f}%")
print(f"   - Attrition rate without overtime: {overtime_no_attrition:.2f}%")

print("\n" + "="*60)
print("✅ EDA COMPLETED!")
print("="*60)

---
## End of Exploratory Data Analysis
**Next Steps:** Data Preprocessing and Model Training

---