
# Employee Retention & Performance Analysis
This notebook explores and analyzes synthetic HR data.
We clean the data, engineer tenure features, visualize trends, and encode categorical fields so the result is ready for machine-learning models.


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

# Show every column when a frame is displayed (None removes the column cap).
pd.options.display.max_columns = None
# Apply seaborn's "whitegrid" look to all plots drawn after this point.
sns.set(style='whitegrid')


In [None]:

# Read the raw employee records from the CSV next to this notebook.
df = pd.read_csv('employees.csv')

# Report (rows, columns) so later filtering steps can be compared against it.
n_rows, n_cols = df.shape
print("Initial dataset shape:", (n_rows, n_cols))

# Preview the first few records.
df.head()


In [None]:

# Structural overview: column names, non-null counts and dtypes (printed).
df.info()

# Summary statistics for every column; include="all" also covers the
# non-numeric columns. As the last expression, this is the cell's output.
df.describe(include="all")


In [None]:

# Treat blank cells as missing. The pattern matches the empty string AND
# whitespace-only strings ("  ", "\t"), which would otherwise survive as
# real values and be skipped by every later fillna().
# With regex=True only string values are tested, so numeric columns are untouched.
df = df.replace(r'^\s*$', np.nan, regex=True)

# Missing-value count per column after normalization.
df.isnull().sum()


In [None]:

# Coerce each numeric column (unparseable entries become NaN), then
# impute the gaps with that column's own median.
for numeric_col in ('Age', 'Salary'):
    parsed = pd.to_numeric(df[numeric_col], errors='coerce')
    df[numeric_col] = parsed.fillna(parsed.median())


In [None]:

# Canonicalize department names: strip surrounding whitespace before
# title-casing so that " sales" and "Sales" collapse into one category.
department = df['Department'].str.strip().str.title()

# A value that was only whitespace is now '', which fillna() would not catch;
# mask it to NaN first so it is labelled 'Unknown' like other missing values.
department = department.mask(department.eq(''))
df['Department'] = department.fillna('Unknown')

# Unrated employees get an explicit placeholder category.
df['Performance_Score'] = df['Performance_Score'].fillna('Not Rated')


In [None]:

# Parse both date columns as day-first dates; unparseable values become NaT.
df['Join_Date'] = pd.to_datetime(df['Join_Date'], errors='coerce', dayfirst=True)
df['Leave_Date'] = pd.to_datetime(df['Leave_Date'], errors='coerce', dayfirst=True)

# Rows without a leave date are measured up to "today". The timestamp is
# normalized to midnight: a raw Timestamp('today') carries the current
# time of day, which would put a different sub-second value into Leave_Date
# on every run and force the whole column to be exported with time components.
snapshot_date = pd.Timestamp('today').normalize()
df['Leave_Date'] = df['Leave_Date'].fillna(snapshot_date)

# Tenure in whole days and in (365-day) years. Rows whose Join_Date could not
# be parsed end up with NaN tenure.
df['Tenure_Days'] = (df['Leave_Date'] - df['Join_Date']).dt.days
df['Tenure_Years'] = df['Tenure_Days'] / 365.0


In [None]:

def _has_email_shape(value):
    """Return True when `value` is a string containing both '@' and '.'."""
    if not isinstance(value, str):
        return False
    return '@' in value and '.' in value


# Keep only rows whose Email passes the shape check. The explicit copy gives
# an independent frame, so later column assignments do not write into a slice.
email_ok = df['Email'].apply(_has_email_shape)
df = df[email_ok].copy()


In [None]:

# One histogram with a KDE overlay per numeric column, each on its own figure.
# A colour of None leaves seaborn's default in place.
histogram_specs = (
    ('Age', None, 'Age Distribution'),
    ('Salary', 'orange', 'Salary Distribution'),
)
for column, bar_color, title in histogram_specs:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(df[column], bins=30, kde=True, color=bar_color, ax=ax)
    ax.set_title(title)
    plt.show()


In [None]:

# Box plot of salary per department; labels are rotated so long department
# names do not overlap.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Department', y='Salary', ax=ax)
ax.set_title('Salary Distribution by Department')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:

# Bar plot of Tenure_Years per department (seaborn aggregates each group with
# its default estimator and draws error bars).
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='Department', y='Tenure_Years', ax=ax)
ax.set_title('Average Tenure by Department')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:

# Calendar year of hire. If any Join_Date is NaT the column is float (NaN
# cannot live in an integer column), so the stored values look like 2019.0.
df['Join_Year'] = df['Join_Date'].dt.year

# Count hires per year for the chart. Rows without a known join year are
# dropped (value_counts would ignore them anyway) and the years are cast to
# int so the bar labels read "2019" rather than "2019.0".
hires_per_year = (
    df['Join_Year']
    .dropna()
    .astype(int)
    .value_counts()
    .sort_index()
)

plt.figure(figsize=(10,5))
hires_per_year.plot(kind='bar')
plt.title('Hiring Trends by Year')
plt.xlabel('Join Year')
plt.ylabel('Number of Hires')
plt.show()


In [None]:

# Pairwise correlations between the numeric features, drawn as an
# annotated heatmap.
numeric_features = ['Age', 'Salary', 'Tenure_Years']
correlations = df[numeric_features].corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:

# One encoder per categorical column, kept under its own name so the integer
# codes can be mapped back to labels later.
le_dept, le_perf = LabelEncoder(), LabelEncoder()

encoding_plan = (
    ('Department', 'Department_Code', le_dept),
    ('Performance_Score', 'Performance_Code', le_perf),
)
for source_col, code_col, encoder in encoding_plan:
    df[code_col] = encoder.fit_transform(df[source_col])

# Preview the numeric and encoded feature columns.
feature_columns = ['Age', 'Salary', 'Tenure_Years', 'Department_Code', 'Performance_Code']
df[feature_columns].head()


In [None]:

# Persist the cleaned frame without the row index, so the file holds only
# the data columns.
output_path = 'cleaned_employees.csv'
df.to_csv(output_path, index=False)
print("Cleaned data saved to cleaned_employees.csv")
