
# Employee Retention & Performance Analysis
This notebook explores and analyzes a synthetic HR dataset.
We clean the data with reusable helper scripts, engineer tenure features, visualize trends, and save a cleaned dataset that is ready for machine learning.


In [None]:

import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append('../scripts')  # Adjust to your structure

from cleaning_utils import clean_numeric_column, standardize_text_column, filter_valid_emails, engineer_tenure

# Notebook-wide display settings.
# Show every DataFrame column instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# set_theme() is the preferred interface; sns.set() is only an alias for it
# and may be removed in a future seaborn release.
sns.set_theme(style="whitegrid")


In [None]:

# Read the raw employee records and take a first look at them.
raw_csv = '../data/employees.csv'
df = pd.read_csv(raw_csv)
print("Initial dataset shape:", df.shape)
# Keep this as the cell's last expression so the notebook shows the preview.
df.head()


In [None]:

# Run the cleaning helpers imported from cleaning_utils one after another.
# Each call's result is reassigned to df, so the order below matters.

# Numeric columns.
for numeric_col in ('Age', 'Salary'):
    df = clean_numeric_column(df, numeric_col)

# Text columns; Performance_Score passes 'Not Rated' as its fill value.
df = standardize_text_column(df, 'Department')
df = standardize_text_column(df, 'Performance_Score', fillna_value='Not Rated')

# Email filtering, then tenure features (Tenure_Years is used by later plots).
df = filter_valid_emails(df)
df = engineer_tenure(df)

print("Cleaned dataset shape:", df.shape)
# Keep this as the cell's last expression so the notebook shows the preview.
df.head()


In [None]:

# Exploratory plots. Each figure is saved as a PNG under ../outputs/ and then
# displayed.


def _save_and_show(title, filename, output_dir='../outputs'):
    """Title the current figure, save it into *output_dir*, and display it.

    Args:
        title: Text placed above the current axes.
        filename: File name (e.g. ``'age_distribution.png'``) inside
            *output_dir*.
        output_dir: Folder that receives the image; created if it is missing.
    """
    target_dir = Path(output_dir)
    # savefig() fails on a missing folder, so create it up front.
    target_dir.mkdir(parents=True, exist_ok=True)
    plt.title(title)
    # bbox_inches='tight' fits the saved image around everything drawn, so
    # rotated tick labels are not cut off at the figure edge.
    plt.savefig(target_dir / filename, bbox_inches='tight')
    plt.show()


# Age histogram with a KDE overlay.
plt.figure(figsize=(10,5))
sns.histplot(df['Age'], bins=30, kde=True)
_save_and_show('Age Distribution', 'age_distribution.png')

# Salary histogram with a KDE overlay.
plt.figure(figsize=(10,5))
sns.histplot(df['Salary'], bins=30, kde=True, color='orange')
_save_and_show('Salary Distribution', 'salary_distribution.png')

# Salary spread per department.
plt.figure(figsize=(10,6))
sns.boxplot(x='Department', y='Salary', data=df)
plt.xticks(rotation=45)
_save_and_show('Salary Distribution by Department', 'salary_by_department.png')

# Tenure per department (barplot aggregates Tenure_Years within each group).
plt.figure(figsize=(10,6))
sns.barplot(x='Department', y='Tenure_Years', data=df)
plt.xticks(rotation=45)
_save_and_show('Average Tenure by Department', 'tenure_by_department.png')

# Hires per calendar year. The CSV is read without parse_dates, so convert
# Join_Date explicitly instead of relying on an earlier step having done it;
# the conversion leaves an already-datetime column unchanged. Unparseable
# dates become NaT and are left out of the chart.
# NOTE: Join_Year is stored on df, so it is also part of anything saved later.
df['Join_Year'] = pd.to_datetime(df['Join_Date'], errors='coerce').dt.year
# Drop missing years and cast to int so the bar labels read 2019, not 2019.0.
hires_per_year = df['Join_Year'].dropna().astype(int).value_counts().sort_index()
plt.figure(figsize=(10,5))
hires_per_year.plot(kind='bar')
plt.ylabel('Number of Hires')
_save_and_show('Hiring Trends by Year', 'hiring_trends.png')

# Pairwise correlations between the three numeric features.
plt.figure(figsize=(6,4))
numeric_corr = df[['Age', 'Salary', 'Tenure_Years']].corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
_save_and_show('Correlation Heatmap', 'correlation_heatmap.png')


In [None]:

# Persist the cleaned DataFrame as CSV, without the index column.
# If the plotting cell has run, df also carries the derived Join_Year column.
cleaned_csv_path = '../outputs/cleaned_employees.csv'
# to_csv() does not create missing folders, so make sure the target exists.
Path(cleaned_csv_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(cleaned_csv_path, index=False)
print(f"Cleaned data saved to {cleaned_csv_path}")
