# Employee Salary Data Analysis

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load the raw employee records from disk and preview the first rows.
raw_csv = 'employee_salary_data.csv'
df = pd.read_csv(raw_csv)
df.head()

## Data Cleaning

In [None]:
# Remove exact duplicate rows, then rows containing any missing value,
# and finally renumber the index from 0 so it is contiguous again.
df = df.drop_duplicates().dropna().reset_index(drop=True)

## Convert Categorical Variables

In [None]:
# Label columns that should use the pandas 'category' dtype. Columns that
# are absent from the loaded data are skipped rather than raising.
categorical_cols = ['Job_Title', 'Education_Level', 'Department', 'Gender']
present_cols = [name for name in categorical_cols if name in df.columns]
df = df.astype({name: 'category' for name in present_cols})

# One-hot encode the non-numeric columns; drop_first=True omits the first
# level of each encoded column.
df_encoded = pd.get_dummies(df, drop_first=True)

## Outlier Removal in Monthly Salary (IQR Method)

In [None]:
# Drop rows whose Monthly_Salary lies outside the 1.5 * IQR fences:
# anything below Q1 - 1.5*IQR or above Q3 + 1.5*IQR is discarded.
Q1 = df['Monthly_Salary'].quantile(0.25)
Q3 = df['Monthly_Salary'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# Both bounds are inclusive. The explicit .copy() makes the filtered frame an
# independent object, so later column assignments on df do not raise
# SettingWithCopyWarning.
within_fences = (df['Monthly_Salary'] >= lower) & (df['Monthly_Salary'] <= upper)
df = df[within_fences].copy()

## Education Level vs Salary

In [None]:
# Plot the mean monthly salary for each education level, ordered from
# lowest to highest qualification.
edu_order = ['Diploma', 'Bachelor', 'Master', 'PhD']

# Recast as an ordered categorical so the x-axis follows edu_order instead of
# alphabetical order. Values not listed in edu_order become NaN.
df['Education_Level'] = pd.Categorical(df['Education_Level'], categories=edu_order, ordered=True)

# observed=False is passed explicitly: it keeps every level of edu_order in
# the result (NaN mean when a level has no rows) regardless of the pandas
# version's default, and avoids the FutureWarning about that default.
dataset_avg = (
    df.groupby('Education_Level', observed=False)['Monthly_Salary']
    .mean()
    .reset_index()
)

plt.figure(figsize=(8,5))
sns.lineplot(x='Education_Level', y='Monthly_Salary', data=dataset_avg, marker='o')
plt.title('Average Monthly Salary by Education Level')
plt.xlabel('Education Level')
plt.ylabel('Average Monthly Salary')
# grid(True) turns the grid on; a bare grid() call only toggles the current state.
plt.grid(True)
plt.show()

## Work Experience vs Salary

In [None]:
# Scatter plot of monthly salary against years of work experience,
# drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df['Work_Experience'], df['Monthly_Salary'], color='teal')
ax.set_title('Work Experience vs Monthly Salary')
ax.set_xlabel('Work Experience (Years)')
ax.set_ylabel('Monthly Salary')
ax.grid(True)
plt.show()

## Salary Comparison by Gender

In [None]:
# Bar chart of the mean monthly salary for each gender category.
# observed=False is passed explicitly because Gender is category-dtype:
# it keeps the result identical across pandas versions and avoids the
# FutureWarning about the changing default.
gender_salary = df.groupby('Gender', observed=False)['Monthly_Salary'].mean()
gender_salary.plot(kind='bar', color=['lightblue', 'pink', 'gray'])
plt.title('Average Monthly Salary by Gender')
plt.ylabel('Average Salary')
plt.show()

## Average Remote Work Percentage by Department

In [None]:
# Bar chart of the mean remote-work percentage per department, sorted so the
# department with the highest average comes first.
# observed=False is passed explicitly because Department is category-dtype:
# it keeps the result identical across pandas versions and avoids the
# FutureWarning about the changing default.
remote_avg = (
    df.groupby('Department', observed=False)['Remote_Work_Percentage']
    .mean()
    .sort_values(ascending=False)
)
remote_avg.plot(kind='bar', color='skyblue')
plt.title('Average Remote Work Percentage per Department')
plt.ylabel('Average Remote Work (%)')
plt.show()

## ANOVA Tests

In [None]:
# Fit three OLS models of Monthly_Salary and print a Type II ANOVA table for each.
#
# anova_lm selects the sum-of-squares type through the `typ` keyword.
# The misspelt `type=2` is silently ignored, so the tables fall back to the
# default Type I (sequential) sums of squares.

# Effect of work experience alone.
anova1 = ols('Monthly_Salary ~ Work_Experience', data=df).fit()
anova1_table = sm.stats.anova_lm(anova1, typ=2)
print(anova1_table)

# Effect of education level alone.
anova2 = ols('Monthly_Salary ~ Education_Level', data=df).fit()
anova2_table = sm.stats.anova_lm(anova2, typ=2)
print(anova2_table)

# Both main effects plus their interaction ('*' expands to a + b + a:b).
anova3 = ols('Monthly_Salary ~ Education_Level * Work_Experience', data=df).fit()
anova3_table = sm.stats.anova_lm(anova3, typ=2)
print(anova3_table)

## Box Plot of Salary by Job Title

In [None]:
# Box plot of the monthly salary distribution for each job title, drawn on an
# explicitly created Axes. Tick labels are rotated so long titles stay legible.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Job_Title', y='Monthly_Salary', ax=ax)
ax.set_title('Salary Distribution Across Job Titles')
plt.xticks(rotation=45)
plt.show()

## Heatmap of Correlations

In [None]:
# Heatmap of pairwise correlations between the one-hot encoded columns.
#
# df_encoded was built before the salary outliers were removed from df.
# Selecting by df.index restricts it to the rows still present in df, so the
# heatmap describes the same records as the other analyses and the saved file.
# This relies on df keeping the row labels it had when df_encoded was created.
corr = df_encoded.loc[df.index].corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr, cmap='coolwarm', annot=False)
plt.title('Correlation Heatmap')
plt.show()

## Save Processed Dataset

In [None]:
# Write the processed frame to disk; index=False leaves the row index out.
cleaned_csv = 'employee_salary_cleaned.csv'
df.to_csv(cleaned_csv, index=False)