In [None]:
%pip install pandas
%pip install numpy
%pip install matplotlib
%pip install seaborn
%pip install scikit-learn
%pip install skimpy


# Import Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skimpy import skim


# Get Data


In [None]:
# Relative path of the raw salary dataset read by this notebook.
SALARY_CSV_PATH = "../datasets/salary.csv"
df_salary = pd.read_csv(SALARY_CSV_PATH)

# Clean Data


In [None]:
# Overview of the freshly loaded frame using skimpy's `skim`, run before any
# cleaning step so the raw state of each column is visible.
skim(df_salary)

In [None]:
# Count missing values per column *before* dropping anything. The result is kept
# in a variable and shown as the cell's last expression; a bare expression in the
# middle of a cell is evaluated but never displayed.
missing_before_drop = df_salary.isnull().sum()

# Remove every row that has at least one missing value. Rebinding the name
# (instead of inplace=True) keeps the step explicit and safe to re-run.
df_salary = df_salary.dropna()

missing_before_drop

In [None]:
# Re-count missing values per column to confirm the cleaning step removed them.
remaining_nulls = df_salary.isnull().sum()
remaining_nulls

In [None]:
# Print the column dtypes and non-null counts of the frame at this point in the notebook.
df_salary.info()

In [None]:
# Seed NumPy's global RNG so the random preview below shows the same rows on every run.
np.random.seed(43)
preview_rows = df_salary.sample(n=10)
preview_rows

In [None]:
# Drop the name and date columns that the analysis below does not use, then
# normalise the remaining column names to lower case.
#
# errors="ignore" makes the cell safe to re-run: on a second run the columns are
# already gone (and the names already lower-cased), which would otherwise raise
# a KeyError. Note that it also stays silent if the CSV never had these columns.
# `columns=` already selects the axis, so the redundant `axis=1` is omitted.
df_salary = df_salary.drop(
    columns=["FIRST NAME", "LAST NAME", "DOJ", "CURRENT DATE"],
    errors="ignore",
)
df_salary.columns = df_salary.columns.str.lower()
df_salary.columns

In [None]:
# Change the dtype of the age column to a 64-bit integer.
age_as_int = df_salary['age'].astype('int64')
df_salary['age'] = age_as_int

In [None]:
# Number of rows per unit.
unit_counts = df_salary['unit'].value_counts()
unit_counts

In [None]:
# Number of rows per designation.
designation_counts = df_salary["designation"].value_counts()
designation_counts

In [None]:
# Number of rows per value of the sex column.
sex_counts = df_salary["sex"].value_counts()
sex_counts

In [None]:
# Salary summary statistics (mean, median, std, min, max) for each unit.
unit_salary = df_salary.groupby('unit')['salary']
df_unit_by_salary = unit_salary.agg(['mean', 'median', 'std', 'min', 'max'])

# One bar chart per statistic, titled with the statistic's name.
for stat_name, stat_values in df_unit_by_salary.items():
    sns.barplot(x=df_unit_by_salary.index, y=stat_values)
    plt.title(stat_name)
    plt.show()

In [None]:


# Salary summary statistics for each designation, with 'designation' kept as a
# regular column (reset_index) so it can be used as the x variable below.
salary_stats = ['mean', 'median', 'std', 'min', 'max']
df_designation_by_salary = (
    df_salary.groupby('designation')['salary']
    .agg(salary_stats)
    .reset_index()
)

# One panel per statistic; 'max' spans the whole bottom row of the mosaic.
fig, ax = plt.subplot_mosaic([['mean', 'median'], ['std', 'min'], ['max', 'max']], sharex=True, sharey=False, figsize=(20, 20))
for stat in salary_stats:
    sns.barplot(x='designation', y=stat, data=df_designation_by_salary, ax=ax[stat])
    ax[stat].set_title(f'{stat} salary by designation')
    # Rotate the tick labels on each axes explicitly rather than relying on
    # pyplot's implicit "current axes", which only affects a single panel.
    ax[stat].tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Number of rows for each distinct value of past experience, as a two-column frame.
past_exp_counts = df_salary['past exp'].value_counts()
df_salary_by_exp = past_exp_counts.reset_index()

df_salary_by_exp

In [None]:
# Keep the first 10 rows of every past-experience group, reduced to the two
# plotted columns and ordered from the highest past experience to the lowest.
df_exp_10 = (
    df_salary.groupby('past exp')
    .head(10)[['past exp', 'salary']]
    .sort_values('past exp', ascending=False)
    .reset_index(drop=True)
)

# Colour each experience value separately, in order of first appearance.
experience_levels = df_exp_10['past exp'].unique()
ax = sns.scatterplot(
    data=df_exp_10,
    x='past exp',
    y='salary',
    hue='past exp',
    hue_order=experience_levels,
    palette='dark',
)
# Place the legend outside the plotting area, to the right of the axes.
sns.move_legend(ax, loc='upper left', bbox_to_anchor=(1, 1), title='Past Experience')
ax.set_title('Salary vs Past Experience')
ax.set_xlabel('Past Experience')
ax.set_ylabel('Salary')
plt.show()

In [None]:
import math

def mean(x):
    """Return the arithmetic mean of ``x`` rounded to two decimal places.

    Used as a pivot-table aggregation function; the function name is what
    appears in the resulting column labels.
    """
    average = np.mean(x)
    return round(average, 2)

# Rounded mean salary for every (unit, designation) pair. Combinations with no
# rows are filled with 0, and margins=True adds the "All" totals row/column.
def_salary_mean_unit_designation = df_salary.pivot_table(
    values=["salary"],
    index=["unit"],
    columns=["designation"],
    aggfunc=[mean],
    fill_value=0,
    margins=True,
)
def_salary_mean_unit_designation


In [None]:
# Standard deviation of salary within each age and within each past-experience
# value. After reset_index the value column is still called 'salary', even
# though it now holds the standard deviation, not a salary.
df_salary_by_age = df_salary.groupby('age')['salary'].std().reset_index()
df_salary_by_exp = df_salary.groupby('past exp')['salary'].std().reset_index()

fig, ax = plt.subplot_mosaic([['age', 'exp']], figsize=(20, 10))
sns.lineplot(x='age', y='salary', data=df_salary_by_age, ax=ax['age'])
sns.lineplot(x='past exp', y='salary', data=df_salary_by_exp, ax=ax['exp'])

# Label the panels explicitly: the automatic y-label would read "salary",
# which misrepresents what is plotted (the spread of salary, not its level).
ax['age'].set(
    title='Salary standard deviation by age',
    xlabel='Age',
    ylabel='Salary standard deviation',
)
ax['exp'].set(
    title='Salary standard deviation by past experience',
    xlabel='Past experience',
    ylabel='Salary standard deviation',
)
plt.show()


In [None]:
# Ratings aggregated per (unit, designation) pair; pivot_table's default
# aggregation (the mean) is used because no aggfunc is passed.
df_rating_by_unit = df_salary.pivot_table(index=["unit"], columns=["designation"], values="ratings")

# show the heatmap of the ratings by unit and designation
sns.heatmap(df_rating_by_unit, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Ratings by Unit and Designation')
plt.show()

# Show the underlying table as well. It must be the cell's last expression to
# be rendered; as a bare expression above the plot it was silently discarded.
df_rating_by_unit
