In [None]:
%pip install pandas
%pip install numpy
%pip install matplotlib
%pip install seaborn
%pip install scikit-learn
%pip install skimpy


#### Import Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skimpy import skim


#### Get data


In [None]:
# Read the salary records; the path is relative to the current working directory.
salary_csv_path = "../datasets/salary.csv"
df_salary = pd.read_csv(salary_csv_path)

# Render every float pandas displays as a dollar amount with thousands
# separators and two decimals.
pd.options.display.float_format = "${:,.2f}".format


#### Clean Data


In [None]:
# Summarise the freshly loaded frame with skimpy's skim() before any cleaning is applied.
skim(df_salary)

In [None]:
# Report how many values are missing in each column. An explicit print() is
# needed because a cell only renders its last expression, and this is not it.
print(df_salary.isnull().sum())

# Drop every row that contains at least one missing value. Reassigning instead
# of using inplace=True keeps the data lineage explicit.
df_salary = df_salary.dropna()

In [None]:
# Count the missing values in each column; every count should be zero once
# rows with nulls have been dropped.
df_salary.isnull().sum()

In [None]:
# Print the column names, non-null counts and dtypes of the current frame.
df_salary.info()

#### Test Data


In [None]:
# Seed NumPy's global random generator so the preview below shows the same
# rows on every run.
seed_value = 43
np.random.seed(seed_value)

# Preview ten randomly chosen rows.
df_salary.sample(n=10)

In [None]:
# Cast the unit and designation columns to pandas' dedicated string dtype.
# Names are matched case-insensitively so the cast is applied whether this
# cell runs before or after the headers are lower-cased. An exact match on
# 'unit' / 'designation' is silently skipped when the CSV headers use another
# casing (presumably the case here, as the other raw headers are upper-case).
for column in list(df_salary.columns):
    if str(column).lower() in ('unit', 'designation'):
        df_salary[column] = df_salary[column].astype('string')


In [None]:
# Remove the name and date columns, which the rest of the notebook never reads.
# Reassigning with errors="ignore" (instead of inplace=True) makes the cell safe
# to re-run: once the columns are gone, a second strict drop would raise KeyError.
# axis=1 is omitted because columns= already identifies the axis.
df_salary = df_salary.drop(
    columns=["FIRST NAME", "LAST NAME", "DOJ", "CURRENT DATE"],
    errors="ignore",
)

# Normalise the remaining headers to lower case; later cells use lower-case names.
df_salary.columns = df_salary.columns.str.lower()
df_salary.columns

In [None]:
# Change the type of the age column to a 64-bit integer.
df_salary = df_salary.astype({'age': 'int64'})

In [None]:
# Number of rows recorded for each distinct unit value.
df_salary['unit'].value_counts()

In [None]:
# Number of rows recorded for each distinct designation value.
df_salary["designation"].value_counts()

In [None]:
# Number of rows recorded for each distinct value of the sex column.
df_salary["sex"].value_counts()

#### Check quality of data

In [None]:
# Calculate salary statistics for each unit. reset_index() turns the 'unit'
# group key back into an ordinary column so that x="unit" below resolves on
# every seaborn release (older ones cannot look up a name that exists only as
# the index). It also matches how the designation summary is built.
salary_stats = ['mean', 'median', 'std', 'min', 'max']
df_unit_by_salary = df_salary.groupby('unit')['salary'].agg(salary_stats).reset_index()

# Create a mosaic plot with one bar chart per statistic; 'max' spans the whole
# bottom row. The x axis is shared, the y axes are independent.
fig, ax = plt.subplot_mosaic([['mean', 'median'], ['std', 'min'], ['max', 'max']], sharex=True, sharey=False, figsize=(20, 20))
for stat in salary_stats:
    sns.barplot(x="unit", y=stat, data=df_unit_by_salary, ax=ax[stat])

# These pyplot calls act on the current axes only.
plt.xticks(rotation=90)
plt.xlabel("Unit")
plt.show()

In [None]:

# Summarise salary per designation; reset_index() keeps 'designation' as a
# regular column for plotting.
stat_names = ['mean', 'median', 'std', 'min', 'max']
df_designation_by_salary = df_salary.groupby('designation')['salary'].agg(stat_names).reset_index()

# One bar chart per statistic on a shared x and y axis; the 'max' panel spans
# the full bottom row of the mosaic.
panel_layout = [['mean', 'median'], ['std', 'min'], ['max', 'max']]
fig, ax = plt.subplot_mosaic(panel_layout, sharex=True, sharey=True, figsize=(20, 20))
for stat_name in stat_names:
    sns.barplot(data=df_designation_by_salary, x='designation', y=stat_name, ax=ax[stat_name])

# These pyplot calls act on the current axes only.
plt.xticks(rotation=90)
plt.xlabel('Designation')
plt.show()

In [None]:
# Count how many rows share each 'past exp' value, then move the values out of
# the index into a regular column so the result displays as a table.
past_exp_counts = df_salary['past exp'].value_counts()
df_salary_by_exp = past_exp_counts.reset_index()

df_salary_by_exp

In [None]:
# Keep at most the first 100 rows for each 'past exp' value (the cap is 100
# despite the "_10" in the variable name), retain only the two plotted columns,
# and order the rows from the highest to the lowest experience value.
first_rows_per_exp = df_salary.groupby('past exp').head(100)
plot_columns = first_rows_per_exp[['past exp', 'salary']]
df_exp_10 = plot_columns.sort_values('past exp', ascending=False).reset_index(drop=True)

# One box per experience value; hue mirrors x so each box takes its own colour
# from the 'dark' palette, in the same descending order as the data.
exp_levels = df_exp_10['past exp'].unique()
ax = sns.boxplot(data=df_exp_10, x='past exp', y='salary', hue='past exp', hue_order=exp_levels, palette='dark')

# Anchor the legend's upper-left corner at the axes' upper-right corner,
# placing it beside the plot.
sns.move_legend(ax, loc='upper left', bbox_to_anchor=(1, 1), title='Past Experience')
ax.set_title('Salary vs Past Experience')
ax.set_xlabel('Past Experience')
ax.set_ylabel('Salary')
plt.show()

In [None]:
# Mean salary for every unit (rows) x designation (columns) pair. Combinations
# with no rows are filled with 0, and margins=True adds the overall "All" row
# and column.
def_salary_mean_unit_designation = pd.pivot_table(
    df_salary,
    values=["salary"],
    index=["unit"],
    columns=["designation"],
    aggfunc=['mean'],
    fill_value=0,
    margins=True,
)
def_salary_mean_unit_designation

In [None]:

# Comparison between past experience and age: standard deviation of salary
# within each age value and within each 'past exp' value.
salary_by_age_groups = df_salary.groupby('age')['salary']
salary_by_exp_groups = df_salary.groupby('past exp')['salary']
df_salary_by_age = salary_by_age_groups.std().reset_index()
df_salary_by_exp = salary_by_exp_groups.std().reset_index()

# Side-by-side line plots; the 'salary' column on the y axis holds the
# per-group standard deviation computed above.
fig, ax = plt.subplot_mosaic([['age', 'exp']], figsize=(20, 10))
panels = (
    ('age', 'age', df_salary_by_age),
    ('exp', 'past exp', df_salary_by_exp),
)
for panel_key, x_column, std_frame in panels:
    sns.lineplot(data=std_frame, x=x_column, y='salary', ax=ax[panel_key])
plt.show()

In [None]:
# Aggregate ratings for every unit (rows) x designation (columns) pair, using
# pivot_table's default aggregation.
df_rating_by_unit = pd.pivot_table(
    df_salary,
    values="ratings",
    index=["unit"],
    columns=["designation"],
)

# Show the ratings as an annotated heatmap, each cell labelled to two decimals.
heatmap_ax = sns.heatmap(df_rating_by_unit, annot=True, fmt=".2f", cmap='coolwarm')
heatmap_ax.set_title('Ratings by Unit and Designation')
heatmap_ax.set_xlabel('Designation')
heatmap_ax.set_ylabel('Unit')
plt.show()

# Selection of features

In [None]:
# Keep only the model inputs and the target. Selecting first and copying last
# makes data_salary an independent frame: the earlier copy-then-slice order
# copied unused columns and left data_salary as a slice that pandas may flag
# with SettingWithCopyWarning when columns are assigned later.
feature_columns = ['unit', 'designation', 'past exp', 'salary']
data_salary = df_salary[feature_columns].copy()

# Non-null count per retained column.
data_salary.count()

## Machine Learning

In [None]:

# model selection, estimator and metric imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import  mean_squared_error, r2_score, mean_absolute_error

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the two categorical columns as integer codes, with one encoder per
# column so each keeps its own label <-> code mapping.
le = LabelEncoder()
da  = LabelEncoder()

# Capture the original labels before they are overwritten, so the
# label -> code mapping can be printed below.
du = data_salary['unit'].unique()
dd = data_salary['designation'].unique()
data_salary['unit'] = le.fit_transform(data_salary['unit'])
data_salary['designation'] = da.fit_transform(data_salary['designation'])


print(dict(zip(du,le.transform(du))))
print(dict(zip(dd,da.transform(dd))))

ml_data = data_salary.copy()
ml_data['unit'] = ml_data['unit'].astype('int32')
# Cast the designation codes themselves. This line used to read from
# ml_data['unit'], which overwrote the designation feature with the unit codes
# and left the model with two identical inputs.
ml_data['designation'] = ml_data['designation'].astype('int32')


# Split data: 80% train / 20% test, with a fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(ml_data.drop(columns='salary'), ml_data['salary'], test_size=0.2, random_state=43)


In [None]:
from sklearn.pipeline import make_pipeline

# Single-step pipeline around a Lasso regressor. random_state is fixed because
# the grid below tries selection='random', which draws on the estimator's
# random generator; without a seed, repeated runs of the search can select
# different models.
pipeline = make_pipeline(Lasso(random_state=43))

# Hyper-parameter grid; keys use the step name "lasso" that make_pipeline
# derives from the estimator class.
# NOTE(review): warm_start looks redundant here, since GridSearchCV fits a
# fresh clone for every candidate - confirm before pruning it from the grid.
param_grid = {'lasso__alpha': np.linspace(0.1, 1, 10),
              'lasso__max_iter': [100, 1000, 10000],
              'lasso__selection': ['cyclic', 'random'],
              'lasso__warm_start': [True, False],
              'lasso__positive': [True, False]}

# Exhaustive 5-fold cross-validated search. n_jobs=-1 uses all available cores
# rather than a hard-coded worker count that may not match the machine.
gs = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
gs.fit(x_train, y_train)

print(gs.best_params_)

# Predict the held-out targets with the best model found by the search.
y_pred = gs.predict(x_test)


In [None]:
# Score the predictions against the held-out targets.
Mean_Squared_Error = mean_squared_error(y_true=y_test, y_pred=y_pred)
R2_Score = r2_score(y_true=y_test, y_pred=y_pred)
Mean_Absolute_Error = mean_absolute_error(y_true=y_test, y_pred=y_pred)

# Print one "label: value" line per metric.
metric_report = (
    ("Mean Squared Error", Mean_Squared_Error),
    ("R2 Score", R2_Score),
    ("Mean Absolute Error", Mean_Absolute_Error),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value}")

In [None]:
# Export the fitted grid-search object so it can be loaded outside this notebook.
import joblib
from pathlib import Path

model_path = '../streamlit/model/salary.pkl'

# Create the target directory if it is missing; joblib.dump does not create
# parent folders and would otherwise fail with FileNotFoundError.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(gs, model_path)