Supervised Learning Final: Data Engineer Salaries Model

The goal is to help companies decide how to set salaries for new positions.
Given the relevant factors for a position, the model predicts a salary, so that companies can set pay
in line with market value.

Dataset URL (Kaggle): https://www.kaggle.com/datasets/chopper53/data-engineer-salary-in-2024?resource=download

In [None]:
import pandas as pd
import sklearn.model_selection
import statsmodels.formula.api as smf

salaries = pd.read_csv('salaries.csv')

# Keep only full-time, US-resident rows for a single job title so that the
# salaries being compared are like-for-like.
# NOTE(review): this filter selects 'Data Scientist' although the notebook is
# titled "Data Engineer Salaries" -- confirm which job title is intended.
data = salaries[
    (salaries['job_title'] == 'Data Scientist')
    & (salaries['employment_type'] == 'FT')
    & (salaries['employee_residence'] == 'US')
]

# Distinct experience levels present before any level is removed.
exp_levels = data['experience_level'].unique()

# Remove the 'EX' level since we don't know what this is.
# Take an explicit copy: the column assignments below must write to an
# independent frame, not to a slice of `salaries` (this avoids pandas'
# SettingWithCopyWarning and any ambiguity about what gets modified).
data = data[data['experience_level'] != 'EX'].copy()

# Encode the remaining experience levels as ordinal numbers.
# A direct dict lookup is used on purpose: an unexpected code raises KeyError
# instead of silently turning into NaN.
exp_mapping = {'EN': 1, 'MI': 2, 'SE': 3}
data['experience_level'] = data['experience_level'].apply(lambda x: exp_mapping[x])

# Encode company size as ordinal numbers (same fail-loudly lookup as above).
company_size_mapping = {'S': 1, 'M': 2, 'L': 3}
data['company_size'] = data['company_size'].apply(lambda x: company_size_mapping[x])

# Drop features that are irrelevant to the model: they are either constant
# after the filter above (job title, employment type, residence) or not used
# as predictors (local-currency salary, currency, company location).
data = data.drop(columns=[
    'employment_type',
    'job_title',
    'salary',
    'salary_currency',
    'employee_residence',
    'company_location',
])

# Put the dependent variable first so that the first row/column of the
# correlation matrix shows every feature's correlation with salary.
data = data[['salary_in_usd', 'work_year', 'remote_ratio', 'experience_level', 'company_size']]

corr = data.corr()

# Hold out 20% of the rows. The seed is fixed so that the split -- and hence
# the metric printed below -- is reproducible between runs.
# Note: both X_train and X_test are full frames (target column included),
# which is what the statsmodels formula API expects.
X_train, X_test = sklearn.model_selection.train_test_split(
    data, test_size=0.2, random_state=42
)

# Simple linear regression of salary on the ordinal experience level.
model = smf.ols(formula='salary_in_usd ~ experience_level', data=X_train).fit()

# Adjusted R^2 of the fit on the training data (`rsquared` would be the
# unadjusted value, which does not match this variable's name).
adj_R2 = model.rsquared_adj

print(adj_R2)