In [1]:
import pandas as pd

df = pd.read_csv("./ds_salaries.csv")

display(df)

print(df["company_size"].unique())

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...,...,...
3750,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
3751,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
3752,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
3753,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L


['L' 'S' 'M']


In [2]:
df_features = df.copy(deep=True)

target = df_features["salary_in_usd"]
df_features.drop(["salary", "salary_in_usd"], axis=1, inplace=True)

display(df_features)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_currency,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,EUR,ES,100,ES,L
1,2023,MI,CT,ML Engineer,USD,US,100,US,S
2,2023,MI,CT,ML Engineer,USD,US,100,US,S
3,2023,SE,FT,Data Scientist,USD,CA,100,CA,M
4,2023,SE,FT,Data Scientist,USD,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...
3750,2020,SE,FT,Data Scientist,USD,US,100,US,L
3751,2021,MI,FT,Principal Data Scientist,USD,US,100,US,L
3752,2020,EN,FT,Data Scientist,USD,US,100,US,S
3753,2020,EN,CT,Business Data Analyst,USD,US,100,US,L


# Create preprocessor

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer


oneHot_features = ["work_year", "employment_type", "remote_ratio", "salary_currency", "employee_residence", "company_location","company_size", "experience_level"]
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(), oneHot_features),
    ("bow", CountVectorizer(), ["job_title"])
    ])
df_preprocessed4 = preprocessor.fit_transform(df_features)


# init Model

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

estimator = LinearRegression()

# Execute pipeline and evaluate

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

kfold = KFold(n_splits=5, shuffle=True, random_state = 42)

r2_scores = cross_val_score(pipeline, df_preprocessed1, target, cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed1, target, cv=kfold, scoring='neg_mean_squared_error')

r2_scores1 = cross_val_score(pipeline, df_preprocessed2, target, cv=kfold, scoring='r2')
mse_scores1 = -cross_val_score(pipeline, df_preprocessed2, target, cv=kfold, scoring='neg_mean_squared_error')

r2_scores2 = cross_val_score(pipeline, df_preprocessed3, target, cv=kfold, scoring='r2')
mse_scores2 = -cross_val_score(pipeline, df_preprocessed3, target, cv=kfold, scoring='neg_mean_squared_error')

r2_scores3 = cross_val_score(pipeline, df_preprocessed4, target, cv=kfold, scoring='r2')
mse_scores3 = -cross_val_score(pipeline, df_preprocessed4, target, cv=kfold, scoring='neg_mean_squared_error')

print("Variante1")
print(np.mean(r2_scores))
print(np.mean(mse_scores))
print("Variante2")
print(np.mean(r2_scores1))
print(np.mean(mse_scores1))
print("Variante3")
print(np.mean(r2_scores2))
print(np.mean(mse_scores2))
print("Variante4")
print(np.mean(r2_scores3))
print(np.mean(mse_scores3))

Variante1
0.39595090289473467
2399795759.8722467
Variante2
0.39764731394090347
2393031918.0397487
Variante3
0.396660391393596
2396915337.94404
Variante4
0.3982415072986292
2390609244.379858


# init Model

In [9]:
from xgboost import XGBRegressor

# Create an XGBRegressor instance
estimator = XGBRegressor(random_state=42)

# Apply hyperparameter tuning and evaluate

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
import numpy as np

kfold = KFold(n_splits=5, shuffle=True, random_state = 42)

# Define the parameter grid
param_grid = {
    'estimator__learning_rate': [0.025, 0.05, 0.1, 0.2, 0.3],
    'estimator__gamma': [0, 0.1, 0.2, 0.3, 0.4, 1.0, 1.5, 2.0],
    'estimator__max_depth': [2, 3, 5, 7, 10, 100],
    'estimator__colsample_bylevel': [0.25, 1.0],
    'estimator__subsample': [0.15, 0.5, 0.75, 1.0],
}

steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

# Create GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)

print("Version1: ", cross_val_score(grid_search, df_preprocessed4, target, cv=kfold, scoring='r2').mean())

# Fit the data to find the best parameters
grid_search.fit(df_preprocessed4, target)

# Get the best parameters and best score
best_params = grid_search.best_params_
print("Best Parameter Values: ", best_params)

# chatgpt: 
# GridSearchCV.best_score_ is an attribute that stores the average cross-validated score achieved by the best estimator found during the grid search.
best_score = grid_search.best_score_
print("Version 2: ", best_score)

# chatgpt:
# When you call gridsearchcv.score(X, y), it calculates the score of the best estimator using the input data X and target values y. 
# It returns a single score value that represents the performance of the best estimator on the provided data.
print("Version 3", grid_search.score(df_preprocessed4, target))

# Create a new XGBRegressor with the best parameters
best_xgb = grid_search.best_estimator_

steps = [('estimator', best_xgb)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed4, target, cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed4, target, cv=kfold, scoring='neg_mean_squared_error')

print("Version 4: ", np.mean(r2_scores))
print("MSE: ", np.mean(mse_scores))


Version1:  0.4249950618339019
Best Parameter Values:  {'estimator__colsample_bylevel': 0.25, 'estimator__gamma': 0, 'estimator__learning_rate': 0.05, 'estimator__max_depth': 7, 'estimator__subsample': 0.5}
Version 2:  0.38997787951209445
Version 3 0.5051946906677074
Version 4:  0.43341205424940166
MSE:  2250426367.6265516
