# **Loading**

In [23]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler, StandardScaler, MaxAbsScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import KFold, GridSearchCV

from dataset.load import load_df
from utils import evaluate

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Metrics handed to every cross_validate / GridSearchCV call in this notebook.
# The two error metrics use scikit-learn's negated scorers, which is why the
# reporting code multiplies them by -1 before printing.
scoring = [
    "neg_root_mean_squared_error",
    "neg_mean_absolute_error",
    "r2",
]

In [3]:
# Load the dataset through the project's loader, report its size and
# display the first two rows as the cell output.
df = load_df()
print("Dataframe shape:", df.shape)
df.head(n=2)

Dataframe shape: (21283, 50)


Unnamed: 0,RemoteWork,EdLevel,YearsCodePro,Country,Age,Salary,"Developer, full-stack","Developer, back-end","Developer, front-end",DevOps specialist,...,Microsoft Azure,Google Cloud,Firebase,Heroku,DigitalOcean,Docker,npm,Homebrew,Yarn,Kubernetes
10,"Hybrid (some remote, some in-person)",Bachelor’s degree,2.0,United Kingdom of Great Britain and Northern I...,18-24 years old,60307.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
12,"Hybrid (some remote, some in-person)",Bachelor’s degree,5.0,United States of America,25-34 years old,65000.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# **Get Train-Test split**

In [4]:
# Hold out a seeded random 5% of the rows as the test set; the sampled 95%
# becomes the training set.
print("*" * 25)
train = df.sample(frac=0.95, random_state=42)
test = df.drop(index=train.index)  # everything that was not sampled into `train`

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("*" * 25)

# Split each part into the feature frame and the "Salary" target array.
X_train, y_train = train.drop(columns=["Salary"]), train["Salary"].values
X_test, y_test = test.drop(columns=["Salary"]), test["Salary"].values

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("*" * 25)

*************************
Train shape: (20219, 50)
Test shape: (1064, 50)
*************************
X_train shape: (20219, 49)
y_train shape: (20219,)
X_test shape: (1064, 49)
y_test shape: (1064,)
*************************


# **AdaBoost**

In [None]:
# 5-fold cross-validation of an AdaBoost ensemble of decision trees on the training split.

# Preprocessing (part of the pipeline, so it is re-fitted inside every fold):
#   - EdLevel / Country / Age -> integer codes
#   - RemoteWork              -> one-hot columns
#   - YearsCodePro            -> divided by its maximum absolute value
#   - every other column      -> passed through unchanged
transform = ColumnTransformer([
    # A category that did not occur in the fold used for fitting is encoded as -1
    # instead of raising when the held-out fold is transformed.
    ("label", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["EdLevel", "Country", "Age"]),
    # No `sparse=` keyword here: scikit-learn 1.2 renamed it to `sparse_output` and the
    # old name was later removed, so neither spelling works on every version.
    # Dense output is requested through `sparse_threshold=0` below instead.
    ("onehot", OneHotEncoder(handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough", sparse_threshold=0)

# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2; the position is the same in both.
model = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=200, random_state=42)

pipe = Pipeline([
    ("preprocess", transform),
    ("model", model)
])

# Shuffled, seeded folds so the split is the same on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=2)
# The error scorers are negated by scikit-learn; flip the sign back for reporting.
print(f"RMSE: mean: {-1 * np.mean(scores['test_neg_root_mean_squared_error'])} | {-1 * scores['test_neg_root_mean_squared_error']}")
print(f"MAE: mean: {-1 * np.mean(scores['test_neg_mean_absolute_error'])} | {-1 * scores['test_neg_mean_absolute_error']}")
print(f"R2-score: mean: {np.mean(scores['test_r2'])} | {scores['test_r2']}")
print("*" * 69)
print()

RMSE: mean: 38764.97440457434 | [37585.35695962 38924.38736436 38630.01312895 40356.03589556
 38329.07867438]
MAE: mean: 26193.787291398443 | [25466.27744807 26485.87362541 26011.96844664 27022.31706053
 25982.49987633]
R2-score: mean: 0.5928480966720853 | [0.60719844 0.5944499  0.59798265 0.56624913 0.59836036]
*********************************************************************



In [17]:
# Grid search over the number of AdaBoost estimators (200, 250, ..., 550).

# Preprocessing: ordinal codes for EdLevel/Country/Age, one-hot RemoteWork,
# max-abs scaled YearsCodePro, all other columns passed through unchanged.
transform = ColumnTransformer([
    # Unknown categories are encoded as -1 instead of raising at transform time.
    ("label", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["EdLevel", "Country", "Age"]),
    # No `sparse=` keyword: it was renamed to `sparse_output` in scikit-learn 1.2 and
    # later removed. `sparse_threshold=0` below forces dense output on every version.
    ("onehot", OneHotEncoder(handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough", sparse_threshold=0)

# Base learner passed positionally: its keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2, while the position is unchanged.
model = AdaBoostRegressor(DecisionTreeRegressor(), random_state=42)
params = {
    "n_estimators": [*range(200, 560, 50)]
}

# All metrics in `scoring` are computed; the best candidate is chosen and refitted by R2.
grid = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=3, verbose=3, cv=5, refit="r2")

# NOTE: the search is the final pipeline step, so `transform` is fitted once on all of
# X_train and only the regressor is re-fitted inside the search's internal 5-fold CV.
pipe = Pipeline([
    ("preprocess", transform),
    ("grid", grid)
])

pipe.fit(X_train, y_train)
print(f"The best params: {pipe['grid'].best_params_}")
print(f"The best score: {pipe['grid'].best_score_}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits




The best params: {'n_estimators': 400}
The best score: 0.5960465049256479


# **Bagging**

In [None]:
# 5-fold cross-validation of a bagged ensemble of decision trees on the training split.

# Preprocessing (part of the pipeline, so it is re-fitted inside every fold):
#   - EdLevel / Country / Age -> integer codes
#   - RemoteWork              -> one-hot columns
#   - YearsCodePro            -> divided by its maximum absolute value
#   - every other column      -> passed through unchanged
transform = ColumnTransformer([
    # A category that did not occur in the fold used for fitting is encoded as -1
    # instead of raising when the held-out fold is transformed.
    ("label", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["EdLevel", "Country", "Age"]),
    # No `sparse=` keyword here: scikit-learn 1.2 renamed it to `sparse_output` and the
    # old name was later removed. Dense output is requested via `sparse_threshold=0`.
    ("onehot", OneHotEncoder(handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough", sparse_threshold=0)

# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2; the position is the same in both.
model = BaggingRegressor(DecisionTreeRegressor(), n_estimators=200, n_jobs=2, random_state=42)

pipe = Pipeline([
    ("preprocess", transform),
    ("model", model)
])

# Shuffled, seeded folds so the split is the same on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=2)
# The error scorers are negated by scikit-learn; flip the sign back for reporting.
print(f"RMSE: mean: {-1 * np.mean(scores['test_neg_root_mean_squared_error'])} | {-1 * scores['test_neg_root_mean_squared_error']}")
print(f"MAE: mean: {-1 * np.mean(scores['test_neg_mean_absolute_error'])} | {-1 * scores['test_neg_mean_absolute_error']}")
print(f"R2-score: mean: {np.mean(scores['test_r2'])} | {scores['test_r2']}")
print("*" * 69)
print()

RMSE: mean: 38911.11883075573 | [38070.85579488 39099.72393929 38477.89847245 40414.39031969
 38492.72562747]
MAE: mean: 27417.893751010077 | [26963.84926682 27849.56810872 27012.6656693  28108.0319531
 27155.35375711]
R2-score: mean: 0.5897665701292938 | [0.59698508 0.59078803 0.60114249 0.56499383 0.59492342]
*********************************************************************



In [19]:
# Grid search over the number of bagged trees (200, 250, ..., 550).

# Preprocessing: ordinal codes for EdLevel/Country/Age, one-hot RemoteWork,
# max-abs scaled YearsCodePro, all other columns passed through unchanged.
transform = ColumnTransformer([
    # Unknown categories are encoded as -1 instead of raising at transform time.
    ("label", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["EdLevel", "Country", "Age"]),
    # No `sparse=` keyword: it was renamed to `sparse_output` in scikit-learn 1.2 and
    # later removed. `sparse_threshold=0` below forces dense output on every version.
    ("onehot", OneHotEncoder(handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough", sparse_threshold=0)

# Base learner passed positionally: its keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2, while the position is unchanged.
model = BaggingRegressor(DecisionTreeRegressor(), random_state=42)
params = {
    "n_estimators": [*range(200, 560, 50)]
}

# All metrics in `scoring` are computed; the best candidate is chosen and refitted by R2.
grid = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=-1, verbose=3, cv=5, refit="r2")

# NOTE: the search is the final pipeline step, so `transform` is fitted once on all of
# X_train and only the regressor is re-fitted inside the search's internal 5-fold CV.
pipe = Pipeline([
    ("preprocess", transform),
    ("grid", grid)
])

pipe.fit(X_train, y_train)
print(f"The best params: {pipe['grid'].best_params_}")
print(f"The best score: {pipe['grid'].best_score_}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits




The best params: {'n_estimators': 550}
The best score: 0.5905154579424845


# **RandomForest**

In [None]:
# 5-fold cross-validation of a random forest on the training split.

# Preprocessing (part of the pipeline, so it is re-fitted inside every fold):
#   - EdLevel / Country / Age -> integer codes
#   - RemoteWork              -> one-hot columns
#   - YearsCodePro            -> divided by its maximum absolute value
#   - every other column      -> passed through unchanged
transform = ColumnTransformer([
    # A category that did not occur in the fold used for fitting is encoded as -1
    # instead of raising when the held-out fold is transformed.
    ("label", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["EdLevel", "Country", "Age"]),
    # No `sparse=` keyword here: scikit-learn 1.2 renamed it to `sparse_output` and the
    # old name was later removed. Dense output is requested via `sparse_threshold=0`.
    ("onehot", OneHotEncoder(handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough", sparse_threshold=0)

model = RandomForestRegressor(n_estimators=200, n_jobs=2, random_state=42)

pipe = Pipeline([
    ("preprocess", transform),
    ("model", model)
])

# Shuffled, seeded folds so the split is the same on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=2)
# The error scorers are negated by scikit-learn; flip the sign back for reporting.
print(f"RMSE: mean: {-1 * np.mean(scores['test_neg_root_mean_squared_error'])} | {-1 * scores['test_neg_root_mean_squared_error']}")
print(f"MAE: mean: {-1 * np.mean(scores['test_neg_mean_absolute_error'])} | {-1 * scores['test_neg_mean_absolute_error']}")
print(f"R2-score: mean: {np.mean(scores['test_r2'])} | {scores['test_r2']}")
print("*" * 69)
print()

RMSE: mean: 38917.28320578492 | [38091.66786902 39123.25578711 38459.56648893 40363.86931061
 38548.05657325]
MAE: mean: 27427.936402339943 | [26987.93126113 27882.32719873 27010.836114   28074.1387908
 27184.44864704]
R2-score: mean: 0.5896401744787039 | [0.59654433 0.59029532 0.60152246 0.56608073 0.59375804]
*********************************************************************



In [21]:
# Grid search over the number of random-forest trees (200, 250, ..., 550).

# Preprocessing: ordinal codes for EdLevel/Country/Age, one-hot RemoteWork,
# max-abs scaled YearsCodePro, all other columns passed through unchanged.
transform = ColumnTransformer([
    # Unknown categories are encoded as -1 instead of raising at transform time.
    ("label", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["EdLevel", "Country", "Age"]),
    # No `sparse=` keyword: it was renamed to `sparse_output` in scikit-learn 1.2 and
    # later removed. `sparse_threshold=0` below forces dense output on every version.
    ("onehot", OneHotEncoder(handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough", sparse_threshold=0)

model = RandomForestRegressor(random_state=42)
params = {
    "n_estimators": [*range(200, 560, 50)]
}

# All metrics in `scoring` are computed; the best candidate is chosen and refitted by R2.
grid = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=-1, verbose=3, cv=5, refit="r2")

# NOTE: the search is the final pipeline step, so `transform` is fitted once on all of
# X_train and only the regressor is re-fitted inside the search's internal 5-fold CV.
pipe = Pipeline([
    ("preprocess", transform),
    ("grid", grid)
])

pipe.fit(X_train, y_train)
print(f"The best params: {pipe['grid'].best_params_}")
print(f"The best score: {pipe['grid'].best_score_}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits




The best params: {'n_estimators': 550}
The best score: 0.5904401829715102


# **Gradient Boost**

In [None]:
# 5-fold cross-validation of gradient-boosted trees on the training split.

# Preprocessing (part of the pipeline, so it is re-fitted inside every fold):
#   - EdLevel / Country / Age -> integer codes
#   - RemoteWork              -> one-hot columns
#   - YearsCodePro            -> divided by its maximum absolute value
#   - every other column      -> passed through unchanged
transform = ColumnTransformer([
    # A category that did not occur in the fold used for fitting is encoded as -1
    # instead of raising when the held-out fold is transformed.
    ("label", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["EdLevel", "Country", "Age"]),
    # No `sparse=` keyword here: scikit-learn 1.2 renamed it to `sparse_output` and the
    # old name was later removed. Dense output is requested via `sparse_threshold=0`.
    ("onehot", OneHotEncoder(handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough", sparse_threshold=0)

# Seeded so that re-running the cell reproduces the same scores.
model = GradientBoostingRegressor(n_estimators=200, random_state=42)

pipe = Pipeline([
    ("preprocess", transform),
    ("model", model)
])

# Shuffled, seeded folds so the split is the same on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=2)
# The error scorers are negated by scikit-learn; flip the sign back for reporting.
print(f"RMSE: mean: {-1 * np.mean(scores['test_neg_root_mean_squared_error'])} | {-1 * scores['test_neg_root_mean_squared_error']}")
print(f"MAE: mean: {-1 * np.mean(scores['test_neg_mean_absolute_error'])} | {-1 * scores['test_neg_mean_absolute_error']}")
print(f"R2-score: mean: {np.mean(scores['test_r2'])} | {scores['test_r2']}")
print("*" * 69)
print()

RMSE: mean: 37730.7612213689 | [36736.50299898 37845.68564879 37482.05424689 39532.57814741
 37056.98506477]
MAE: mean: 26009.150920067732 | [25515.908245   26423.78759751 25698.72456364 26827.23175549
 25580.10243871]
R2-score: mean: 0.614245085231671 | [0.62474069 0.61661625 0.62152095 0.58376976 0.62457778]
*********************************************************************



In [23]:
# Grid search over the number of gradient-boosting stages (200, 250, ..., 550).

# Preprocessing: ordinal codes for EdLevel/Country/Age, one-hot RemoteWork,
# max-abs scaled YearsCodePro, all other columns passed through unchanged.
transform = ColumnTransformer([
    # Unknown categories are encoded as -1 instead of raising at transform time.
    ("label", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["EdLevel", "Country", "Age"]),
    # No `sparse=` keyword: it was renamed to `sparse_output` in scikit-learn 1.2 and
    # later removed. `sparse_threshold=0` below forces dense output on every version.
    ("onehot", OneHotEncoder(handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough", sparse_threshold=0)

# Seeded so that the selected n_estimators and its score are reproducible.
model = GradientBoostingRegressor(random_state=42)

params = {
    "n_estimators": [*range(200, 560, 50)]
}

# All metrics in `scoring` are computed; the best candidate is chosen and refitted by R2.
grid = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=3, verbose=3, cv=5, refit="r2")

# NOTE: the search is the final pipeline step, so `transform` is fitted once on all of
# X_train and only the regressor is re-fitted inside the search's internal 5-fold CV.
pipe = Pipeline([
    ("preprocess", transform),
    ("grid", grid)
])

pipe.fit(X_train, y_train)
print(f"The best params: {pipe['grid'].best_params_}")
print(f"The best score: {pipe['grid'].best_score_}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
The best params: {'n_estimators': 450}
The best score: 0.6162582664420287


# **Train & Save Best Model**

In [6]:
# Fit the final pipeline (gradient boosting, 450 stages) on the full training split
# and evaluate it once on the held-out test split.

# Preprocessing: ordinal codes for EdLevel/Country/Age, one-hot RemoteWork,
# max-abs scaled YearsCodePro, all other columns passed through unchanged.
transform = ColumnTransformer([
    # Categories that were not present in X_train (e.g. a new country at prediction
    # time) are encoded as -1 instead of making `predict` raise.
    ("label", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["EdLevel", "Country", "Age"]),
    # No `sparse=` keyword: it was renamed to `sparse_output` in scikit-learn 1.2 and
    # later removed. `sparse_threshold=0` below forces dense output on every version.
    ("onehot", OneHotEncoder(handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough", sparse_threshold=0)

# Seeded so that re-running the notebook produces the same fitted model and metrics.
model = GradientBoostingRegressor(n_estimators=450, random_state=42)

pipe = Pipeline([
    ("preprocess", transform),
    ("model", model)
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
# `evaluate` is the project helper from utils; its return value is the cell output.
evaluate(y_test, y_pred)

                         Metrics        Values
0  Root Mean Square Error (RMSE)  37359.955759
1      Mean Absolute Error (MAE)  25795.854225
2                  R2-score (R2)      0.613329


In [17]:
# Write the current `pipe` object to disk; the list of written file names is the cell output.
# NOTE: joblib files are pickle-based, so only load them back from a trusted source.
joblib.dump(value=pipe, filename="best_model.joblib")

['best_model.joblib']