In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler, StandardScaler, MaxAbsScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import KFold, GridSearchCV

from dataset.load import load_df
from utils import evaluate

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Metrics reported by every cross-validation run in this notebook.
# The two error metrics use scikit-learn's "neg_" scorers, so the printing
# code multiplies them by -1 before display.
scoring = [
    "neg_root_mean_squared_error",
    "neg_mean_absolute_error",
    "r2",
]

In [3]:
# Load the dataset through the project helper and show its size.
df = load_df()
print(f"Dataframe shape: {df.shape}")

# Last expression of the cell: render the first two rows as a quick preview.
df.head(n=2)

Dataframe shape: (21283, 50)


Unnamed: 0,RemoteWork,EdLevel,YearsCodePro,Country,Age,Salary,"Developer, full-stack","Developer, back-end","Developer, front-end",DevOps specialist,...,Microsoft Azure,Google Cloud,Firebase,Heroku,DigitalOcean,Docker,npm,Homebrew,Yarn,Kubernetes
10,"Hybrid (some remote, some in-person)",Bachelor’s degree,2.0,United Kingdom of Great Britain and Northern I...,18-24 years old,60307.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
12,"Hybrid (some remote, some in-person)",Bachelor’s degree,5.0,United States of America,25-34 years old,65000.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# **Get Train-Test split**

In [4]:
# Hold out 5% of the rows as a test set: sample 95% for training with a fixed
# seed, then take every row whose index label was not sampled as the test set.
print("*" * 25)
train = df.sample(frac=0.95, random_state=42)
test = df.drop(index=train.index)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print("*" * 25)

# Split features from the regression target ("Salary"); targets are kept as
# plain arrays via `.values`.
X_train, y_train = train.drop(columns=["Salary"]), train["Salary"].values
X_test, y_test = test.drop(columns=["Salary"]), test["Salary"].values

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")
print("*" * 25)

*************************
Train shape: (20219, 50)
Test shape: (1064, 50)
*************************
X_train shape: (20219, 49)
y_train shape: (20219,)
X_test shape: (1064, 49)
y_test shape: (1064,)
*************************


# **AdaBoost**

In [5]:
# AdaBoost baseline: ordinal codes for EdLevel/Country/Age, one-hot for
# RemoteWork, max-abs scaling for YearsCodePro; all other columns pass through
# unchanged.
# NOTE(review): newer scikit-learn releases rename `sparse` -> `sparse_output`
# and `base_estimator` -> `estimator`; confirm the pinned version before upgrading.
transform = ColumnTransformer(
    transformers=[
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse=False, handle_unknown="ignore"), ["RemoteWork"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)

model = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(),
    n_estimators=200,
    random_state=42,
)

# Preprocessing lives inside the pipeline so it is re-fit on each CV training fold.
pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=2)

# Per-fold scores; the error metrics are stored negated by scikit-learn.
fold_neg_rmse = scores["test_neg_root_mean_squared_error"]
fold_neg_mae = scores["test_neg_mean_absolute_error"]
fold_r2 = scores["test_r2"]

print(f"RMSE: mean: {-1 * np.mean(fold_neg_rmse)} | {-1 * fold_neg_rmse}")
print(f"MAE: mean: {-1 * np.mean(fold_neg_mae)} | {-1 * fold_neg_mae}")
print(f"R2-score: mean: {np.mean(fold_r2)} | {fold_r2}")
print("*" * 69)
print()

RMSE: mean: 38764.97440457434 | [37585.35695962 38924.38736436 38630.01312895 40356.03589556
 38329.07867438]
MAE: mean: 26193.787291398443 | [25466.27744807 26485.87362541 26011.96844664 27022.31706053
 25982.49987633]
R2-score: mean: 0.5928480966720853 | [0.60719844 0.5944499  0.59798265 0.56624913 0.59836036]
*********************************************************************



In [None]:
# Grid-search the number of AdaBoost estimators (200..550 in steps of 50).
# The preprocessing step is fit once on all of X_train and its output is handed
# to GridSearchCV, which then runs its own 5-fold CV on the transformed matrix.
transform = ColumnTransformer([
    ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
    ("onehot", OneHotEncoder(sparse=False, handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough")

model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), random_state=42)
params = {
    "n_estimators": [*range(200, 560, 50)]
}

# With several scoring metrics, `refit="r2"` selects which metric picks the
# best candidate and makes `best_params_` / `best_score_` available.
grid = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, n_jobs=3, verbose=1, cv=5, refit="r2")

pipe = Pipeline([
    ("preprocess", transform),
    ("grid", grid)
])

pipe.fit(X_train, y_train)

# The search results live on the fitted GridSearchCV step, not on the Pipeline:
# `pipe.best_params` / `pipe.best_score_` raise AttributeError, and the params
# attribute is spelled `best_params_` (trailing underscore).
fitted_grid = pipe.named_steps["grid"]
print(f"The best params: {fitted_grid.best_params_}")
print(f"The best score: {fitted_grid.best_score_}")

In [6]:
# AdaBoost variant: same setup as the baseline, except the ordinal codes of
# EdLevel/Country/Age are additionally max-abs scaled.
# NOTE(review): newer scikit-learn releases rename `sparse` -> `sparse_output`
# and `base_estimator` -> `estimator`; confirm the pinned version before upgrading.
label_scaling = Pipeline(steps=[
    ("label_encode", OrdinalEncoder()),
    ("scale", MaxAbsScaler()),
])

transform = ColumnTransformer(
    transformers=[
        ("label", label_scaling, ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse=False, handle_unknown="ignore"), ["RemoteWork"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)

model = AdaBoostRegressor(
    base_estimator=DecisionTreeRegressor(),
    n_estimators=200,
    random_state=42,
)

pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# `return_estimator=True` also keeps the fitted pipeline of every fold in `scores`.
scores = cross_validate(
    pipe, X_train, y_train,
    cv=kfold, scoring=scoring, n_jobs=2, return_estimator=True,
)

# Per-fold scores; the error metrics are stored negated by scikit-learn.
fold_neg_rmse = scores["test_neg_root_mean_squared_error"]
fold_neg_mae = scores["test_neg_mean_absolute_error"]
fold_r2 = scores["test_r2"]

print(f"RMSE: mean: {-1 * np.mean(fold_neg_rmse)} | {-1 * fold_neg_rmse}")
print(f"MAE: mean: {-1 * np.mean(fold_neg_mae)} | {-1 * fold_neg_mae}")
print(f"R2-score: mean: {np.mean(fold_r2)} | {fold_r2}")
print("*" * 69)
print()

RMSE: mean: 38659.405212849546 | [37566.69471491 38713.00785021 38599.58329832 40031.06714097
 38386.67305984]
MAE: mean: 26132.649260817718 | [25427.60608309 26380.83965021 26088.16861034 26787.17784223
 25979.45411823]
R2-score: mean: 0.595081167178998 | [0.60758842 0.59884263 0.59861576 0.5732066  0.59715242]
*********************************************************************



# **Bagging**

In [7]:
# Bagging baseline: 200 bagged decision trees on top of the shared
# preprocessing (ordinal codes, one-hot RemoteWork, scaled YearsCodePro,
# remaining columns passed through).
# NOTE(review): newer scikit-learn releases rename `sparse` -> `sparse_output`
# and `base_estimator` -> `estimator`; confirm the pinned version before upgrading.
transform = ColumnTransformer(
    transformers=[
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse=False, handle_unknown="ignore"), ["RemoteWork"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)

model = BaggingRegressor(
    base_estimator=DecisionTreeRegressor(),
    n_estimators=200,
    n_jobs=2,
    random_state=42,
)

pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=2)

# Per-fold scores; the error metrics are stored negated by scikit-learn.
fold_neg_rmse = scores["test_neg_root_mean_squared_error"]
fold_neg_mae = scores["test_neg_mean_absolute_error"]
fold_r2 = scores["test_r2"]

print(f"RMSE: mean: {-1 * np.mean(fold_neg_rmse)} | {-1 * fold_neg_rmse}")
print(f"MAE: mean: {-1 * np.mean(fold_neg_mae)} | {-1 * fold_neg_mae}")
print(f"R2-score: mean: {np.mean(fold_r2)} | {fold_r2}")
print("*" * 69)
print()

RMSE: mean: 38911.11883075573 | [38070.85579488 39099.72393929 38477.89847245 40414.39031969
 38492.72562747]
MAE: mean: 27417.893751010077 | [26963.84926682 27849.56810872 27012.6656693  28108.0319531
 27155.35375711]
R2-score: mean: 0.5897665701292938 | [0.59698508 0.59078803 0.60114249 0.56499383 0.59492342]
*********************************************************************



In [8]:
# Bagging variant: identical to the bagging baseline, except the ordinal codes
# of EdLevel/Country/Age are additionally max-abs scaled.
# NOTE(review): newer scikit-learn releases rename `sparse` -> `sparse_output`
# and `base_estimator` -> `estimator`; confirm the pinned version before upgrading.
label_scaling = Pipeline(steps=[
    ("label_encode", OrdinalEncoder()),
    ("scale", MaxAbsScaler()),
])

transform = ColumnTransformer(
    transformers=[
        ("label", label_scaling, ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse=False, handle_unknown="ignore"), ["RemoteWork"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)

model = BaggingRegressor(
    base_estimator=DecisionTreeRegressor(),
    n_estimators=200,
    n_jobs=2,
    random_state=42,
)

pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# `return_estimator=True` also keeps the fitted pipeline of every fold in `scores`.
scores = cross_validate(
    pipe, X_train, y_train,
    cv=kfold, scoring=scoring, n_jobs=2, return_estimator=True,
)

# Per-fold scores; the error metrics are stored negated by scikit-learn.
fold_neg_rmse = scores["test_neg_root_mean_squared_error"]
fold_neg_mae = scores["test_neg_mean_absolute_error"]
fold_r2 = scores["test_r2"]

print(f"RMSE: mean: {-1 * np.mean(fold_neg_rmse)} | {-1 * fold_neg_rmse}")
print(f"MAE: mean: {-1 * np.mean(fold_neg_mae)} | {-1 * fold_neg_mae}")
print(f"R2-score: mean: {np.mean(fold_r2)} | {fold_r2}")
print("*" * 69)
print()

RMSE: mean: 38911.48390781162 | [38074.50016802 39101.40081064 38476.81493544 40412.19987377
 38492.50375119]
MAE: mean: 27418.35992826893 | [26965.17349777 27850.79318538 27013.14875907 28107.74126813
 27154.94293099]
R2-score: mean: 0.5897589749143257 | [0.59690792 0.59075293 0.60116496 0.56504098 0.59492809]
*********************************************************************



# **RandomForest**

In [9]:
# Random forest baseline: 200 trees on top of the shared preprocessing
# (ordinal codes, one-hot RemoteWork, scaled YearsCodePro, remaining columns
# passed through).
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# to `sparse_output`; confirm the pinned version before upgrading.
transform = ColumnTransformer(
    transformers=[
        ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse=False, handle_unknown="ignore"), ["RemoteWork"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)

model = RandomForestRegressor(n_estimators=200, n_jobs=2, random_state=42)

pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=2)

# Per-fold scores; the error metrics are stored negated by scikit-learn.
fold_neg_rmse = scores["test_neg_root_mean_squared_error"]
fold_neg_mae = scores["test_neg_mean_absolute_error"]
fold_r2 = scores["test_r2"]

print(f"RMSE: mean: {-1 * np.mean(fold_neg_rmse)} | {-1 * fold_neg_rmse}")
print(f"MAE: mean: {-1 * np.mean(fold_neg_mae)} | {-1 * fold_neg_mae}")
print(f"R2-score: mean: {np.mean(fold_r2)} | {fold_r2}")
print("*" * 69)
print()

RMSE: mean: 38917.28320578492 | [38091.66786902 39123.25578711 38459.56648893 40363.86931061
 38548.05657325]
MAE: mean: 27427.936402339943 | [26987.93126113 27882.32719873 27010.836114   28074.1387908
 27184.44864704]
R2-score: mean: 0.5896401744787039 | [0.59654433 0.59029532 0.60152246 0.56608073 0.59375804]
*********************************************************************



In [10]:
# Random forest variant: identical to the random forest baseline, except the
# ordinal codes of EdLevel/Country/Age are additionally max-abs scaled.
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# to `sparse_output`; confirm the pinned version before upgrading.
label_scaling = Pipeline(steps=[
    ("label_encode", OrdinalEncoder()),
    ("scale", MaxAbsScaler()),
])

transform = ColumnTransformer(
    transformers=[
        ("label", label_scaling, ["EdLevel", "Country", "Age"]),
        ("onehot", OneHotEncoder(sparse=False, handle_unknown="ignore"), ["RemoteWork"]),
        ("scaler", MaxAbsScaler(), ["YearsCodePro"]),
    ],
    remainder="passthrough",
)

model = RandomForestRegressor(n_estimators=200, n_jobs=2, random_state=42)

pipe = Pipeline(steps=[("preprocess", transform), ("model", model)])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# `return_estimator=True` also keeps the fitted pipeline of every fold in `scores`.
scores = cross_validate(
    pipe, X_train, y_train,
    cv=kfold, scoring=scoring, n_jobs=2, return_estimator=True,
)

# Per-fold scores; the error metrics are stored negated by scikit-learn.
fold_neg_rmse = scores["test_neg_root_mean_squared_error"]
fold_neg_mae = scores["test_neg_mean_absolute_error"]
fold_r2 = scores["test_r2"]

print(f"RMSE: mean: {-1 * np.mean(fold_neg_rmse)} | {-1 * fold_neg_rmse}")
print(f"MAE: mean: {-1 * np.mean(fold_neg_mae)} | {-1 * fold_neg_mae}")
print(f"R2-score: mean: {np.mean(fold_r2)} | {fold_r2}")
print("*" * 69)
print()

RMSE: mean: 38917.73001434382 | [38095.71336438 39125.12720277 38458.77956107 40361.39331942
 38547.63662407]
MAE: mean: 27428.459827085626 | [26989.6970413  27882.14274151 27012.45427671 28073.68307122
 27184.3220047 ]
R2-score: mean: 0.5896308729818247 | [0.59645863 0.59025613 0.60153876 0.56613396 0.59376689]
*********************************************************************



# **Gradient Boost**

In [11]:
# Gradient boosting baseline: 200 boosting stages on top of the shared
# preprocessing (ordinal codes, one-hot RemoteWork, scaled YearsCodePro,
# remaining columns passed through).
transform = ColumnTransformer([
    ("label", OrdinalEncoder(), ["EdLevel", "Country", "Age"]),
    ("onehot", OneHotEncoder(sparse=False, handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough")

# Seed the regressor like every other model and splitter in this notebook.
# Without `random_state` the fitted trees can differ between runs, so the
# printed scores would not be reproducible on Restart & Run All.
model = GradientBoostingRegressor(n_estimators=200, random_state=42)

pipe = Pipeline([
    ("preprocess", transform),
    ("model", model)
])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=2)
# The error metrics come from "neg_" scorers, hence the sign flip before printing.
print(f"RMSE: mean: {-1 * np.mean(scores['test_neg_root_mean_squared_error'])} | {-1 * scores['test_neg_root_mean_squared_error']}")
print(f"MAE: mean: {-1 * np.mean(scores['test_neg_mean_absolute_error'])} | {-1 * scores['test_neg_mean_absolute_error']}")
print(f"R2-score: mean: {np.mean(scores['test_r2'])} | {scores['test_r2']}")
print("*" * 69)
print()

RMSE: mean: 37730.7612213689 | [36736.50299898 37845.68564879 37482.05424689 39532.57814741
 37056.98506477]
MAE: mean: 26009.150920067732 | [25515.908245   26423.78759751 25698.72456364 26827.23175549
 25580.10243871]
R2-score: mean: 0.614245085231671 | [0.62474069 0.61661625 0.62152095 0.58376976 0.62457778]
*********************************************************************



In [12]:
# Gradient boosting variant: identical to the gradient boosting baseline,
# except the ordinal codes of EdLevel/Country/Age are additionally max-abs scaled.
label_scaling = Pipeline([
    ("label_encode", OrdinalEncoder()),
    ("scale", MaxAbsScaler())
])

transform = ColumnTransformer([
    ("label", label_scaling, ["EdLevel", "Country", "Age"]),
    ("onehot", OneHotEncoder(sparse=False, handle_unknown="ignore"), ["RemoteWork"]),
    ("scaler", MaxAbsScaler(), ["YearsCodePro"])
], remainder="passthrough")

# Seed the regressor like every other model and splitter in this notebook.
# NOTE(review): with an unseeded model, score differences against the unscaled
# run cannot be attributed to the extra scaling step alone — presumably this is
# why some folds previously differed slightly; confirm after re-running.
model = GradientBoostingRegressor(n_estimators=200, random_state=42)

pipe = Pipeline([
    ("preprocess", transform),
    ("model", model)
])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# `return_estimator=True` also keeps the fitted pipeline of every fold in `scores`.
scores = cross_validate(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=2, return_estimator=True)
# The error metrics come from "neg_" scorers, hence the sign flip before printing.
print(f"RMSE: mean: {-1 * np.mean(scores['test_neg_root_mean_squared_error'])} | {-1 * scores['test_neg_root_mean_squared_error']}")
print(f"MAE: mean: {-1 * np.mean(scores['test_neg_mean_absolute_error'])} | {-1 * scores['test_neg_mean_absolute_error']}")
print(f"R2-score: mean: {np.mean(scores['test_r2'])} | {scores['test_r2']}")
print("*" * 69)
print()

RMSE: mean: 37731.81278252379 | [36737.15440576 37845.68564879 37482.05424689 39532.92471638
 37061.24489479]
MAE: mean: 26010.10769767022 | [25515.908245   26423.78759751 25698.72456364 26828.53321816
 25583.58486405]
R2-score: mean: 0.6142237005706326 | [0.62472738 0.61661625 0.62152095 0.58376246 0.62449146]
*********************************************************************

