In [2]:
import pandas as pd
import numpy as np
import warnings
from scipy.sparse import load_npz
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [3]:
#General preprocessing
df_preprocessed_general_train = load_npz('./ds_salaries_GeneralPreprocessing_train_stratified.npz')
df_preprocessed_general_test = load_npz('./ds_salaries_GeneralPreprocessing_test_stratified.npz')

#Title Clustering
df_preprocessed_TC_train = load_npz('ds_salaries_Title_Clustering_Preprocessing_train_stratified.npz')
df_preprocessed_TC_test = load_npz('ds_salaries_Title_Clustering_Preprocessing_test_stratified.npz')

#Residence Clustering
df_preprocessed_RC_train = load_npz('ds_salaries_Residence_Clustering_train_stratified.npz')
df_preprocessed_RC_test = load_npz('ds_salaries_Residence_Clustering_test_stratified.npz')

#Feature Elimination
df_preprocessed_FE_train = load_npz('ds_salaries_Feature_Elimination_train_stratified.npz')
df_preprocessed_FE_test = load_npz('ds_salaries_Feature_Elimination_test_stratified.npz')

#Combined Preprocessing
df_preprocessed_CP_train = load_npz('ds_salaries_Combined_Preprocessing_train_stratified.npz')
df_preprocessed_CP_test = load_npz('ds_salaries_Combined_Preprocessing_test_stratified.npz')

#Oversampling
df_preprocessed_oversampling_train = load_npz('./ds_salaries_Oversampling_features_train_stratified.npz')
df_target_oversampling_train = pd.read_csv('./ds_salaries_Oversampling_target_train_stratified.csv')

#General targets
df_target_train = pd.read_csv("./ds_salaries_target_train_stratified.csv")
df_target_test = pd.read_csv("./ds_salaries_target_test_stratified.csv")

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Baseline Model (Linear Regression)

In [4]:
estimator = LinearRegression()

steps = [('estimator', estimator)]

pipeline = Pipeline(steps)


print("Linear Regression: ")
print("--------------------------")

#General Preprocessing
pipeline.fit(df_preprocessed_general_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("General Preprocessing:")
print("R2: ",  r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Title Clustering
pipeline.fit(df_preprocessed_TC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_TC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Job Title Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Residence Clustering
pipeline.fit(df_preprocessed_RC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_RC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Employee Residence Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Feature Elimination
pipeline.fit(df_preprocessed_FE_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_FE_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Feature Elimination:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Combined Preprocessing
pipeline.fit(df_preprocessed_CP_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_CP_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Combined Preprocessing:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Oversampling
pipeline.fit(df_preprocessed_oversampling_train, df_target_oversampling_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Oversampling:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

Linear Regression: 
--------------------------
General Preprocessing:
R2:  0.36846441250265827
MSE:  2465888439.8754754

Job Title Clustering:
R2:  0.3691262052983829
MSE:  2463304409.3997946

Employee Residence Clustering:
R2:  0.4094541265489403
MSE:  2305840353.8743777

Feature Elimination:
R2:  0.37505771542731836
MSE:  2440144285.809756

Combined Preprocessing:
R2:  0.410021285459622
MSE:  2303625830.0548024

Oversampling:
R2:  -0.11011418293801656
MSE:  4334542320.087681



# Gradient Boosting Regression

In [5]:
estimator = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, min_samples_split=20, max_features='log2', subsample=1.0, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

print("Gradient Boosting: ")
print("--------------------------")

#General Preprocessing
pipeline.fit(df_preprocessed_general_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("General Preprocessing:")
print("R2: ",  r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Title Clustering
pipeline.fit(df_preprocessed_TC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_TC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Job Title Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Residence Clustering
pipeline.fit(df_preprocessed_RC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_RC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Employee Residence Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Feature Elimination
pipeline.fit(df_preprocessed_FE_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_FE_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Feature Elimination:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Combined Preprocessing
pipeline.fit(df_preprocessed_CP_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_CP_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Combined Preprocessing:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Oversampling
pipeline.fit(df_preprocessed_oversampling_train, df_target_oversampling_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Oversampling:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

Gradient Boosting: 
--------------------------
General Preprocessing:
R2:  0.3970665821853362
MSE:  2354208653.380199



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Job Title Clustering:
R2:  0.41465830222083044
MSE:  2285520174.1688294

Employee Residence Clustering:
R2:  0.4093101206247737
MSE:  2306402638.1711082



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Feature Elimination:
R2:  0.4101606401837309
MSE:  2303081706.7939363

Combined Preprocessing:
R2:  0.4096070239852304
MSE:  2305243351.85565



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Oversampling:
R2:  -0.03794156517806835
MSE:  4052737735.622269



# eXtreme Boost Regression

In [6]:
estimator = XGBRegressor(colsample_bylevel=0.25, gamma=0, learning_rate=0.05, max_depth=5, subsample=0.5, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

print("Extreme Gradient Boosting: ")
print("--------------------------")

#General Preprocessing
pipeline.fit(df_preprocessed_general_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("General Preprocessing:")
print("R2: ",  r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Title Clustering
pipeline.fit(df_preprocessed_TC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_TC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Job Title Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Residence Clustering
pipeline.fit(df_preprocessed_RC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_RC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Employee Residence Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Feature Elimination
pipeline.fit(df_preprocessed_FE_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_FE_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Feature Elimination:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Combined Preprocessing
pipeline.fit(df_preprocessed_CP_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_CP_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Combined Preprocessing:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Oversampling
pipeline.fit(df_preprocessed_oversampling_train, df_target_oversampling_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Oversampling:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

Extreme Gradient Boosting: 
--------------------------
General Preprocessing:
R2:  0.39992050652482336
MSE:  2343065244.875052

Job Title Clustering:
R2:  0.41110023516320404
MSE:  2299412972.29359

Employee Residence Clustering:
R2:  0.3991586679499395
MSE:  2346039913.2422013

Feature Elimination:
R2:  0.3979280521378028
MSE:  2350844965.1904235

Combined Preprocessing:
R2:  0.4200677436664181
MSE:  2264398515.4833417

Oversampling:
R2:  9.305629113542935e-05
MSE:  3904228078.7593546



# ANNs MLP Regressor

In [7]:
warnings.filterwarnings('ignore')

estimator = MLPRegressor(hidden_layer_sizes= ( 10,), activation='relu', alpha=0.0001, solver='lbfgs', random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

print("ANNs: ")
print("--------------------------")

#General Preprocessing
pipeline.fit(df_preprocessed_general_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("General Preprocessing:")
print("R2: ",  r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Title Clustering
pipeline.fit(df_preprocessed_TC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_TC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Job Title Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Residence Clustering
pipeline.fit(df_preprocessed_RC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_RC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Employee Residence Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Feature Elimination
pipeline.fit(df_preprocessed_FE_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_FE_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Feature Elimination:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Combined Preprocessing
pipeline.fit(df_preprocessed_CP_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_CP_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Combined Preprocessing:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Oversampling
pipeline.fit(df_preprocessed_oversampling_train, df_target_oversampling_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Oversampling:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

ANNs: 
--------------------------
General Preprocessing:
R2:  0.4105396972390639
MSE:  2301601643.8658867

Job Title Clustering:
R2:  0.3828822122047033
MSE:  2409592822.783489

Employee Residence Clustering:
R2:  0.4015323238238814
MSE:  2336771756.8695874

Feature Elimination:
R2:  0.4104281423996816
MSE:  2302037219.935218

Combined Preprocessing:
R2:  0.404919051959294
MSE:  2323547967.2313666

Oversampling:
R2:  -0.07157627428198343
MSE:  4184067532.39048



# DNNs MLP Regressor

In [8]:
warnings.filterwarnings('ignore')

estimator = MLPRegressor(hidden_layer_sizes = (10,10,10,10,10,10,10,10,10,10), solver = "adam", 
                         batch_size = 180, alpha = 0.2, activation = "relu", 
                         learning_rate= "constant", random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

print("DNNs: ")
print("--------------------------")

#General Preprocessing
pipeline.fit(df_preprocessed_general_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("General Preprocessing:")
print("R2: ",  r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Title Clustering
pipeline.fit(df_preprocessed_TC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_TC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Job Title Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Residence Clustering
pipeline.fit(df_preprocessed_RC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_RC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Employee Residence Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Feature Elimination
pipeline.fit(df_preprocessed_FE_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_FE_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Feature Elimination:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Combined Preprocessing
pipeline.fit(df_preprocessed_CP_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_CP_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Combined Preprocessing:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Oversampling
pipeline.fit(df_preprocessed_oversampling_train, df_target_oversampling_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Oversampling:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

DNNs: 
--------------------------
General Preprocessing:
R2:  0.40066558570214117
MSE:  2340156015.107937

Job Title Clustering:
R2:  0.4018618254535832
MSE:  2335485187.6315475

Employee Residence Clustering:
R2:  0.4029618195282295
MSE:  2331190160.199521

Feature Elimination:
R2:  -0.1346470917933169
MSE:  4430333305.648028

Combined Preprocessing:
R2:  0.40941597804666074
MSE:  2305989308.189276

Oversampling:
R2:  -0.09112703095120267
MSE:  4260405249.244242



# SVR

In [9]:
warnings.filterwarnings('ignore')
estimator = SVR(C = 100, gamma = 1, kernel = 'poly')
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

print("SVR: ")
print("--------------------------")

#General Preprocessing
pipeline.fit(df_preprocessed_general_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("General Preprocessing:")
print("R2: ",  r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Title Clustering
pipeline.fit(df_preprocessed_TC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_TC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Job Title Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Residence Clustering
pipeline.fit(df_preprocessed_RC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_RC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Employee Residence Clustering:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Feature Elimination
pipeline.fit(df_preprocessed_FE_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_FE_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Feature Elimination:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Combined Preprocessing
pipeline.fit(df_preprocessed_CP_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_CP_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Combined Preprocessing:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

#Oversampling
pipeline.fit(df_preprocessed_oversampling_train, df_target_oversampling_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Oversampling:")
print("R2: ", r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")

SVR: 
--------------------------
General Preprocessing:
R2:  0.3918529792639589
MSE:  2374565642.643261

Job Title Clustering:
R2:  0.38340698025678743
MSE:  2407543817.9469156

Employee Residence Clustering:
R2:  0.3916095827127415
MSE:  2375516006.7301545

Feature Elimination:
R2:  0.3909753119903495
MSE:  2377992574.7542872

Combined Preprocessing:
R2:  0.38888249949582354
MSE:  2386164152.4757094

Oversampling:
R2:  -0.08604804714092684
MSE:  4240573892.608102

