In [2]:
import pandas as pd
import numpy as np
import warnings
from scipy.sparse import load_npz
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [3]:
# Load every preprocessing variant produced upstream. Feature matrices are
# SciPy sparse matrices stored as .npz; targets are CSV files read with pandas.
# All paths are relative to the current working directory.

# General preprocessing (train / hold-out test)
df_preprocessed_general_train, df_preprocessed_general_test = (
    load_npz('./ds_salaries_GeneralPreprocessing_train.npz'),
    load_npz('./ds_salaries_GeneralPreprocessing_test.npz'),
)

# Title clustering (train / hold-out test)
df_preprocessed_TC_train, df_preprocessed_TC_test = (
    load_npz('ds_salaries_Title_Clustering_Preprocessing_train.npz'),
    load_npz('ds_salaries_Title_Clustering_Preprocessing_test.npz'),
)

# Residence clustering (train / hold-out test)
df_preprocessed_RC_train, df_preprocessed_RC_test = (
    load_npz('ds_salaries_Residence_Clustering_train.npz'),
    load_npz('ds_salaries_Residence_Clustering_test.npz'),
)

# Feature elimination (train / hold-out test)
df_preprocessed_FE_train, df_preprocessed_FE_test = (
    load_npz('ds_salaries_Feature_Elimination_train.npz'),
    load_npz('ds_salaries_Feature_Elimination_test.npz'),
)

# Combined preprocessing (train / hold-out test)
df_preprocessed_CP_train, df_preprocessed_CP_test = (
    load_npz('ds_salaries_Combined_Preprocessing_train.npz'),
    load_npz('ds_salaries_Combined_Preprocessing_test.npz'),
)

# Oversampling: only a training split is loaded (features + matching target)
df_preprocessed_oversampling_train, df_target_oversampling_train = (
    load_npz('./ds_salaries_Oversampling_features_train.npz'),
    pd.read_csv('./ds_salaries_Oversampling_target_train.csv'),
)

# Targets shared by all non-oversampled variants (train / hold-out test)
df_target_train, df_target_test = (
    pd.read_csv("./ds_salaries_target_train.csv"),
    pd.read_csv("./ds_salaries_target_test.csv"),
)

# Seeded, shuffled 10-fold splitter
kfold = KFold(shuffle=True, n_splits=10, random_state=42)

# Mean Model Prediction

In [10]:
# Naive reference model: predict the training-set mean for every hold-out row.
# (numpy and the sklearn metrics are already imported in the notebook's import cell.)

# Calculate the mean of the training target (pandas Series, one entry per column)
mean_salary_train = df_target_train.mean()

# np.full is meant to receive a scalar fill value. Extract it explicitly rather
# than relying on a length-1 Series being broadcast.
# Assumes the target CSV holds exactly one column -- TODO confirm.
mean_salary_value = float(mean_salary_train.iloc[0])

# Mean Model Prediction
print("Mean Model Prediction: ")
print("--------------------------")
predictions = np.full(shape=len(df_target_test), fill_value=mean_salary_value)
r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)
# NOTE: this label is printed unchanged for output parity; the mean model
# itself does not use any feature matrix.
print("General Preprocessing:")
print("R2: ",  r2_score_holdout)
print("MSE: ", mse_score_holdout)
print("")


Mean Model Prediction: 
--------------------------
General Preprocessing:
R2:  -0.00014811055177710308
MSE:  4374719727.01373



# Baseline Model (Linear Regression)

In [21]:
# Baseline model: LinearRegression wrapped in a single-step Pipeline, fitted
# once per preprocessing variant and scored on the hold-out target.
estimator = LinearRegression()

steps = [('estimator', estimator)]

pipeline = Pipeline(steps)


print("Linear Regression: ")
print("--------------------------")

# One entry per preprocessing variant:
# (printed label, training features, training target, hold-out features)
evaluation_runs = [
    ("General Preprocessing:",
     df_preprocessed_general_train, df_target_train, df_preprocessed_general_test),
    ("Job Title Clustering:",
     df_preprocessed_TC_train, df_target_train, df_preprocessed_TC_test),
    ("Employee Residence Clustering:",
     df_preprocessed_RC_train, df_target_train, df_preprocessed_RC_test),
    ("Feature Elimination:",
     df_preprocessed_FE_train, df_target_train, df_preprocessed_FE_test),
    ("Combined Preprocessing:",
     df_preprocessed_CP_train, df_target_train, df_preprocessed_CP_test),
    # No oversampled test matrix is loaded in this notebook; the oversampled
    # model is scored on the general-preprocessing hold-out features.
    ("Oversampling:",
     df_preprocessed_oversampling_train, df_target_oversampling_train, df_preprocessed_general_test),
]

for run_label, features_train, target_train, features_test in evaluation_runs:
    # Refit the pipeline on this variant's training data
    pipeline.fit(features_train, target_train)
    predictions = pipeline.predict(features_test)

    # Every variant is scored against the same hold-out target
    r2_score_holdout = r2_score(df_target_test, predictions)
    mse_score_holdout = mean_squared_error(df_target_test, predictions)

    print(run_label)
    print("R2: ",  r2_score_holdout)
    print("MSE: ", mse_score_holdout)
    print("")

Linear Regression: 
--------------------------
General Preprocessing:
R2:  0.38425054897723754
MSE:  2430849423.925842

Job Title Clustering:
R2:  0.4077065991625144
MSE:  2338249867.4248195

Employee Residence Clustering:
R2:  0.41350507563198413
MSE:  2315358700.957605

Feature Elimination:
R2:  0.3916643759156496
MSE:  2401581193.296662

Combined Preprocessing:
R2:  0.42876886033725425
MSE:  2255100486.8477845

Oversampling:
R2:  0.11450896790733234
MSE:  3495732495.8693123



# Gradient Boosting Regression

In [22]:
# Gradient boosting with fixed hyper-parameters, fitted once per preprocessing
# variant and scored on the hold-out target.
estimator = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, min_samples_split=20, max_features='log2', subsample=1.0, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

print("Gradient Boosting: ")
print("--------------------------")

# scikit-learn regressors expect y with shape (n_samples,). Passing the
# (n_samples, 1) target frames makes each fit emit a DataConversionWarning, so
# flatten them once here. Assumes each target CSV holds exactly one column --
# TODO confirm.
target_train_1d = df_target_train.to_numpy().ravel()
target_oversampling_train_1d = df_target_oversampling_train.to_numpy().ravel()

# One entry per preprocessing variant:
# (printed label, training features, training target, hold-out features)
evaluation_runs = [
    ("General Preprocessing:",
     df_preprocessed_general_train, target_train_1d, df_preprocessed_general_test),
    ("Job Title Clustering:",
     df_preprocessed_TC_train, target_train_1d, df_preprocessed_TC_test),
    ("Employee Residence Clustering:",
     df_preprocessed_RC_train, target_train_1d, df_preprocessed_RC_test),
    ("Feature Elimination:",
     df_preprocessed_FE_train, target_train_1d, df_preprocessed_FE_test),
    ("Combined Preprocessing:",
     df_preprocessed_CP_train, target_train_1d, df_preprocessed_CP_test),
    # No oversampled test matrix is loaded in this notebook; the oversampled
    # model is scored on the general-preprocessing hold-out features.
    ("Oversampling:",
     df_preprocessed_oversampling_train, target_oversampling_train_1d, df_preprocessed_general_test),
]

for run_label, features_train, target_train, features_test in evaluation_runs:
    # Refit the pipeline on this variant's training data
    pipeline.fit(features_train, target_train)
    predictions = pipeline.predict(features_test)

    # Every variant is scored against the same hold-out target
    r2_score_holdout = r2_score(df_target_test, predictions)
    mse_score_holdout = mean_squared_error(df_target_test, predictions)

    print(run_label)
    print("R2: ",  r2_score_holdout)
    print("MSE: ", mse_score_holdout)
    print("")

Gradient Boosting: 
--------------------------
General Preprocessing:
R2:  0.4290101198238734
MSE:  2254148045.098806

Job Title Clustering:
R2:  0.4328472690658193
MSE:  2238999786.324383

Employee Residence Clustering:
R2:  0.4127900076810612
MSE:  2318181638.9459076

Feature Elimination:
R2:  0.4235109185408955
MSE:  2275857736.0267878

Combined Preprocessing:
R2:  0.416172412926228
MSE:  2304828613.899697

Oversampling:
R2:  0.4274250948032965
MSE:  2260405390.7639585



# eXtreme Gradient Boosting (XGBoost) Regression

In [23]:
# XGBoost regressor with fixed hyper-parameters, fitted once per preprocessing
# variant and scored on the hold-out target.
estimator = XGBRegressor(colsample_bylevel=0.25, gamma=0, learning_rate=0.05, max_depth=5, subsample=0.5, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

print("Extreme Gradient Boosting: ")
print("--------------------------")

# One entry per preprocessing variant:
# (printed label, training features, training target, hold-out features)
evaluation_runs = [
    ("General Preprocessing:",
     df_preprocessed_general_train, df_target_train, df_preprocessed_general_test),
    ("Job Title Clustering:",
     df_preprocessed_TC_train, df_target_train, df_preprocessed_TC_test),
    ("Employee Residence Clustering:",
     df_preprocessed_RC_train, df_target_train, df_preprocessed_RC_test),
    ("Feature Elimination:",
     df_preprocessed_FE_train, df_target_train, df_preprocessed_FE_test),
    ("Combined Preprocessing:",
     df_preprocessed_CP_train, df_target_train, df_preprocessed_CP_test),
    # No oversampled test matrix is loaded in this notebook; the oversampled
    # model is scored on the general-preprocessing hold-out features.
    ("Oversampling:",
     df_preprocessed_oversampling_train, df_target_oversampling_train, df_preprocessed_general_test),
]

for run_label, features_train, target_train, features_test in evaluation_runs:
    # Refit the pipeline on this variant's training data
    pipeline.fit(features_train, target_train)
    predictions = pipeline.predict(features_test)

    # Every variant is scored against the same hold-out target
    r2_score_holdout = r2_score(df_target_test, predictions)
    mse_score_holdout = mean_squared_error(df_target_test, predictions)

    print(run_label)
    print("R2: ",  r2_score_holdout)
    print("MSE: ", mse_score_holdout)
    print("")

Extreme Gradient Boosting: 
--------------------------
General Preprocessing:
R2:  0.42658022214241753
MSE:  2263740770.466818

Job Title Clustering:
R2:  0.4263198258383887
MSE:  2264768760.349049

Employee Residence Clustering:
R2:  0.42196883508646066
MSE:  2281945557.4136076

Feature Elimination:
R2:  0.42596649429361244
MSE:  2266163639.0300574

Combined Preprocessing:
R2:  0.4326314753089392
MSE:  2239851694.724345

Oversampling:
R2:  0.41088448807928457
MSE:  2325704229.8610344



# ANN: MLP Regressor (Single Hidden Layer)

In [24]:
# Shallow neural network: an MLPRegressor with one hidden layer of 10 units,
# fitted once per preprocessing variant and scored on the hold-out target.
estimator = MLPRegressor(hidden_layer_sizes= ( 10,), activation='relu', alpha=0.0001, solver='lbfgs', random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

print("ANNs: ")
print("--------------------------")

# scikit-learn regressors expect y with shape (n_samples,). Passing the
# (n_samples, 1) target frames makes each fit emit a DataConversionWarning, so
# flatten them once here. Assumes each target CSV holds exactly one column --
# TODO confirm.
target_train_1d = df_target_train.to_numpy().ravel()
target_oversampling_train_1d = df_target_oversampling_train.to_numpy().ravel()

# One entry per preprocessing variant:
# (printed label, training features, training target, hold-out features)
evaluation_runs = [
    ("General Preprocessing:",
     df_preprocessed_general_train, target_train_1d, df_preprocessed_general_test),
    ("Job Title Clustering:",
     df_preprocessed_TC_train, target_train_1d, df_preprocessed_TC_test),
    ("Employee Residence Clustering:",
     df_preprocessed_RC_train, target_train_1d, df_preprocessed_RC_test),
    ("Feature Elimination:",
     df_preprocessed_FE_train, target_train_1d, df_preprocessed_FE_test),
    ("Combined Preprocessing:",
     df_preprocessed_CP_train, target_train_1d, df_preprocessed_CP_test),
    # No oversampled test matrix is loaded in this notebook; the oversampled
    # model is scored on the general-preprocessing hold-out features.
    ("Oversampling:",
     df_preprocessed_oversampling_train, target_oversampling_train_1d, df_preprocessed_general_test),
]

# Silence warnings only while this cell trains and scores its models, instead
# of changing the kernel-wide warning filter for every later cell.
# NOTE(review): this still hides any solver warnings (e.g. non-convergence)
# raised during fitting; lift the filter when diagnosing a model.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for run_label, features_train, target_train, features_test in evaluation_runs:
        # Refit the pipeline on this variant's training data
        pipeline.fit(features_train, target_train)
        predictions = pipeline.predict(features_test)

        # Every variant is scored against the same hold-out target
        r2_score_holdout = r2_score(df_target_test, predictions)
        mse_score_holdout = mean_squared_error(df_target_test, predictions)

        print(run_label)
        print("R2: ",  r2_score_holdout)
        print("MSE: ", mse_score_holdout)
        print("")

ANNs: 
--------------------------
General Preprocessing:
R2:  0.4102190162728011
MSE:  2328331372.694442

Job Title Clustering:
R2:  0.4186787884382218
MSE:  2294933970.12965

Employee Residence Clustering:
R2:  0.4005131060957161
MSE:  2366648266.2352386

Feature Elimination:
R2:  0.39481262587131283
MSE:  2389152564.122268

Combined Preprocessing:
R2:  0.43546115736791
MSE:  2228680704.654412

Oversampling:
R2:  0.40432247136721033
MSE:  2351609692.736953



# DNN: MLP Regressor (Ten Hidden Layers)

In [25]:
# Deep neural network: an MLPRegressor with ten hidden layers of 10 units each,
# fitted once per preprocessing variant and scored on the hold-out target.
estimator = MLPRegressor(hidden_layer_sizes = (10,10,10,10,10,10,10,10,10,10), solver = "adam", 
                         batch_size = 180, alpha = 0.2, activation = "relu", 
                         learning_rate= "constant", random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

print("DNNs: ")
print("--------------------------")

# scikit-learn regressors expect y with shape (n_samples,). Passing the
# (n_samples, 1) target frames makes each fit emit a DataConversionWarning, so
# flatten them once here. Assumes each target CSV holds exactly one column --
# TODO confirm.
target_train_1d = df_target_train.to_numpy().ravel()
target_oversampling_train_1d = df_target_oversampling_train.to_numpy().ravel()

# One entry per preprocessing variant:
# (printed label, training features, training target, hold-out features)
evaluation_runs = [
    ("General Preprocessing:",
     df_preprocessed_general_train, target_train_1d, df_preprocessed_general_test),
    ("Job Title Clustering:",
     df_preprocessed_TC_train, target_train_1d, df_preprocessed_TC_test),
    ("Employee Residence Clustering:",
     df_preprocessed_RC_train, target_train_1d, df_preprocessed_RC_test),
    ("Feature Elimination:",
     df_preprocessed_FE_train, target_train_1d, df_preprocessed_FE_test),
    ("Combined Preprocessing:",
     df_preprocessed_CP_train, target_train_1d, df_preprocessed_CP_test),
    # No oversampled test matrix is loaded in this notebook; the oversampled
    # model is scored on the general-preprocessing hold-out features.
    ("Oversampling:",
     df_preprocessed_oversampling_train, target_oversampling_train_1d, df_preprocessed_general_test),
]

# Silence warnings only while this cell trains and scores its models, instead
# of changing the kernel-wide warning filter for every later cell.
# NOTE(review): the recorded run of this cell shows a negative R2 for
# "Feature Elimination" while the other variants score around 0.38-0.44.
# With all warnings hidden, a possible non-convergence warning would not be
# visible -- worth re-checking with the filter lifted.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for run_label, features_train, target_train, features_test in evaluation_runs:
        # Refit the pipeline on this variant's training data
        pipeline.fit(features_train, target_train)
        predictions = pipeline.predict(features_test)

        # Every variant is scored against the same hold-out target
        r2_score_holdout = r2_score(df_target_test, predictions)
        mse_score_holdout = mean_squared_error(df_target_test, predictions)

        print(run_label)
        print("R2: ",  r2_score_holdout)
        print("MSE: ", mse_score_holdout)
        print("")

DNNs: 
--------------------------
General Preprocessing:
R2:  0.4179762086006882
MSE:  2297707607.6708508

Job Title Clustering:
R2:  0.4226002809008812
MSE:  2279452742.045806

Employee Residence Clustering:
R2:  0.42312624905457574
MSE:  2277376330.3148856

Feature Elimination:
R2:  -0.11142758697777677
MSE:  4387682530.699232

Combined Preprocessing:
R2:  0.4400610440138851
MSE:  2210521318.9092517

Oversampling:
R2:  0.3790934259473322
MSE:  2451208661.803419



# SVR

In [26]:
# Support vector regression with a polynomial kernel, fitted once per
# preprocessing variant and scored on the hold-out target.
estimator = SVR(C = 100, gamma = 1, kernel = 'poly')
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

print("SVR: ")
print("--------------------------")

# scikit-learn regressors expect y with shape (n_samples,). Passing the
# (n_samples, 1) target frames makes each fit emit a DataConversionWarning, so
# flatten them once here. Assumes each target CSV holds exactly one column --
# TODO confirm.
target_train_1d = df_target_train.to_numpy().ravel()
target_oversampling_train_1d = df_target_oversampling_train.to_numpy().ravel()

# One entry per preprocessing variant:
# (printed label, training features, training target, hold-out features)
evaluation_runs = [
    ("General Preprocessing:",
     df_preprocessed_general_train, target_train_1d, df_preprocessed_general_test),
    ("Job Title Clustering:",
     df_preprocessed_TC_train, target_train_1d, df_preprocessed_TC_test),
    ("Employee Residence Clustering:",
     df_preprocessed_RC_train, target_train_1d, df_preprocessed_RC_test),
    ("Feature Elimination:",
     df_preprocessed_FE_train, target_train_1d, df_preprocessed_FE_test),
    ("Combined Preprocessing:",
     df_preprocessed_CP_train, target_train_1d, df_preprocessed_CP_test),
    # No oversampled test matrix is loaded in this notebook; the oversampled
    # model is scored on the general-preprocessing hold-out features.
    ("Oversampling:",
     df_preprocessed_oversampling_train, target_oversampling_train_1d, df_preprocessed_general_test),
]

# Silence warnings only while this cell trains and scores its models, instead
# of changing the kernel-wide warning filter for every later cell.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for run_label, features_train, target_train, features_test in evaluation_runs:
        # Refit the pipeline on this variant's training data
        pipeline.fit(features_train, target_train)
        predictions = pipeline.predict(features_test)

        # Every variant is scored against the same hold-out target
        r2_score_holdout = r2_score(df_target_test, predictions)
        mse_score_holdout = mean_squared_error(df_target_test, predictions)

        print(run_label)
        print("R2: ",  r2_score_holdout)
        print("MSE: ", mse_score_holdout)
        print("")

SVR: 
--------------------------
General Preprocessing:
R2:  0.41681724717485824
MSE:  2302282943.8074102

Job Title Clustering:
R2:  0.41491843654209093
MSE:  2309779049.1228757

Employee Residence Clustering:
R2:  0.41682510279230933
MSE:  2302251931.4806857

Feature Elimination:
R2:  0.4167001602472197
MSE:  2302745178.390205

Combined Preprocessing:
R2:  0.41284714926152766
MSE:  2317956055.995361

Oversampling:
R2:  0.41603745845230156
MSE:  2305361385.800111

