In [3]:
import pandas as pd
import numpy as np
import warnings
from scipy.sparse import load_npz
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [37]:
#General preprocessing
df_preprocessed_general_train = load_npz('./ds_salaries_GeneralPreprocessing_train.npz')
df_preprocessed_general_test = load_npz('./ds_salaries_GeneralPreprocessing_test.npz')

#Title Clustering
df_preprocessed_TC_train = load_npz('ds_salaries_Title_Clustering_Preprocessing_train.npz')
df_preprocessed_TC_test = load_npz('ds_salaries_Title_Clustering_Preprocessing_test.npz')

#Residence Clustering
df_preprocessed_RC_train = load_npz('ds_salaries_Title_Clustering_Preprocessing_train.npz')
df_preprocessed_RC_test = load_npz('ds_salaries_Title_Clustering_Preprocessing_test.npz')

#Feature Elimination
df_preprocessed_FE_train = load_npz('ds_salaries_Feature_Elimination_train.npz')
df_preprocessed_FE_test = load_npz('ds_salaries_Feature_Elimination_test.npz')

#Combined Preprocessing
df_preprocessed_CP_train = load_npz('ds_salaries_Selected_Features_train.npz')
df_preprocessed_CP_test = load_npz('ds_salaries_Selected_Features_test.npz')

#Oversampling
df_preprocessed_oversampling_train = load_npz('./ds_salaries_Oversampling_features_train.npz')
df_target_oversampling_train = pd.read_csv('./ds_salaries_Oversampling_target_train.csv')

#General targets
df_target_train = pd.read_csv("./ds_salaries_target_train.csv")
df_target_test = pd.read_csv("./ds_salaries_target_test.csv")

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Baseline Model (Linear Regression)

In [38]:
estimator = LinearRegression()

steps = [('estimator', estimator)]

pipeline = Pipeline(steps)


#General Preprocessing
pipeline.fit(df_preprocessed_general_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("General Preprocessing:")
print("Linear Regression R2: ",  r2_score_holdout)
print("Linear Regression MSE: ", mse_score_holdout)
print("")

#Title Clustering
pipeline.fit(df_preprocessed_TC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_TC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Job Title Clustering:")
print("Linear Regression R2: ", r2_score_holdout)
print("Linear Regression MSE: ", mse_score_holdout)
print("")

#Residence Clustering
pipeline.fit(df_preprocessed_RC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_RC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Employee Residence Clustering:")
print("Linear Regression R2: ", r2_score_holdout)
print("Linear Regression MSE: ", mse_score_holdout)
print("")

#Feature Elimination
pipeline.fit(df_preprocessed_RC_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_RC_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Employee Residence Clustering:")
print("Linear Regression R2: ", r2_score_holdout)
print("Linear Regression MSE: ", mse_score_holdout)
print("")

#Feature Selection Preprocessing
pipeline.fit(df_preprocessed_FS_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_FS_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Feature Selection Preprocessing:")
print("Linear Regression R2: ", r2_score_holdout)
print("Linear Regression MSE: ", mse_score_holdout)
print("")

#Oversampling
pipeline.fit(df_preprocessed_oversampling_train, df_target_oversampling_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("Oversampling:")
print("Linear Regression R2: ", r2_score_holdout)
print("Linear Regression MSE: ", mse_score_holdout)
print("")

General Preprocessing:
Linear Regression R2:  0.38425054897723754
Linear Regression MSE:  2430849423.925842

Job Title Clustering:
Linear Regression R2:  0.4077065991625144
Linear Regression MSE:  2338249867.4248195

Feature Selection Preprocessing:
Linear Regression R2:  0.42876886033725425
Linear Regression MSE:  2255100486.8477845

Oversampling Preprocessing:
Linear Regression R2:  0.11450896790733234
Linear Regression MSE:  3495732495.8693123



# Gradient Boosting Regression

In [None]:
estimator = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, min_samples_split=20, max_features='log2', subsample=1.0, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("General Preprocessing:")
print("Gradient Boosting R2: ", np.mean(r2_scores))
print("Gradient Boosting MSE: ", np.mean(mse_scores))

r2_scores = cross_val_score(pipeline, df_preprocessed_BoW, df_target.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_BoW, df_target.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("Bag of Words Preprocessing:")
print("Gradient Boosting R2: ", np.mean(r2_scores))
print("Gradient Boosting MSE: ", np.mean(mse_scores))

General Preprocessing:
Gradient Boosting R2:  0.43338823147095235
Gradient Boosting MSE:  2248331731.9132476
Bag of Words Preprocessing:
Gradient Boosting R2:  0.441056136548054
Gradient Boosting MSE:  2216641567.392548


In [None]:
estimator = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, min_samples_split=20, max_features='log2', subsample=1.0, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general_train, df_target_train.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general_train, df_target_train.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("General Preprocessing:")
print("Gradient Boosting R2: ", np.mean(r2_scores))
print("Gradient Boosting MSE: ", np.mean(mse_scores))

r2_scores = cross_val_score(pipeline, df_preprocessed_BoW, df_target.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_BoW, df_target.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("Bag of Words Preprocessing:")
print("Gradient Boosting R2: ", np.mean(r2_scores))
print("Gradient Boosting MSE: ", np.mean(mse_scores))

General Preprocessing:
Gradient Boosting R2:  0.441462930034063
Gradient Boosting MSE:  2244044625.480514
Bag of Words Preprocessing:
Gradient Boosting R2:  0.441056136548054
Gradient Boosting MSE:  2216641567.392548


In [None]:
estimator = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, min_samples_split=20, max_features='log2', subsample=1.0, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

pipeline.fit(df_preprocessed_general_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)
print("General Preprocessing:")
print("Gradient Boosting R2: ", r2_score_holdout)
print("Gradient Boosting MSE: ", mse_score_holdout)

r2_scores = cross_val_score(pipeline, df_preprocessed_BoW, df_target.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_BoW, df_target.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("Bag of Words Preprocessing:")
print("Gradient Boosting R2: ", np.mean(r2_scores))
print("Gradient Boosting MSE: ", np.mean(mse_scores))

General Preprocessing:
Gradient Boosting R2:  0.4290101198238734
Gradient Boosting MSE:  2254148045.098806
Bag of Words Preprocessing:
Gradient Boosting R2:  0.441056136548054
Gradient Boosting MSE:  2216641567.392548


# eXtreme Boost Regression

In [31]:
estimator = XGBRegressor(colsample_bylevel=0.25, gamma=0, learning_rate=0.05, max_depth=5, subsample=0.5, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

pipeline.fit(df_preprocessed_general_train, df_target_train)
pred = pipeline.predict(df_preprocessed_general_test)
r2 = r2_score(df_target_test, pred)
print("General Preprocessing: ", r2)


pipeline.fit(df_preprocessed_BoW_train, df_target_train)
pred = pipeline.predict(df_preprocessed_BoW_test)
r2 = r2_score(df_target_test, pred)

print("Bag-of-Words: ", r2)

# r2_scores = cross_val_score(pipeline, df_preprocessed_general, df_target, cv=kfold, scoring='r2')
# mse_scores = -cross_val_score(pipeline, df_preprocessed_general, df_target, cv=kfold, scoring='neg_mean_squared_error')

# print("General Preprocessing:")
# print("XGBoost R2: ", np.mean(r2_scores))
# print("XGBoost MSE: ", np.mean(mse_scores))

# r2_scores = cross_val_score(pipeline, df_preprocessed_BoW, df_target, cv=kfold, scoring='r2')
# mse_scores = -cross_val_score(pipeline, df_preprocessed_BoW, df_target, cv=kfold, scoring='neg_mean_squared_error')

# print("Bag of Words Preprocessing:")
# print("XGBoost R2: ", np.mean(r2_scores))
# print("XGBoost MSE: ", np.mean(mse_scores))

General Preprocessing:  0.42658022214241753
Bag-of-Words:  0.4263198258383887


# ANNs MLP Regressor

In [None]:

warnings.filterwarnings('ignore')

estimator = MLPRegressor(hidden_layer_sizes= ( 10,), activation='relu', alpha=0.0001, solver='lbfgs', random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("General Preprocessing:")
print("ANNs MLP Regressor R2: ", np.mean(r2_scores))
print("ANNs MLP Regressor MSE: ", np.mean(mse_scores))

r2_scores = cross_val_score(pipeline, df_preprocessed_BoW, df_target, cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_BoW, df_target, cv=kfold, scoring='neg_mean_squared_error')

print("Bag of Words Preprocessing:")
print("ANNs MLP Regressor R2: ", np.mean(r2_scores))
print("ANNs MLP Regressor MSE: ", np.mean(mse_scores))

General Preprocessing:
ANNs MLP Regressor R2:  0.41959051369265277
ANNs MLP Regressor MSE:  2303145838.5044823
Bag of Words Preprocessing:
ANNs MLP Regressor R2:  0.4235546823929238
ANNs MLP Regressor MSE:  2289061573.167543


# DNNs MLP Regressor

In [None]:
warnings.filterwarnings('ignore')

estimator = MLPRegressor(hidden_layer_sizes = (10,10,10,10,10,10,10,10,10,10), solver = "adam", 
                         batch_size = 180, alpha = 0.2, activation = "relu", 
                         learning_rate= "constant", random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("General Preprocessing:")
print("DNNs MLP Regressor R2: ", np.mean(r2_scores))
print("DNNs MLP Regressor MSE: ", np.mean(mse_scores))

r2_scores = cross_val_score(pipeline, df_preprocessed_BoW, df_target.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_BoW, df_target.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("Bag of Words Preprocessing:")
print("DNNs MLP Regressor R2: ", np.mean(r2_scores))
print("DNNs MLP Regressor MSE: ", np.mean(mse_scores))

General Preprocessing:
DNNs MLP Regressor R2:  0.4346433555749029
DNNs MLP Regressor MSE:  2244625276.584117
Bag of Words Preprocessing:
DNNs MLP Regressor R2:  0.44003892301282443
DNNs MLP Regressor MSE:  2223723372.941045


# SVR

In [18]:
warnings.filterwarnings('ignore')
estimator = SVR(C = 100, gamma = 1, kernel = 'poly')
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

pipeline.fit(df_preprocessed_general_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_general_test)

# General Preprocessing
r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)
print("General Preprocessing:")
print("Gradient Boosting R2: ", r2_score_holdout)
print("Gradient Boosting MSE: ", mse_score_holdout)

# BoW
#r2_scores = cross_val_score(pipeline, df_preprocessed_BoW, df_target_test, cv=kfold, scoring='r2')
#mse_scores = -cross_val_score(pipeline, df_preprocessed_BoW, df_target_test, cv=kfold, scoring='neg_mean_squared_error')

#print("Bag of Words Preprocessing:")
#print("Linear Regression R2: ", np.mean(r2_scores))
#print("Linear Regression MSE: ", np.mean(mse_scores))

# Oversampling

pipeline.fit(df_preprocessed_oversampling_train, df_target_oversampling_train)
predictions = pipeline.predict(df_preprocessed_general_test)

# General Preprocessing
r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)
print("Oversampling:")
print("Gradient Boosting R2: ", r2_score_holdout)
print("Gradient Boosting MSE: ", mse_score_holdout)

General Preprocessing:
Gradient Boosting R2:  0.41681724717485824
Gradient Boosting MSE:  2302282943.8074102
Oversampling:
Gradient Boosting R2:  0.41603745845230156
Gradient Boosting MSE:  2305361385.800111
