In [83]:
import pandas as pd
from scipy.sparse import load_npz
from sklearn.model_selection import KFold 

# Sparse feature matrices, read from .npz files in the working directory.
(
    df_preprocessed_general_train,
    df_preprocessed_general_test,
    df_preprocessed_BoW,
) = (
    load_npz(npz_path)
    for npz_path in (
        './ds_salaries_GeneralPreprocessing_train.npz',
        './ds_salaries_GeneralPreprocessing_test.npz',
        './ds_salaries_BoWPreprocessing.npz',
    )
)

# Target tables for the train / test split, read from CSV.
df_target_train, df_target_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./ds_salaries_target_train.csv",
        "./ds_salaries_target_test.csv",
    )
)

# NOTE(review): later cells also read `df_preprocessed_general` and `df_target`,
# which this cell does not create -- confirm where they are loaded, otherwise
# Restart & Run All will stop with a NameError.

# Shared 10-fold splitter; shuffling with a fixed seed yields the same folds on every use.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Baseline Model (Linear Regression)

In [93]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_validate, KFold
import numpy as np

# Baseline: ordinary least-squares regression, cross-validated with the shared `kfold`.
# NOTE(review): `df_preprocessed_general` and `df_target` are not created by the
# loading cell at the top of the notebook -- confirm they exist on a fresh kernel.
estimator = LinearRegression()

steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

# cross_validate fits each fold once and scores it with both metrics; calling
# cross_val_score once per metric refits every fold just to get the second number.
cv_scoring = ('r2', 'neg_mean_squared_error')

for cv_label, cv_features in (
    ("General Preprocessing:", df_preprocessed_general),
    ("Bag of Words Preprocessing:", df_preprocessed_BoW),
):
    cv_results = cross_validate(pipeline, cv_features, df_target, cv=kfold, scoring=cv_scoring)
    r2_scores = cv_results['test_r2']
    # The scorer reports negated MSE (higher is better), so flip the sign back.
    mse_scores = -cv_results['test_neg_mean_squared_error']

    print(cv_label)
    print("Linear Regression R2: ", np.mean(r2_scores))
    print("Linear Regression MSE: ", np.mean(mse_scores))



General Preprocessing:
Linear Regression R2:  0.3955740773866091
Linear Regression MSE:  2398876426.349582
Bag of Words Preprocessing:
Linear Regression R2:  0.40788955832863866
Linear Regression MSE:  2350872916.9588165


In [96]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_validate, KFold
import numpy as np

# Baseline linear regression, cross-validated with the shared `kfold`.
estimator = LinearRegression()

steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

# cross_validate fits each fold once and scores it with both metrics; calling
# cross_val_score once per metric refits every fold just to get the second number.
cv_scoring = ('r2', 'neg_mean_squared_error')

# NOTE(review): the general features are evaluated on the train split, while the
# Bag-of-Words features still use `df_target`, which the loading cell does not
# create -- confirm this asymmetry is intended before comparing the two results.
for cv_label, cv_features, cv_target in (
    ("General Preprocessing:", df_preprocessed_general_train, df_target_train),
    ("Bag of Words Preprocessing:", df_preprocessed_BoW, df_target),
):
    cv_results = cross_validate(pipeline, cv_features, cv_target, cv=kfold, scoring=cv_scoring)
    r2_scores = cv_results['test_r2']
    # The scorer reports negated MSE (higher is better), so flip the sign back.
    mse_scores = -cv_results['test_neg_mean_squared_error']

    print(cv_label)
    print("Linear Regression R2: ", np.mean(r2_scores))
    print("Linear Regression MSE: ", np.mean(mse_scores))



General Preprocessing:
Linear Regression R2:  0.4215832371895859
Linear Regression MSE:  2320066867.7581606
Bag of Words Preprocessing:
Linear Regression R2:  0.40788955832863866
Linear Regression MSE:  2350872916.9588165


In [85]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_validate, KFold
import numpy as np

estimator = LinearRegression()

steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

# Hold-out evaluation: fit on the train split, then score predictions on the test split.
pipeline.fit(df_preprocessed_general_train, df_target_train)
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)

print("General Preprocessing:")
print("Linear Regression R2: ",  r2_score_holdout)
print("Linear Regression MSE: ", mse_score_holdout)

# The Bag-of-Words features are still cross-validated rather than scored on a hold-out set.
# cross_validate fits each fold once for both metrics instead of refitting per metric.
# NOTE(review): `df_target` is not created by the loading cell -- confirm it exists.
cv_results = cross_validate(
    pipeline,
    df_preprocessed_BoW,
    df_target,
    cv=kfold,
    scoring=('r2', 'neg_mean_squared_error'),
)
r2_scores = cv_results['test_r2']
# The scorer reports negated MSE (higher is better), so flip the sign back.
mse_scores = -cv_results['test_neg_mean_squared_error']

print("Bag of Words Preprocessing:")
print("Linear Regression R2: ", np.mean(r2_scores))
print("Linear Regression MSE: ", np.mean(mse_scores))



General Preprocessing:
Linear Regression R2:  0.38425054897723754
Linear Regression MSE:  2430849423.925842
Bag of Words Preprocessing:
Linear Regression R2:  0.40788955832863866
Linear Regression MSE:  2350872916.9588165


# Gradient Boosting Regression

In [88]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate

# Gradient boosting with fixed hyper-parameters; the seed makes repeated runs repeatable.
estimator = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, min_samples_split=20, max_features='log2', subsample=1.0, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

# Flatten the target frame to 1-D once instead of on every call.
# NOTE(review): `df_preprocessed_general` and `df_target` are not created by the
# loading cell at the top of the notebook -- confirm they exist on a fresh kernel.
cv_target = df_target.values.ravel()

# cross_validate fits each fold once and scores it with both metrics; calling
# cross_val_score once per metric trains every boosted model twice.
cv_scoring = ('r2', 'neg_mean_squared_error')

for cv_label, cv_features in (
    ("General Preprocessing:", df_preprocessed_general),
    ("Bag of Words Preprocessing:", df_preprocessed_BoW),
):
    cv_results = cross_validate(pipeline, cv_features, cv_target, cv=kfold, scoring=cv_scoring)
    r2_scores = cv_results['test_r2']
    # The scorer reports negated MSE (higher is better), so flip the sign back.
    mse_scores = -cv_results['test_neg_mean_squared_error']

    print(cv_label)
    print("Gradient Boosting R2: ", np.mean(r2_scores))
    print("Gradient Boosting MSE: ", np.mean(mse_scores))

General Preprocessing:
Gradient Boosting R2:  0.43338823147095235
Gradient Boosting MSE:  2248331731.9132476
Bag of Words Preprocessing:
Gradient Boosting R2:  0.441056136548054
Gradient Boosting MSE:  2216641567.392548


In [97]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate

# Gradient boosting with fixed hyper-parameters; the seed makes repeated runs repeatable.
estimator = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, min_samples_split=20, max_features='log2', subsample=1.0, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

# cross_validate fits each fold once and scores it with both metrics; calling
# cross_val_score once per metric trains every boosted model twice.
cv_scoring = ('r2', 'neg_mean_squared_error')

# NOTE(review): the general features are evaluated on the train split, while the
# Bag-of-Words features still use `df_target`, which the loading cell does not
# create -- confirm this asymmetry is intended before comparing the two results.
for cv_label, cv_features, cv_target in (
    ("General Preprocessing:", df_preprocessed_general_train, df_target_train.values.ravel()),
    ("Bag of Words Preprocessing:", df_preprocessed_BoW, df_target.values.ravel()),
):
    cv_results = cross_validate(pipeline, cv_features, cv_target, cv=kfold, scoring=cv_scoring)
    r2_scores = cv_results['test_r2']
    # The scorer reports negated MSE (higher is better), so flip the sign back.
    mse_scores = -cv_results['test_neg_mean_squared_error']

    print(cv_label)
    print("Gradient Boosting R2: ", np.mean(r2_scores))
    print("Gradient Boosting MSE: ", np.mean(mse_scores))

General Preprocessing:
Gradient Boosting R2:  0.441462930034063
Gradient Boosting MSE:  2244044625.480514
Bag of Words Preprocessing:
Gradient Boosting R2:  0.441056136548054
Gradient Boosting MSE:  2216641567.392548


In [98]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate

# Gradient boosting with fixed hyper-parameters; the seed makes repeated runs repeatable.
estimator = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, min_samples_split=20, max_features='log2', subsample=1.0, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

# Hold-out evaluation: fit on the train split, then score predictions on the test split.
# The target is flattened to 1-D, matching every other Gradient Boosting call in
# this notebook; passing the frame itself hands the regressor a column vector.
pipeline.fit(df_preprocessed_general_train, df_target_train.values.ravel())
predictions = pipeline.predict(df_preprocessed_general_test)

r2_score_holdout = r2_score(df_target_test, predictions)
mse_score_holdout = mean_squared_error(df_target_test, predictions)
print("General Preprocessing:")
print("Gradient Boosting R2: ", r2_score_holdout)
print("Gradient Boosting MSE: ", mse_score_holdout)

# The Bag-of-Words features are still cross-validated rather than scored on a hold-out set.
# cross_validate fits each fold once for both metrics instead of refitting per metric.
# NOTE(review): `df_target` is not created by the loading cell -- confirm it exists.
cv_results = cross_validate(
    pipeline,
    df_preprocessed_BoW,
    df_target.values.ravel(),
    cv=kfold,
    scoring=('r2', 'neg_mean_squared_error'),
)
r2_scores = cv_results['test_r2']
# The scorer reports negated MSE (higher is better), so flip the sign back.
mse_scores = -cv_results['test_neg_mean_squared_error']

print("Bag of Words Preprocessing:")
print("Gradient Boosting R2: ", np.mean(r2_scores))
print("Gradient Boosting MSE: ", np.mean(mse_scores))

General Preprocessing:
Gradient Boosting R2:  0.4290101198238734
Gradient Boosting MSE:  2254148045.098806
Bag of Words Preprocessing:
Gradient Boosting R2:  0.441056136548054
Gradient Boosting MSE:  2216641567.392548


# eXtreme Gradient Boosting (XGBoost) Regression

In [7]:
# %pip install xgboost  # run once if xgboost is missing; %pip installs into this kernel's environment

In [14]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate

# XGBoost regressor with fixed hyper-parameters; the seed fixes its row/column subsampling.
estimator = XGBRegressor(colsample_bylevel=0.25, gamma=0, learning_rate=0.05, max_depth=5, subsample=0.5, random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

# cross_validate fits each fold once and scores it with both metrics; calling
# cross_val_score once per metric trains every boosted model twice.
# NOTE(review): `df_preprocessed_general` and `df_target` are not created by the
# loading cell at the top of the notebook -- confirm they exist on a fresh kernel.
cv_scoring = ('r2', 'neg_mean_squared_error')

for cv_label, cv_features in (
    ("General Preprocessing:", df_preprocessed_general),
    ("Bag of Words Preprocessing:", df_preprocessed_BoW),
):
    cv_results = cross_validate(pipeline, cv_features, df_target, cv=kfold, scoring=cv_scoring)
    r2_scores = cv_results['test_r2']
    # The scorer reports negated MSE (higher is better), so flip the sign back.
    mse_scores = -cv_results['test_neg_mean_squared_error']

    print(cv_label)
    print("XGBoost R2: ", np.mean(r2_scores))
    print("XGBoost MSE: ", np.mean(mse_scores))

General Preprocessing:
XGBoost R2:  0.42545052868197386
XGBoost MSE:  2281175724.9420257
Bag of Words Preprocessing:
XGBoost R2:  0.43580347673716757
XGBoost MSE:  2239341629.5901804


# ANNs MLP Regressor

In [15]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_validate
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# any convergence warnings raised while fitting -- consider a narrower filter.
warnings.filterwarnings('ignore')

# Shallow network: a single hidden layer of 10 ReLU units, trained with L-BFGS.
estimator = MLPRegressor(hidden_layer_sizes= ( 10,), activation='relu', alpha=0.0001, solver='lbfgs', random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

# Use the same flattened 1-D target for both feature sets; previously only the
# general run flattened it and the Bag-of-Words run passed the frame itself.
# NOTE(review): `df_preprocessed_general` and `df_target` are not created by the
# loading cell at the top of the notebook -- confirm they exist on a fresh kernel.
cv_target = df_target.values.ravel()

# cross_validate fits each fold once and scores it with both metrics; calling
# cross_val_score once per metric trains every network twice.
cv_scoring = ('r2', 'neg_mean_squared_error')

for cv_label, cv_features in (
    ("General Preprocessing:", df_preprocessed_general),
    ("Bag of Words Preprocessing:", df_preprocessed_BoW),
):
    cv_results = cross_validate(pipeline, cv_features, cv_target, cv=kfold, scoring=cv_scoring)
    r2_scores = cv_results['test_r2']
    # The scorer reports negated MSE (higher is better), so flip the sign back.
    mse_scores = -cv_results['test_neg_mean_squared_error']

    print(cv_label)
    print("ANNs MLP Regressor R2: ", np.mean(r2_scores))
    print("ANNs MLP Regressor MSE: ", np.mean(mse_scores))

General Preprocessing:
ANNs MLP Regressor R2:  0.41959051369265277
ANNs MLP Regressor MSE:  2303145838.5044823
Bag of Words Preprocessing:
ANNs MLP Regressor R2:  0.4235546823929238
ANNs MLP Regressor MSE:  2289061573.167543


# DNNs MLP Regressor

In [8]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_validate
# Imported here so the cell does not depend on another cell having imported it first.
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# any convergence warnings raised while fitting -- consider a narrower filter.
warnings.filterwarnings('ignore')

# Deep network: ten hidden layers of 10 ReLU units each, trained with Adam.
estimator = MLPRegressor(hidden_layer_sizes = (10,10,10,10,10,10,10,10,10,10), solver = "adam", 
                         batch_size = 180, alpha = 0.2, activation = "relu", 
                         learning_rate= "constant", random_state = 42)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

# Flatten the target frame to 1-D once instead of on every call.
# NOTE(review): `df_preprocessed_general` and `df_target` are not created by the
# loading cell at the top of the notebook -- confirm they exist on a fresh kernel.
cv_target = df_target.values.ravel()

# cross_validate fits each fold once and scores it with both metrics; calling
# cross_val_score once per metric trains every network twice.
cv_scoring = ('r2', 'neg_mean_squared_error')

for cv_label, cv_features in (
    ("General Preprocessing:", df_preprocessed_general),
    ("Bag of Words Preprocessing:", df_preprocessed_BoW),
):
    cv_results = cross_validate(pipeline, cv_features, cv_target, cv=kfold, scoring=cv_scoring)
    r2_scores = cv_results['test_r2']
    # The scorer reports negated MSE (higher is better), so flip the sign back.
    mse_scores = -cv_results['test_neg_mean_squared_error']

    print(cv_label)
    print("DNNs MLP Regressor R2: ", np.mean(r2_scores))
    print("DNNs MLP Regressor MSE: ", np.mean(mse_scores))

General Preprocessing:
DNNs MLP Regressor R2:  0.4346433555749029
DNNs MLP Regressor MSE:  2244625276.584117
Bag of Words Preprocessing:
DNNs MLP Regressor R2:  0.44003892301282443
DNNs MLP Regressor MSE:  2223723372.941045
