In [5]:
import pandas as pd
from scipy.sparse import load_npz
from sklearn.model_selection import KFold 

# df_preprocessed_general = load_npz('./ds_salaries_GeneralPreprocessing.npz')
df_preprocessed_general = load_npz('./ds_salaries_BoWPreprocessing.npz')
df_target = pd.read_csv("./ds_salaries_target.csv")

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Baseline Model (Linear Regression)

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

estimator = LinearRegression()

steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general, df_target, cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general, df_target, cv=kfold, scoring='neg_mean_squared_error')

print("General Preprocessing:")
print("Linear Regression R2: ", np.mean(r2_scores))
print("Linear Regression MSE: ", np.mean(mse_scores))


General Preprocessing:
Linear Regression R2:  0.4078881044892661
Linear Regression MSE:  2350878670.820264


# Gradient Boosting Regression

In [16]:
from sklearn.ensemble import GradientBoostingRegressor

estimator = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, min_samples_split=20, max_features='log2', subsample=1.0)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("General Preprocessing:")
print("Gradient Boosting R2: ", np.mean(r2_scores))
print("Gradient Boosting MSE: ", np.mean(mse_scores))

General Preprocessing:
Gradient Boosting R2:  0.43693558691737433
Gradient Boosting MSE:  2246829544.2235193


# eXtreme Boost Regression

In [17]:
#!pip install xgboost

In [18]:
from xgboost import XGBRegressor

estimator = XGBRegressor(colsample_bylevel=0.25, gamma=0, learning_rate=0.05, max_depth=5, subsample=0.5)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general, df_target, cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general, df_target, cv=kfold, scoring='neg_mean_squared_error')

print("General Preprocessing:")
print("XGBoost R2: ", np.mean(r2_scores))
print("XGBoost MSE: ", np.mean(mse_scores))

General Preprocessing:
XGBoost R2:  0.4269042774935478
XGBoost MSE:  2275072491.208445


# DNNs MLP Regressor

In [20]:
from sklearn.neural_network import MLPRegressor

estimator = MLPRegressor(hidden_layer_sizes = (10,10,10,10,10,10,10,10,10,10), solver = "adam")
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("General Preprocessing:")
print("DNNs MLP Regressor R2: ", np.mean(r2_scores))
print("DNNs MLP Regressor MSE: ", np.mean(mse_scores))



General Preprocessing:
DNNs MLP Regressor R2:  0.43569233810335345
DNNs MLP Regressor MSE:  2249239282.5527124
