In [5]:
import pandas as pd
from scipy.sparse import load_npz
from sklearn.model_selection import KFold 

df_preprocessed_general = load_npz('./ds_salaries_GeneralPreprocessing.npz')
df_target = pd.read_csv("./ds_salaries_target.csv")


kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Baseline Model (Linear Regression)

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

estimator = LinearRegression()

steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general, df_target, cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general, df_target, cv=kfold, scoring='neg_mean_squared_error')

print("Linear Regression R2: ", np.mean(r2_scores))
print("Linear Regression MSE: ", np.mean(mse_scores))


Linear Regression R2:  0.3955730222374845
Linear Regression MSE:  2398880723.6277514


# eXtreme Boost Regression

In [40]:
#!pip install xgboost

In [41]:
from xgboost import XGBRegressor

estimator = XGBRegressor(colsample_bylevel=0.25, gamma=0, learning_rate=0.05, max_depth=5, subsample=0.5)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general, target, cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general, target, cv=kfold, scoring='neg_mean_squared_error')

print("XGBoost R2: ", np.mean(r2_scores))
print("XGBoost MSE: ", np.mean(mse_scores))

XGBoost R2:  0.4269042774935478
XGBoost MSE:  2275072491.208445


# Gradient Boosting Regression

In [1]:
from sklearn.ensemble import GradientBoostingRegressor

estimator = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, min_samples_split=20, max_features='log2', subsample=1.0)
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general, target.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general, target.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("Gradient Boosting R2: ", np.mean(r2_scores))
print("Gradient Boosting MSE: ", np.mean(mse_scores))

NameError: name 'Pipeline' is not defined

# DNNs MLP Regressor

In [8]:
from sklearn.neural_network import MLPRegressor

estimator = MLPRegressor(hidden_layer_sizes = (10,10,10,10,10,10,10,10,10,10), solver = "adam")
steps = [('estimator', estimator)]

pipeline = Pipeline(steps)

r2_scores = cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='r2')
mse_scores = -cross_val_score(pipeline, df_preprocessed_general, df_target.values.ravel(), cv=kfold, scoring='neg_mean_squared_error')

print("DNNs MLP Regressor R2: ", np.mean(r2_scores))
print("DNNs MLP Regressor MSE: ", np.mean(mse_scores))



DNNs MLP Regressor R2:  -0.10691472360418089
DNNs MLP Regressor MSE:  2252277413.515023


