In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')


df = pd.read_csv("../Dataset/ds_salaries.csv")

In [2]:
df_features = df.copy(deep=True)
target = df_features["salary_in_usd"]
df_features.drop(["salary", "salary_in_usd"], axis=1, inplace=True)

In [4]:
oneHot_features = ["work_year", "employment_type", "remote_ratio", "job_title", "salary_currency", "employee_residence", "company_location","company_size", "experience_level"]
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(), oneHot_features),
    ])
df_preprocessed = preprocessor.fit_transform(df_features)


# init Model

In [5]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [6]:
regressor = MLPRegressor()

parameters = {
    "solver" : ["lbfgs", "sgd", "adam"]
}

#Prüfen welches sinnvoller ist in Bezug auf Regression. Laut ChatGPT ist es KFold.
k_10_fold_cv = KFold(n_splits=10, shuffle=True, random_state=42)
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

grid_search_estimator = GridSearchCV(regressor, parameters, scoring='r2', cv=k_10_fold_cv, return_train_score=False)

grid_search_estimator.fit(df_preprocessed,target)

#results = pd.DataFrame(grid_search_estimator.cv_results_)
    
# print the best parameter setting
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

best score is 0.4077028361552717 with params {'solver': 'lbfgs'}


In [8]:
regressor = MLPRegressor()

parameters = {
    "hidden_layer_sizes" : [(10,), (50,), (100,)],
    "activation": ["relu", "tanh"],
    "solver" : ["lbfgs", "sgd", "adam"]
}

#Prüfen welches sinnvoller ist in Bezug auf Regression. Laut ChatGPT ist es KFold.
k_10_fold_cv = KFold(n_splits=10, shuffle=True, random_state=42)
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

grid_search_estimator = GridSearchCV(regressor, parameters, scoring='r2', cv=k_10_fold_cv, return_train_score=False)

grid_search_estimator.fit(df_preprocessed,target)

#results = pd.DataFrame(grid_search_estimator.cv_results_)
    
# print the best parameter setting
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

results = pd.DataFrame(grid_search_estimator.cv_results_)
print(results)

best score is 0.4203945711812123 with params {'activation': 'relu', 'hidden_layer_sizes': (10,), 'solver': 'lbfgs'}
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.359060      0.056244         0.000356        0.000048   
1        0.172836      0.119280         0.000336        0.000023   
2        0.950425      0.009166         0.000336        0.000023   
3        1.456380      0.352923         0.000498        0.000058   
4        0.382040      0.218482         0.000584        0.000282   
5        2.109464      0.136969         0.000609        0.000135   
6        1.895033      0.053281         0.000708        0.000298   
7        0.544295      0.353388         0.000706        0.000377   
8        3.977745      0.420531         0.000842        0.000466   
9        0.084674      0.031386         0.000320        0.000024   
10       0.093329      0.033609         0.000319        0.000027   
11       1.019714      0.006413         0.000380        0.000014   


In [9]:
regressor = MLPRegressor()

parameters = {
    "hidden_layer_sizes" : [(10,), (50,), (100,)],
    "activation": ["relu", "tanh"],
    "solver" : ["lbfgs", "sgd", "adam"],
    'alpha': [0.0001, 0.001, 0.01]
}

#Prüfen welches sinnvoller ist in Bezug auf Regression. Laut ChatGPT ist es KFold.
k_10_fold_cv = KFold(n_splits=10, shuffle=True, random_state=42)
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

grid_search_estimator = GridSearchCV(regressor, parameters, scoring='r2', cv=k_10_fold_cv, return_train_score=False)

grid_search_estimator.fit(df_preprocessed,target)

#results = pd.DataFrame(grid_search_estimator.cv_results_)
    
# print the best parameter setting
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

results = pd.DataFrame(grid_search_estimator.cv_results_)
print(results)

best score is 0.42103698335464906 with params {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'solver': 'lbfgs'}
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.379525      0.052288         0.000383        0.000091   
1        0.078102      0.063745         0.000365        0.000100   
2        0.988169      0.028414         0.000344        0.000021   
3        1.514751      0.403496         0.000536        0.000097   
4        0.423319      0.306813         0.000781        0.000487   
5        2.113150      0.144979         0.000689        0.000283   
6        1.999705      0.151223         0.001316        0.001748   
7        0.837871      0.447839         0.001034        0.001167   
8        3.902309      0.403383         0.001050        0.000655   
9        0.331322      0.028712         0.000342        0.000013   
10       0.147233      0.144952         0.000333        0.000014   
11       1.078147      0.077091         0.000331  