In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')


df = pd.read_csv("./ds_salaries.csv")

In [2]:
df_features = df.copy(deep=True)
target = df_features["salary_in_usd"]
df_features.drop(["salary", "salary_in_usd"], axis=1, inplace=True)

In [3]:
oneHot_features = ["work_year", "employment_type", "remote_ratio", "job_title", "salary_currency", "employee_residence", "company_location","company_size", "experience_level"]
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(), oneHot_features),
    ])
df_preprocessed = preprocessor.fit_transform(df_features)


# init Model

In [4]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [33]:
regressor = MLPRegressor()

parameters = {
    "hidden_layer_sizes": [(100,100),(100,100,100),(10,10,10,10,10,10,10,10,10)],
    "solver": ["lbfgs", "sgd", "adam"]
}

#Prüfen welches sinnvoller ist in Bezug auf Regression. Laut ChatGPT ist es KFold.
k_10_fold_cv = KFold(n_splits=10, shuffle=True, random_state=42)
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

grid_search_estimator = GridSearchCV(regressor, parameters, scoring='r2', cv=k_10_fold_cv, return_train_score=False)

grid_search_estimator.fit(df_preprocessed,target)

results = pd.DataFrame(grid_search_estimator.cv_results_)
print(results)
    
# print the best parameter setting
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       7.083341      0.229504         0.001964        0.000815   
1       2.142308      3.250260         0.001305        0.000932   
2       9.167933      0.355813         0.001907        0.004686   
3       9.649185      0.256764         0.004894        0.007055   
4      10.863007      0.241316         0.001782        0.004660   
5      11.857682      0.131954         0.001562        0.004686   
6       2.391625      0.714866         0.001563        0.004689   
7       3.654594      2.669266         0.002046        0.004625   
8       6.469462      0.216413         0.000000        0.000000   

               param_hidden_layer_sizes param_solver  \
0                            (100, 100)        lbfgs   
1                            (100, 100)          sgd   
2                            (100, 100)         adam   
3                       (100, 100, 100)        lbfgs   
4                       (100, 100, 100)          

In [34]:
regressor = MLPRegressor()

parameters = {
    "hidden_layer_sizes": [(10,10,10,10,10,10,10,10,10,10),(100,100,100,100,100,100,100,100,100,100),(10,10,10,10,10,10,10,10,10,10,10,10,10,10,10)],
    "solver": ["lbfgs", "sgd", "adam"]
}

#Prüfen welches sinnvoller ist in Bezug auf Regression. Laut ChatGPT ist es KFold.
k_10_fold_cv = KFold(n_splits=10, shuffle=True, random_state=42)
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

grid_search_estimator = GridSearchCV(regressor, parameters, scoring='r2', cv=k_10_fold_cv, return_train_score=False)

grid_search_estimator.fit(df_preprocessed,target)

results = pd.DataFrame(grid_search_estimator.cv_results_)
print(results)
    
# print the best parameter setting
print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       3.007595      0.246308         0.003574        0.006168   
1       3.739567      3.059376         0.001563        0.004690   
2       7.658335      0.809924         0.001411        0.003232   
3      21.463450     11.794150         0.007254        0.007766   
4      29.255870      0.610951         0.005954        0.002481   
5      22.885223      4.246790         0.004817        0.005965   
6       3.820160      1.749571         0.000157        0.000471   
7       7.337113      2.241766         0.000000        0.000000   
8       8.909608      0.559159         0.003129        0.006259   

                            param_hidden_layer_sizes param_solver  \
0           (10, 10, 10, 10, 10, 10, 10, 10, 10, 10)        lbfgs   
1           (10, 10, 10, 10, 10, 10, 10, 10, 10, 10)          sgd   
2           (10, 10, 10, 10, 10, 10, 10, 10, 10, 10)         adam   
3  (100, 100, 100, 100, 100, 100, 100, 100, 100, ... 

In [None]:
# In Train and test split aufteillen

attributes_train, attributes_test, target_train, target_test = train_test_split(df_preprocessed, target, test_size=0.3, random_state=42)

regressor = MLPRegressor()

parameters = {
    "hidden_layer_sizes": [(10,10,10,10,10,10,10,10,10,10),(5, 5, 5, 5, 5, 5, 5, 5, 5, 5), (20, 20, 20, 20, 20, 20, 20, 20, 20, 20)],
    "solver": ["lbfgs", "sgd", "adam"],
    "activation": ["relu", "tanh", "identity", "logistic"],
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
    "batch_size": [8,16,32,64,128,256,512],
    "learning_rate": ["constant", "invscaling", "adaptive"],
    "learning_rate_init": [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
    "power_t": [0.0005, 0.005, 0.05, 0.5, 5.0, 50.0],
    "max_iter": [100,500,1000,10000]
    
}

#Prüfen welches sinnvoller ist in Bezug auf Regression. Laut ChatGPT ist es KFold.
k_10_fold_cv = KFold(n_splits=10, shuffle=True, random_state=42)
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

grid_search_estimator = GridSearchCV(regressor, parameters, scoring='r2', cv=k_10_fold_cv, return_train_score=False)

#Trainieren auf trainingsdaten
grid_search_estimator.fit(attributes_train,target_train)

#Bestes model nutzen
best_model = grid_search_estimator.best_estimator_

#Validate on the test set
validation_score = best_model.score(attributes_test, target_test)

#results = pd.DataFrame(grid_search_estimator.cv_results_)
    
# print the best parameter setting
print("Best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))
print("Test score: {}".format(validation_score))