In [1]:
import datetime
from sklearn import metrics, model_selection, ensemble
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import KBinsDiscretizer
from sklearn import linear_model
from sklearn.metrics import mean_squared_error



In [2]:
try: 
    import optuna
except:
    !pip install optuna
    import optuna

In [3]:
# Load the prepared training table, using its "id" column as the index.
train = pd.read_csv(
    filepath_or_buffer="./Datasets/prepared_train.csv",
    index_col="id",
)

In [4]:
# Number of bins passed to KBinsDiscretizer (n_bins) in the numeric pipeline.
BINS = 16

In [5]:
# Keep only the rows whose target exceeds 6, then split them into the feature
# matrix and the target vector (the target is multiplied by 100).
train_filtered = train.query("target > 6")
X = train_filtered.drop(columns=["target"])
y = train_filtered["target"] * 100

In [6]:
# Partition the training columns by dtype: int/float columns are treated as
# numeric features, object columns as categorical features.
num_train = train.select_dtypes([int, float])
cat_train = train.select_dtypes(object)

# Column-name lists consumed by the ColumnTransformer; the target is not a feature.
num = num_train.columns.drop("target").tolist()
cat = cat_train.columns.tolist()

In [7]:
def _root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error of the predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Build the scorer from an explicit RMSE helper instead of
# `mean_squared_error(..., squared=False)`: the `squared` argument is
# deprecated in recent scikit-learn releases, while this form works on old and
# new versions alike. With make_scorer's defaults the score is the plain
# (positive) RMSE, so lower is better.
rmse = make_scorer(_root_mean_squared_error)

# Numeric features: power-transform, standardise, then discretise into BINS bins.
numeric_steps = [
    ('normal', PowerTransformer()),
    ('scaling', StandardScaler()),
    ('bins', KBinsDiscretizer(n_bins=BINS)),
]
pipeline_num = Pipeline(numeric_steps)

# Categorical features: one-hot encode, ignoring categories not seen during fit.
categorical_steps = [
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
]
pipeline_cat = Pipeline(categorical_steps)

# Route each column group through its own pipeline; any column that is in
# neither list is dropped.
column_routes = [
    ('num', pipeline_num, num),
    ('cat', pipeline_cat, cat),
]
preprocessor = ColumnTransformer(column_routes, remainder="drop")

def objective(trial):
    """Optuna objective: cross-validated score of a Ridge regression pipeline.

    Samples the Ridge ``alpha`` on a log scale from [1e-3, 1.0], fits the
    preprocessing + Ridge pipeline with 5-fold cross-validation on ``X``/``y``
    and scores each fold with the ``rmse`` scorer.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean of the per-fold scores.
    """
    param_model = {
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples from the same range.
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0, log=True),
    }

    pipeline_ridge = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', linear_model.Ridge(**param_model)),
    ])

    # One score per fold; printed so the spread across folds is visible.
    fold_scores = cross_val_score(pipeline_ridge, X, y, cv=5, scoring=rmse)
    print(fold_scores)

    return fold_scores.mean()

In [8]:
# Seed the sampler so the sequence of sampled hyperparameters (and therefore
# the selected best trial) is reproducible when the notebook is re-run.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=5)

[32m[I 2021-09-06 12:44:19,882][0m A new study created in memory with name: no-name-d407f04b-cf6b-47ea-883f-43a4d48cde6c[0m
[32m[I 2021-09-06 12:46:15,178][0m Trial 0 finished with value: 71.79711553017304 and parameters: {'alpha': 0.02087043129109916}. Best is trial 0 with value: 71.79711553017304.[0m


[71.51529143 71.85824933 71.90718661 71.76907965 71.93577064]


[32m[I 2021-09-06 12:47:06,782][0m Trial 1 finished with value: 71.79712574428257 and parameters: {'alpha': 0.010750521629052804}. Best is trial 0 with value: 71.79711553017304.[0m


[71.51528975 71.85824495 71.90718689 71.76913702 71.9357701 ]


[32m[I 2021-09-06 12:47:56,324][0m Trial 2 finished with value: 71.79709059924633 and parameters: {'alpha': 0.04430924237349258}. Best is trial 2 with value: 71.79709059924633.[0m


[71.51529471 71.8582449  71.90706186 71.76908205 71.93576948]


[32m[I 2021-09-06 12:48:55,615][0m Trial 3 finished with value: 71.79709980312006 and parameters: {'alpha': 0.2986790506985575}. Best is trial 2 with value: 71.79709059924633.[0m


[71.51525921 71.85823239 71.90717512 71.76908216 71.93575014]


[32m[I 2021-09-06 12:49:51,864][0m Trial 4 finished with value: 71.79702201908546 and parameters: {'alpha': 0.591165940669679}. Best is trial 4 with value: 71.79702201908546.[0m


[71.51508177 71.85821035 71.90703685 71.76902524 71.9357559 ]


In [9]:
# Hyperparameters of the best trial found by the study; displayed as cell output.
best_trial = study.best_trial
best_params = best_trial.params
best_params

{'alpha': 0.591165940669679}

# Save the best hyperparameters to a YAML config file

In [10]:
import yaml
import os


directory = './'
# Make sure the folder the file is written into exists; checking only the base
# directory would leave 'Configs' missing and make open() fail.
config_dir = os.path.join(directory, 'Configs')
os.makedirs(config_dir, exist_ok=True)

# Persist the tuned hyperparameters in block-style YAML.
config_path = os.path.join(config_dir, 'params_model_linear.yaml')
with open(config_path, 'w') as outfile:
    yaml.dump(best_params, outfile, default_flow_style=False)