In [23]:
import datetime
from sklearn import metrics, model_selection, ensemble
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import KBinsDiscretizer
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import mlflow

In [24]:
try: 
    import optuna
except:
    !pip install optuna
    import optuna

In [25]:
# Read the prepared training table, using the `id` column as the index.
train = pd.read_csv(
    filepath_or_buffer="~/Документы/Git/Kaggle_30DaysOfML/data/prepare/prepared_train.csv",
    index_col="id",
)
# Validation split is currently not loaded:
# val = pd.read_csv("./data/prepared/prepared_val.csv", index_col='id')

In [49]:
# Bin count handed to KBinsDiscretizer (`n_bins`) in the numeric preprocessing pipeline.
BINS = 32

In [50]:
# Keep only rows with target above 600 (threshold rationale is not recorded
# here), then separate the feature columns from the label.
train_above_600 = train.query("target > 600")
X = train_above_600.drop(columns=["target"])
y = train_above_600["target"]

In [51]:
# Split the columns of `train` by dtype: int/float columns are treated as
# numeric, object columns as categorical.
num_train = train.select_dtypes(include=[int, float])
cat_train = train.select_dtypes(include=object)

# Column-name lists consumed by the ColumnTransformer; the label is excluded
# from the numeric features.
num = num_train.columns.drop("target").tolist()
cat = cat_train.columns.tolist()

In [60]:
def _root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Computed as sqrt(MSE) rather than via `mean_squared_error(squared=False)`,
    because the `squared` argument is deprecated and removed in newer
    scikit-learn releases. For a single target column (as used in this
    notebook) the two forms give the same value.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Scorer for cross-validation. `make_scorer` defaults to greater_is_better=True,
# so the scores come back as plain (positive) RMSE values.
rmse = make_scorer(_root_mean_squared_error)

# Numeric features: power transform -> standardize -> discretize into BINS bins.
# No imputation step is applied.
pipeline_num = Pipeline(steps=[
    ('normal', PowerTransformer()),
    ('scaling', StandardScaler()),
    ('bins', KBinsDiscretizer(n_bins=BINS)),
])

# Categorical features: one-hot encode; categories unseen during fit are
# ignored at transform time instead of raising.
pipeline_cat = Pipeline(steps=[
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its pipeline; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', pipeline_num, num),
        ('cat', pipeline_cat, cat),
    ],
    remainder="drop",
)


def objective(trial):
    """Optuna objective: cross-validated RMSE of a Ridge regression pipeline.

    Samples the Ridge hyperparameters from `trial`, evaluates the full
    preprocessing + Ridge pipeline with 5-fold cross-validation on the
    notebook-level `X` / `y`, and logs the parameters and scores to MLflow.

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        Mean RMSE over the 5 folds (the study minimizes this value).
    """
    param_model = {
        # Log-uniform over [1, 1000]; `suggest_float(log=True)` replaces the
        # deprecated `suggest_loguniform` with the same distribution.
        'alpha': trial.suggest_float('alpha', 1, 1000.0, log=True),
        # Single-choice categorical: always 42, and it shows up in the trial's
        # recorded parameters alongside `alpha`.
        'random_state': trial.suggest_categorical('random_state', [42]),
    }

    pipeline_ridge = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', linear_model.Ridge(**param_model)),
    ])

    # One RMSE value per fold.
    fold_rmse = cross_val_score(pipeline_ridge, X, y, cv=5, scoring=rmse)
    mean_rmse = fold_rmse.mean()

    # The context manager ends the run even if logging raises; with a bare
    # start_run()/end_run() pair a failure would leave the run active and
    # make the next trial's start_run() fail.
    with mlflow.start_run(run_name='linear'):
        mlflow.log_params(param_model)
        mlflow.log_metric("rmse", mean_rmse)
        # NOTE(review): the metric key is spelled "varianse"; kept unchanged so
        # new runs stay comparable with already-logged ones.
        mlflow.log_metric("varianse", np.var(fold_rmse))

    return mean_rmse

In [59]:
# Seed the sampler so a Restart & Run All proposes the same sequence of trials.
# TPESampler is already Optuna's default single-objective sampler; only the
# seed is new here.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    # NOTE(review): `objective` never reports intermediate values, so this
    # pruner currently has nothing to act on.
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=5)

[32m[I 2021-09-07 18:17:04,327][0m A new study created in memory with name: no-name-669cbea4-64d2-4975-89b2-24169f037f1a[0m
[32m[I 2021-09-07 18:17:09,783][0m Trial 0 finished with value: 71.91576546952292 and parameters: {'alpha': 5.762847441791629, 'random_state': 42}. Best is trial 0 with value: 71.91576546952292.[0m
[32m[I 2021-09-07 18:17:14,756][0m Trial 1 finished with value: 71.96975402765922 and parameters: {'alpha': 638.0273559039456, 'random_state': 42}. Best is trial 0 with value: 71.91576546952292.[0m
[32m[I 2021-09-07 18:17:20,262][0m Trial 2 finished with value: 71.92140109333948 and parameters: {'alpha': 4.022219388431114, 'random_state': 42}. Best is trial 0 with value: 71.91576546952292.[0m
[32m[I 2021-09-07 18:17:25,364][0m Trial 3 finished with value: 71.89943165425491 and parameters: {'alpha': 11.61301735273177, 'random_state': 42}. Best is trial 3 with value: 71.89943165425491.[0m
[32m[I 2021-09-07 18:17:31,036][0m Trial 4 finished with value: 71.

In [54]:
# Hyperparameters of the best trial found by the study; the bare name on the
# last line renders them as the cell output.
best_params = study.best_params
best_params

{'alpha': 57.69636092679144, 'random_state': 42}

# save configs

In [33]:
import yaml
import os
    
    
# Write the tuned hyperparameters as block-style YAML one directory above the
# notebook's working directory.
with open('../params_model_linear.yaml', mode='w') as config_file:
    yaml.dump(best_params, stream=config_file, default_flow_style=False)