In [4]:
import datetime
from sklearn import metrics, model_selection, ensemble
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import KBinsDiscretizer
import xgboost as xgb
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

  import pandas.util.testing as tm


In [6]:
# Import the project-local CustomPipeline helpers. If the module is not on the
# import path, add /content/sample_data (presumably the Colab layout - confirm)
# and retry. Only ImportError triggers the fallback, so genuine errors raised
# while executing CustomPipeline itself are no longer silently retried.
try:
    from CustomPipeline import *
except ImportError:
    import sys
    sys.path.insert(0, '/content/sample_data')
    from CustomPipeline import *
    print("ok")

In [7]:
try: 
    import optuna
except:
    !pip install optuna
    import optuna

In [8]:
# Load the training table, first from the working directory and then from
# ./sample_data if the file is not there.
# The file is comma-separated. `sep` and `delimiter` are aliases in read_csv and
# newer pandas versions raise ValueError when both are given, so only `sep` is
# passed. The '.' previously passed as `sep` was presumably meant as the decimal
# mark (pandas' default) - confirm.
try:
    train = pd.read_csv("./train_for_crossval.csv", sep=",", decimal=".")
except FileNotFoundError:
    train = pd.read_csv("./sample_data/train_for_crossval.csv", sep=",", decimal=".")

In [9]:
# Restrict the data to rows with target > 6, then split into the feature
# frame X (everything except "target") and the response y (target times 100).
train_filtered = train.query("target > 6")
X = train_filtered.drop(columns=["target"])
y = train_filtered["target"] * 100

In [10]:
# Split the columns of `train` by dtype: int/float columns are treated as
# numeric features, object columns as categorical features.
num_train = train.select_dtypes(include=[int, float])
cat_train = train.select_dtypes(include=object)

# Column-name lists consumed by the ColumnTransformer; the target itself is
# excluded from the numeric feature list.
num = num_train.drop(columns=['target']).columns.tolist()
cat = cat_train.columns.tolist()

In [11]:
# Scorer wrapping mean_squared_error with squared=False, i.e. it yields RMSE.
# make_scorer's default sign is kept, so scores are positive error values.
rmse = make_scorer(mean_squared_error, squared=False)

# Numeric features: median imputation -> power transform -> min-max scaling.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('normal', PowerTransformer()),
    ('scaling', MinMaxScaler()),
]
pipeline_num = Pipeline(steps=numeric_steps)

# Categorical features: most-frequent imputation -> one-hot encoding, with
# categories unseen during fit ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
]
pipeline_cat = Pipeline(steps=categorical_steps)

# Route each column list to its pipeline; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', pipeline_num, num),
        ('cat', pipeline_cat, cat),
    ],
    remainder="drop",
)

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of a Ridge pipeline.

    The Ridge ``alpha`` is sampled on a log scale from [1e-3, 100]. The model
    is chained after the module-level ``preprocessor`` and evaluated on the
    module-level ``X`` / ``y`` using the module-level ``rmse`` scorer.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameter.

    Returns
    -------
    float
        Mean of the per-fold scores (lower is better for the RMSE scorer).
    """
    param_model = {
        # suggest_float(log=True) is the supported replacement for the
        # deprecated suggest_loguniform.
        'alpha': trial.suggest_float('alpha', 1e-3, 100.0, log=True),
    }

    pipeline_ridge = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', linear_model.Ridge(**param_model)),
    ])

    # One score per fold; printed so the spread across folds stays visible.
    fold_rmse = cross_val_score(pipeline_ridge, X, y, cv=5, scoring=rmse)
    print(fold_rmse)

    return fold_rmse.mean()

In [12]:
# Minimise the objective over 5 trials. The sampler is seeded so that a fresh
# "Restart & Run All" samples the same alpha values.
# NOTE: objective() never calls trial.report(), so the pruner has nothing to
# act on and no trial is pruned.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=5)

[32m[I 2021-09-02 12:11:40,518][0m A new study created in memory with name: no-name-c1c82b22-e89b-4d9f-afb8-24f7602c3dfe[0m
[32m[I 2021-09-02 12:11:53,302][0m Trial 0 finished with value: 72.95747082386181 and parameters: {'alpha': 0.11576237046851422}. Best is trial 0 with value: 72.95747082386181.[0m


[72.49450489 73.0987833  73.41692086 73.03473225 72.74241282]


[32m[I 2021-09-02 12:12:07,003][0m Trial 1 finished with value: 72.94409470179218 and parameters: {'alpha': 31.067329417208146}. Best is trial 1 with value: 72.94409470179218.[0m


[72.48789707 73.08095326 73.3948419  73.02389375 72.73288752]


[32m[I 2021-09-02 12:12:20,147][0m Trial 2 finished with value: 72.94291242898367 and parameters: {'alpha': 45.97949266795876}. Best is trial 2 with value: 72.94291242898367.[0m


[72.48593972 73.08093899 73.39279912 73.02280737 72.73207695]


[32m[I 2021-09-02 12:12:33,367][0m Trial 3 finished with value: 72.95832641658863 and parameters: {'alpha': 0.0021467819527770846}. Best is trial 2 with value: 72.94291242898367.[0m


[72.49454709 73.10208683 73.4174799  73.03495241 72.74256586]


[32m[I 2021-09-02 12:12:49,090][0m Trial 4 finished with value: 72.95471270549181 and parameters: {'alpha': 0.7591719415648475}. Best is trial 2 with value: 72.94291242898367.[0m


[72.49427205 73.08951047 73.41452013 73.03364753 72.74161334]


In [14]:
# Hyperparameters of the best trial found by the study; the bare name on the
# last line displays them as the cell output.
best_trial = study.best_trial
best_params = best_trial.params
best_params

{'alpha': 45.97949266795876}

# Save the best hyperparameters to a YAML config

In [15]:
import yaml
import os


# Write the tuned hyperparameters to <directory>/params_model_linear.yml.
directory = './Configs'
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# without the check-then-create gap of os.path.exists + os.makedirs.
os.makedirs(directory, exist_ok=True)

# Build the path from `directory` so the folder name is defined in one place.
with open(os.path.join(directory, 'params_model_linear.yml'), 'w') as outfile:
    yaml.dump(best_params, outfile, default_flow_style=False)