In [7]:
# import sklearn.neighbors._base
# import sys
# sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

import datetime
from sklearn import metrics, model_selection, ensemble
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import KBinsDiscretizer
import xgboost as xgb
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [8]:
try:
    from catboost import Pool, CatBoostRegressor, cv
except:
    !pip install catboost 
    from catboost import Pool, CatBoostRegressor, cv

In [9]:
# Import the project-local CustomPipeline helpers. If the module is not on the
# import path, retry after adding '/content/sample_data' to sys.path
# (presumably the Google Colab layout -- confirm).
try:
    from CustomPipeline import *
except ImportError:
    # Only a failed import should trigger the fallback; other errors must surface.
    import sys
    sys.path.insert(0,'/content/sample_data')
    from CustomPipeline import *
    print("ok")

In [10]:
try: 
    import optuna
except:
    !pip install optuna
    import optuna

In [11]:
# Load the cross-validation training set. The file is comma-separated;
# `sep` and `delimiter` are aliases in pandas, so only one may be given
# (passing both raises ValueError in current pandas). The decimal mark is
# left at its default, '.'.
try:
    train = pd.read_csv("./train_for_crossval.csv", sep=",")
except FileNotFoundError:
    # Fall back to the ./sample_data location only when the file is absent
    # from the working directory; parsing errors are not masked.
    train = pd.read_csv("./sample_data/train_for_crossval.csv", sep=",")

In [12]:
# Keep only the rows whose target exceeds 6, then split them into the
# feature matrix and the target vector (the target is scaled by 100).
train_filtered = train.query("target > 6")
X = train_filtered.drop(columns=["target"])
y = train_filtered["target"] * 100

In [13]:
# Partition the columns of `train` by dtype: int/float columns are treated
# as numeric features, object columns as categorical features.
num_train = train.select_dtypes(include=[int, float])
cat_train = train.select_dtypes(include=object)

# Column-name lists handed to the ColumnTransformer; the target itself is
# excluded from the numeric features.
num = num_train.columns.drop('target').tolist()
cat = cat_train.columns.tolist()

In [20]:
# Scorer built from mean_squared_error with squared=False, i.e. it reports RMSE.
rmse = make_scorer(mean_squared_error, squared=False)

# Numeric branch: median imputation -> standardisation -> power transform.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler()),
    ('normal', PowerTransformer()),
    # Binning step, currently disabled:
    # ('bins', KBinsDiscretizer(n_bins = self.n_bins))
]
pipeline_num = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation -> one-hot encoding
# (categories unseen during fit are ignored at transform time).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
]
pipeline_cat = Pipeline(steps=categorical_steps)

# Route the `num` columns through the numeric branch and the `cat` columns
# through the categorical branch; every other column is dropped.
column_branches = [
    ('num', pipeline_num, num),
    ('cat', pipeline_cat, cat),
]
preprocessor = ColumnTransformer(transformers=column_branches, remainder="drop")

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of a Ridge pipeline.

    Samples the Ridge regularisation strength ``alpha`` on a log scale from
    [1e-3, 100], places the model after the module-level ``preprocessor`` and
    scores the pipeline on the module-level ``X`` / ``y`` with the ``rmse``
    scorer.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``alpha``.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """
    param_model = {
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform.
        'alpha': trial.suggest_float('alpha', 1e-3, 100.0, log=True),
    }

    # NOTE: an earlier variant built the preprocessor through CustomPipeline
    # with a tunable `n_bins` (suggested in 50..90); it is not used here.
    pipeline_ridge = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', linear_model.Ridge(**param_model)),
    ])

    # One RMSE value per fold; printed so the spread across folds stays visible.
    fold_rmse = cross_val_score(pipeline_ridge, X, y, cv=5, scoring=rmse)
    print(fold_rmse)

    return fold_rmse.mean()

In [21]:
# Seed the sampler so that a fresh run of the notebook proposes the same
# sequence of alphas (TPE is Optuna's default single-objective sampler).
# The explicit MedianPruner argument is dropped: it is already the default
# pruner, and `objective` never reports intermediate values for it to act on.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5)

[32m[I 2021-08-27 14:22:50,888][0m A new study created in memory with name: no-name-b445a9d7-2e1d-4222-bc37-852a3cdc3548[0m
[32m[I 2021-08-27 14:23:12,610][0m Trial 0 finished with value: 72.93788029344756 and parameters: {'alpha': 0.007009659502738396}. Best is trial 0 with value: 72.93788029344756.[0m


[72.48309911 73.0764727  73.38181441 73.02144378 72.72657146]


[32m[I 2021-08-27 14:23:28,616][0m Trial 1 finished with value: 72.92947686962509 and parameters: {'alpha': 0.05444988082574249}. Best is trial 1 with value: 72.92947686962509.[0m


[72.47568833 73.06331972 73.37181345 73.01985521 72.71670763]


[32m[I 2021-08-27 14:23:44,809][0m Trial 2 finished with value: 72.92914489881133 and parameters: {'alpha': 0.04628995455938748}. Best is trial 2 with value: 72.92914489881133.[0m


[72.47416234 73.06408005 73.37188138 73.01885493 72.71674579]


[32m[I 2021-08-27 14:24:07,497][0m Trial 3 finished with value: 72.93910935926233 and parameters: {'alpha': 0.005398764004963791}. Best is trial 2 with value: 72.92914489881133.[0m


[72.48580695 73.07744541 73.38371294 73.02122862 72.72735288]


[32m[I 2021-08-27 14:24:22,464][0m Trial 4 finished with value: 73.75428583094131 and parameters: {'alpha': 6.86889851354573}. Best is trial 2 with value: 72.92914489881133.[0m


[73.27839186 73.88746484 74.24965357 73.81545753 73.54046136]


[32m[I 2021-08-27 14:24:38,280][0m Trial 5 finished with value: 72.99318318846402 and parameters: {'alpha': 0.5193151900487678}. Best is trial 2 with value: 72.92914489881133.[0m


[72.54671249 73.10237894 73.45462335 73.07460568 72.78759548]


[32m[I 2021-08-27 14:24:54,270][0m Trial 6 finished with value: 72.92966803718078 and parameters: {'alpha': 0.05808422360014425}. Best is trial 2 with value: 72.92914489881133.[0m


[72.47639411 73.06301577 73.37187348 73.02031783 72.716739  ]


[32m[I 2021-08-27 14:25:10,746][0m Trial 7 finished with value: 72.93074474413311 and parameters: {'alpha': 0.024318619209938355}. Best is trial 2 with value: 72.92914489881133.[0m


[72.47202806 73.06777322 73.37431533 73.01889134 72.72071577]


[32m[I 2021-08-27 14:25:26,157][0m Trial 8 finished with value: 73.17591326823636 and parameters: {'alpha': 1.1360667003295353}. Best is trial 2 with value: 72.92914489881133.[0m


[72.71600274 73.27268349 73.65013822 73.23123656 73.00950534]


[32m[I 2021-08-27 14:25:41,630][0m Trial 9 finished with value: 72.9525816386327 and parameters: {'alpha': 0.26745615764303565}. Best is trial 2 with value: 72.92914489881133.[0m


[72.5101524  73.06758945 73.40817154 73.04278914 72.73420567]


In [None]:
# Experiment log: scores recorded for earlier configurations
# (presumably cross-validated RMSE -- confirm).
# 73.63670041849919 with bins
# 73.9315395633810 without bins
# 74.38762034243373 PCA + bins
# 73.61114198701696 bins + cluster
# 73.10832265456563 tunable bins
# 71.47909093792476 kaggle
# 0.7110066356517013 without multiplying by 100
# 71.0780270945487 with multiplying by 100
# lasso with bins

In [None]:
# Hyperparameters of the best trial found by the study; the bare last
# expression displays them as the cell output.
best_trial = study.best_trial
best_params = best_trial.params
best_params