In [26]:
from sklearn import metrics, model_selection, ensemble
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import KBinsDiscretizer
import xgboost as xgb
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
import xgboost
from sklearn.cluster import KMeans

import yaml


# Display option: never truncate the column list when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

In [27]:
try: 
    import optuna
except:
    !pip install optuna
    import optuna

In [28]:
try:
    from catboost import Pool, CatBoostRegressor, cv
except:
    !pip install catboost 
    from catboost import Pool, CatBoostRegressor, cv

In [29]:
# Project-local helpers. The star import is kept because the exact set of names
# exported by CustomPipeline is not visible from this notebook.
try:
    from CustomPipeline import *
except ImportError:
    # Module not importable from the current path: retry from the fallback directory.
    import sys

    fallback_dir = '/content/sample_data'
    # Guard against stacking duplicate entries when the cell is re-run.
    if fallback_dir not in sys.path:
        sys.path.insert(0, fallback_dir)
    from CustomPipeline import *
    print("ok")

In [30]:
# Load the training data, falling back to ./sample_data when ./Datasets is absent.
# `delimiter` is an alias of `sep` in pandas, so only one of them may be given:
# recent pandas versions raise ValueError when both are passed. The file is
# comma-separated, hence a single sep=",".
try:
    train = pd.read_csv("./Datasets/train_anomaly.csv", sep=",")
except FileNotFoundError:
    # Only a missing file selects the fallback location; parsing errors must surface.
    train = pd.read_csv("./sample_data/train_anomaly.csv", sep=",")

In [31]:
# Read the hyper-parameters of the linear model from its YAML config and echo them.
# A YAML syntax error is printed instead of raised, in which case `params_linear`
# is left unassigned by this cell.
with open("./Configs/params_model_linear.yml", 'r') as config_file:
    try:
        params_linear = yaml.safe_load(config_file)
    except yaml.YAMLError as parse_error:
        print(parse_error)
    else:
        print(params_linear)

{'alpha': 45.97949266795876}


In [32]:
# Read the XGBoost hyper-parameters from their YAML config and echo them.
# A YAML syntax error is printed instead of raised, in which case `params_xgb`
# is left unassigned by this cell. A missing file is NOT handled here: open()
# raises FileNotFoundError, as the recorded output of this cell shows.
with open("./Configs/params_model_xgb.yml", 'r') as config_file:
    try:
        params_xgb = yaml.safe_load(config_file)
    except yaml.YAMLError as parse_error:
        print(parse_error)
    else:
        print(params_xgb)

FileNotFoundError: [Errno 2] No such file or directory: './Configs/params_model_xgb.yml'

In [34]:
# Load the test set with `id` as the index, so the id is kept out of the feature
# columns but still written to the submission file.
# `delimiter` is an alias of `sep`; passing both raises ValueError on recent pandas,
# so a single sep="," is used. The ./sample_data fallback mirrors the train loader.
try:
    test = pd.read_csv("./Datasets/test_anomaly.csv", sep=",", index_col='id')
except FileNotFoundError:
    test = pd.read_csv("./sample_data/test_anomaly.csv", sep=",", index_col='id')

In [35]:
# Seed passed to every train_test_split call below so the splits are reproducible.
RANDOM_STATE = 42

In [36]:
# Remove the `id` column from the training frame and rescale the target by 100.
# This cell is not idempotent: running it twice fails on the missing `id` column
# (and would otherwise scale the target a second time).
train = train.drop(columns=["id"])
train["target"] = train["target"].mul(100)

In [37]:
# Stage 1: set aside 10% of the rows as `test_ensemble`; `train` is rebound to the
# remaining 90%.
train, test_ensemble = train_test_split(
    train,
    test_size=0.1,
    random_state=RANDOM_STATE,
)

# Stage 2: split what is left 80/20 into the rows used to fit the base estimators
# (`train_estimators`) and the rows reserved for the ensemble (`train_ensemble`).
train_estimators, train_ensemble = train_test_split(
    train,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [38]:
# Training data for the base estimators: the target vector, and every other
# column of `train_estimators` as the feature matrix.
y = train_estimators["target"]
X = train_estimators.drop(columns=["target"])

In [39]:
# Column groups of the base-model feature matrix X, as reported by the project
# helper TypesOfColumns.
# NOTE(review): presumably numeric column names, categorical column names and
# categorical column positions respectively — confirm against CustomPipeline.
num = TypesOfColumns(X).get_num()
cat = TypesOfColumns(X).get_cat()
# `cat_idx` is not referenced again in the cells visible in this notebook.
cat_idx = TypesOfColumns(X).get_cat_idx()

# Linear model (Ridge)

In [42]:
n_bins = 63

# The ridge model is trained only on rows whose target (already multiplied by 100
# earlier in the notebook) is above 600; filter once and derive both X and y from it.
high_target_rows = train_estimators.query("target > 600")
y_lr = high_target_rows["target"]
X_lr = high_target_rows.drop(columns=["target"])

# Ridge regressor configured from the YAML parameters, wrapped by the project's
# LinearWrapper with `n_bins` passed as `bins_linear`.
model = linear_model.Ridge(**params_linear)
lr = LinearWrapper(model, bins_linear=n_bins)

# xgb

In [None]:
# Commented-out hyper-parameter dict (not executed). The live values come from
# `params_xgb`, which the YAML-loading cell reads from ./Configs/params_model_xgb.yml.
# params_xgb = {
#     'tree_method':'gpu_hist',
#     'random_state': 1, 
#     'n_jobs': 4,
#     'booster': 'gbtree',
#     'n_estimators': 10000,
#     'learning_rate': 0.035,
#     'reg_lambda': 1.22,
#     'reg_alpha': 36.04,
#     'subsample': 0.9,
#     'colsample_bytree': 0.11,
#     'max_depth': 3,
#     'min_child_weight': 6
# }
# NOTE(review): `params_xgb` exists only if the YAML cell succeeded; its recorded
# output is a FileNotFoundError, in which case this line raises NameError.
model = xgboost.XGBRegressor(**params_xgb)
# NOTE(review): this rebinds `xgb`, which the import cell bound to the xgboost
# module (`import xgboost as xgb`); from here on `xgb` is the wrapper instance.
xgb = XGBWrapper(model, cat=cat, num=num)

# StackingRegressor

In [None]:
# Fit each base model and store its predictions as a new column named after the
# model's position ('0', '1', ...) in the three frames used by the meta-model.
models = [lr, xgb]

# Freeze the feature matrices BEFORE any prediction column is added. Predicting on
# `frame.drop('target')` inside the loop would feed column '0' (the first model's
# predictions) to the second model, which was never fit on such a column.
# Selecting by X's columns also keeps this cell re-runnable once the prediction
# columns exist.
feature_columns = list(X.columns)
train_ensemble_features = train_ensemble[feature_columns]
test_ensemble_features = test_ensemble[feature_columns]
test_features = test[feature_columns]

for i, model in enumerate(models):
    # Identity check: the ridge wrapper is the one trained on the filtered subset.
    if model is lr:
        model.fit(X_lr, y_lr)
    else:
        model.fit(X, y)

    train_ensemble[str(i)] = model.predict(train_ensemble_features)
    test_ensemble[str(i)] = model.predict(test_ensemble_features)
    test[str(i)] = model.predict(test_features)

In [11]:
# Build the meta-model training matrix and its preprocessing.
# NOTE(review): `df_ensemble` is not defined in any cell visible in this notebook
# (and this cell's execution count is out of order) — it must be created before
# this cell for a clean Restart & Run All; confirm how it is derived.
# NOTE(review): this cell rebinds X, y, num and cat, which earlier cells use for
# the base models; re-running those cells afterwards picks up these values.
X = df_ensemble.drop(columns='target')
y = df_ensemble['target']

# Only the column names are needed, so the dtype-filtered frames are not kept.
num = X.select_dtypes([int, float]).columns.tolist()
cat = X.select_dtypes(object).columns.tolist()

# Scorer wrapping mean_squared_error with squared=False (i.e. RMSE).
rmse = make_scorer(mean_squared_error, squared=False)

# Numeric columns: median imputation, standardisation, then a power transform.
pipeline_num = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler()),
    ('normal', PowerTransformer()),
])
# Object columns: one-hot encoding; categories unseen during fit are ignored.
pipeline_cat = Pipeline(steps=[
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])
# Columns that are neither numeric nor object are passed through unchanged.
preprocessor = ColumnTransformer(
    n_jobs=-1,
    transformers=[
        ('num', pipeline_num, num),
        ('cat', pipeline_cat, cat),
    ],
    remainder="passthrough",
)

# fit_transform fits and transforms in one pass; calling fit() and then
# transform() on the same data performs the transformation twice.
transform = preprocessor.fit_transform(X)


In [44]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of a Lasso meta-model.

    Reads the notebook-level `transform` (preprocessed meta-features), `y`
    (targets) and `rmse` (scorer).

    Args:
        trial: the Optuna trial used to sample `alpha` (log scale, 1e-3 to 1.0)
            and `random_state` (0 or 42).

    Returns:
        The mean of the five per-fold RMSE scores; lower is better.
    """
    param_model = {
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and samples from the same log-uniform range.
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0, log=True),
        # NOTE(review): per the scikit-learn docs, Lasso uses random_state only
        # with selection='random'; with the default selection it has no effect.
        'random_state': trial.suggest_categorical("random_state", [0, 42]),
    }

    # The estimator is a Lasso; a one-step Pipeline around it adds nothing.
    meta_model = linear_model.Lasso(**param_model)

    fold_rmse = cross_val_score(meta_model, transform, y, cv=5, scoring=rmse)
    print(fold_rmse)

    return fold_rmse.mean()

In [None]:
# Minimise the cross-validated RMSE returned by `objective` over 10 trials.
# The sampler is seeded so that repeated runs explore the same parameter values.
# NOTE(review): `objective` never reports intermediate values, so the median
# pruner presumably has nothing to act on — confirm whether it is needed.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=10)

[I 2021-08-30 20:55:01,978] A new study created in memory with name: no-name-9a24571d-6850-4d78-bcb6-ba37cb07191a
  max_iter, tol, rng, random, positive)


In [None]:
# Parameters of the best trial found by the study; displayed as the cell output.
best_params = study.best_params
best_params

# Save predictions

In [None]:
# Refit a Lasso with the tuned parameters on the full preprocessed meta-training
# matrix, then predict the test set through the same fitted preprocessor.
model = linear_model.Lasso(**best_params).fit(transform, y)
test_transformed = preprocessor.transform(test)
pred = model.predict(test_transformed)

In [None]:
# Attach the meta-model predictions to the test frame as the `target` column.
# NOTE(review): the training target was multiplied by 100 earlier in the notebook
# and no inverse scaling is visible before saving — if the meta-model's `y`
# derives from that scaled data, confirm the scale expected in the submission.
test['target'] = pred

In [None]:
# Display the test frame (features, base-model prediction columns, new `target`).
test

In [None]:
# Write only the `target` column; index=True keeps the frame's index (set to `id`
# when the test CSV was loaded) as the first column of the output file.
test[['target']].to_csv('./ensemble_for_submit_with_clusters.csv', index=True)