In [None]:
# %%capture
# !pip install pycaret

In [None]:
import numpy as np
import pandas as pd
import copy
import random
import os
import time

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# from pycaret.regression import *

import warnings
warnings.filterwarnings('ignore')

In [None]:
def set_seed(seed = 0):
    """Seed the global random number generators and return a dedicated RandomState.

    Seeds the stdlib ``random`` module and NumPy's global legacy generator, and
    writes ``PYTHONHASHSEED`` into ``os.environ``.

    Args:
        seed: Integer seed applied to every generator (default 0).

    Returns:
        A fresh ``np.random.RandomState`` initialised with ``seed``.
    """
    # NOTE(review): setting PYTHONHASHSEED here is only visible to processes
    # started afterwards; the running interpreter's hashing is presumably
    # already fixed at startup -- confirm if hash determinism matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    return np.random.RandomState(seed)


# Single seed shared by every stochastic step of the notebook.
SEED = 912
random_state = set_seed(SEED)

In [None]:
class ContinuousStratifiedKFold(StratifiedKFold):
    """StratifiedKFold variant that stratifies on a binned continuous target.

    The target is cut into equal-width bins and the resulting bin codes are
    passed to ``StratifiedKFold.split`` in place of the raw target values.
    """

    def split(self, x, y, groups=None):
        """Delegate to the parent ``split`` using bin codes of ``y`` as labels.

        Args:
            x: Samples to split; forwarded unchanged to the parent.
            y: Continuous target values; only its length and values are used
               to build the bins.
            groups: Forwarded unchanged to the parent.

        Returns:
            Whatever ``StratifiedKFold.split`` returns for ``(x, bin_codes, groups)``.
        """
        # The bin count grows with the sample size as floor(1 + log2(n)).
        raw_bin_count = 1 + np.log2(len(y))
        bin_count = int(np.floor(raw_bin_count))
        # labels=False makes pd.cut return integer bin codes instead of intervals.
        strata = pd.cut(y, bins=bin_count, labels=False)
        return super().split(x, strata, groups)

In [None]:
# Load the meta-training table, discard every row whose target is exactly 0,
# and renumber the surviving rows 0..n-1 so later positional indices line up
# with the index labels.
train = pd.read_csv('../input/meta-train-data/original_meta_train_data.csv', low_memory=False)
zero_target_rows = train.index[train['target'] == 0]
train = train.drop(index=zero_target_rows).reset_index(drop=True)

# train['avg'] = [np.mean([row['m1'], row['m2'], row['m3']]) for _, row in train.iterrows()]
# train['error'] = [row['target'] - row['avg'] for _, row in train.iterrows()]
# res = pd.DataFrame({'avg': train['avg'], 'target': train['error']})
# res.reset_index(drop=True, inplace=True)

# kf = ContinuousStratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# for f, (t_, v_) in enumerate(kf.split(res, res.target)):
#     res.loc[v_, 'fold'] = f
# res['fold'] = res['fold'].astype(int)

# Tag every row of `train` with a validation-fold id (0..4), stratified on the
# binned target.
kf = ContinuousStratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# split() yields *positional* row numbers. Collect the fold ids in a positional
# integer array rather than writing through label-based `.loc`, so the result
# does not depend on `train` having a 0..n-1 index and the column is created
# as an integer column directly (no float intermediate to cast afterwards).
fold_ids = np.full(len(train), -1, dtype=int)
for f, (t_, v_) in enumerate(kf.split(train, train.target)):
    fold_ids[v_] = f

# Every row must land in exactly one validation fold.
assert (fold_ids >= 0).all(), "some rows were not assigned to a fold"
train['fold'] = fold_ids

In [None]:
# def get_data(data, fold):
#     X_train = data.loc[data.fold != fold, 'avg'].tolist()
#     y_train = data.loc[data.fold != fold, 'target'].values
#     X_val = data.loc[data.fold == fold, 'avg'].tolist()
#     y_val = data.loc[data.fold == fold, 'target'].values
#     return X_train, y_train, X_val, y_val

def get_data(data, fold, features=('m1', 'm2', 'm3')):
    """Split ``data`` into train/validation parts for one fold.

    Rows whose ``fold`` column equals ``fold`` form the validation part; all
    other rows form the training part.

    Args:
        data: DataFrame with a ``fold`` column, a ``target`` column and the
            feature columns named in ``features``.
        fold: Fold id to hold out for validation.
        features: Names of the feature columns to return
            (defaults to ``'m1'``, ``'m2'``, ``'m3'``).

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val)`` where the ``X_*`` items are
        DataFrames restricted to ``features`` and the ``y_*`` items are the
        ``target`` values as arrays.
    """
    feature_cols = list(features)
    is_val = data['fold'] == fold  # computed once, reused for all four selections

    # Explicit copies: callers add columns to the returned frames, which must
    # neither touch `data` nor trigger chained-assignment warnings.
    X_train = data.loc[~is_val, feature_cols].copy()
    y_train = data.loc[~is_val, 'target'].values
    X_val = data.loc[is_val, feature_cols].copy()
    y_val = data.loc[is_val, 'target'].values
    return X_train, y_train, X_val, y_val

In [None]:
# Hold out fold 0: rows outside it become `train_df`, rows inside it `test_df`.
X_t, y_t, X_v, y_v = get_data(train, 0)

# Re-attach the target to each feature frame, then snapshot independent copies.
for _frame, _labels in ((X_t, y_t), (X_v, y_v)):
    _frame['target'] = _labels

train_df = X_t.copy(deep=True)
test_df = X_v.copy(deep=True)

# X_t, y_t, X_v, y_v = get_data(res, 0)
# train_df = pd.DataFrame({'X': X_t, 'target': y_t})
# test_df = pd.DataFrame({'X': X_v, 'target': y_v})

# exp = setup(data = train_df, target = 'target', session_id=SEED,
#             normalize = True, transformation = True, silent=True, 
#             transform_target = True, transform_target_method='yeo-johnson',
#             remove_outliers=True, polynomial_features=True, 
#             trigonometry_features=True, feature_selection=True, 
#             feature_ratio=True, feature_interaction=True, fold_shuffle=True)

In [None]:
# all_models = compare_models(n_select=12, exclude=['xgboost', 'catboost'])

In [None]:
# finalized_models = []
# for _, model in enumerate(all_models[:10]):
#     fine_tuned = tune_model(model)
#     finalized = finalize_model(fine_tuned)
#     finalized_models.append(finalized)

In [None]:
# X_test = copy.deepcopy(test_df)
# X_test.drop(columns=['target'], axis=1, inplace=True)
# y_test = copy.deepcopy(test_df['target'])

# best_single_estimator = finalized_models[0]
# y_pred = predict_model(estimator=best_single_estimator, data=X_test)

# print(f'* best_single model rmse: {mean_squared_error(y_test, y_pred.Label.values, squared=False)}')

In [None]:
# X_train = copy.deepcopy(train_df)
# X_train.drop(columns=['target'], axis=1, inplace=True)
# y_train = copy.deepcopy(train_df['target'])

# print(f"* X_Train shape: {X_train.shape} y_train shape: {y_train.shape}")

# blend_map = {}
# def cross_val_score(estimator, n_splits=5, random_state=SEED):
#     cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
#     cv_iter = list(cv.split(X_train, y_train))
#     scores = []
#     for train_index, test_index in cv_iter:
#         y_true = y_train.iloc[test_index]
#         y_pred = predict_model(estimator=estimator, data=X_train.iloc[test_index,:])
#         scores.append(mean_squared_error(y_true, y_pred.Label.values, squared=False))
#     return np.array(scores).mean()

# def hyperopt_train_test(estimator_list):
#     estimator = blend_models(estimator_list=[finalized_models[i] for i in estimator_list], choose_better = True, fold=3)
#     res = cross_val_score(estimator).mean()
#     blend_map[time.time()] = (estimator, res)
#     return res

# def optimise(params):
#     estimator_list = []
#     for key, val in params.items():
#         if val == 1:
#             estimator_list.append(int(key))
#     rmse = hyperopt_train_test(estimator_list)
#     return {'loss': rmse, 'status': STATUS_OK}

# model_selection_space = {
#     '0': hp.choice('0', [0, 1]),
#     '1': hp.choice('1', [0, 1]),
#     '2': hp.choice('2', [0, 1]),
#     '3': hp.choice('3', [0, 1]),
#     '4': hp.choice('4', [0, 1]),
#     '5': hp.choice('5', [0, 1]),
#     '6': hp.choice('6', [0, 1]),
#     '7': hp.choice('7', [0, 1]),
#     '8': hp.choice('8', [0, 1]),
#     '9': hp.choice('9', [0, 1]),
# }

# blend_trials = Trials()
# blend_best = fmin(optimise, model_selection_space, algo=tpe.suggest, max_evals=30, trials=blend_trials, rstate=random_state)

# best_blend_res = float('inf')
# best_blend_key = 0
# for k, v in blend_map.items():
#     if v[1] < best_blend_res:
#         best_blend_key = k
#         best_blend_res = v[1]

# X_test = copy.deepcopy(test_df)
# X_test.drop(columns=['target'], axis=1, inplace=True)
# y_test = copy.deepcopy(test_df['target'])

# blend_estimator = blend_map[best_blend_key][0]
# y_pred = predict_model(estimator=blend_estimator, data=X_test)

# print(f'* Blend model rmse: {mean_squared_error(y_test, y_pred.Label.values, squared=False)}')

# model_path = '/kaggle/working/blend_estimator'
# model, path = save_model(blend_estimator, model_path)

In [None]:
# X_train = copy.deepcopy(train_df)
# X_train.drop(columns=['target'], axis=1, inplace=True)
# y_train = copy.deepcopy(train_df['target'])

# print(f"* X_Train shape: {X_train.shape} y_train shape: {y_train.shape}")

# stack_map = {}
# def cross_val_score(estimator, n_splits=5, random_state=SEED):
#     cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
#     cv_iter = list(cv.split(X_train, y_train))
#     scores = []
#     for train_index, test_index in cv_iter:
#         y_true = y_train.iloc[test_index]
#         y_pred = predict_model(estimator=estimator, data=X_train.iloc[test_index,:])
#         scores.append(mean_squared_error(y_true, y_pred.Label.values, squared=False))
#     return np.array(scores).mean()

# def hyperopt_train_test(estimator_list, meta_index):
#     estimator = stack_models(estimator_list=[finalized_models[i] for i in estimator_list], fold=3, meta_model=finalized_models[meta_index], choose_better = True)
#     res = cross_val_score(estimator).mean()
#     stack_map[time.time()] = (estimator, res)
#     return res

# def optimise(params):
#     meta_index = params.pop('meta')
#     estimator_list = []
#     for key, val in params.items():
#         if val == 1:
#             estimator_list.append(int(key))
#     rmse = hyperopt_train_test(estimator_list, meta_index)
#     return {'loss': rmse, 'status': STATUS_OK}


# model_selection_space = {
#     '0': hp.choice('0', [0, 1]),
#     '1': hp.choice('1', [0, 1]),
#     '2': hp.choice('2', [0, 1]),
#     '3': hp.choice('3', [0, 1]),
#     '4': hp.choice('4', [0, 1]),
#     '5': hp.choice('5', [0, 1]),
#     '6': hp.choice('6', [0, 1]),
#     '7': hp.choice('7', [0, 1]),
#     '8': hp.choice('8', [0, 1]),
#     '9': hp.choice('9', [0, 1]),
#     'meta': hp.choice('meta', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# }

# stack_trials = Trials()
# stack_best = fmin(optimise, model_selection_space, algo=tpe.suggest, max_evals=30, trials=stack_trials, rstate=random_state)

# best_stack_res = float('inf')
# best_stack_key = 0
# for k, v in stack_map.items():
#     if v[1] < best_stack_res:
#         best_stack_key = k
#         best_stack_res = v[1]

# X_test = copy.deepcopy(test_df)
# X_test.drop(columns=['target'], axis=1, inplace=True)
# y_test = copy.deepcopy(test_df['target'])

# stack_estimator = stack_map[best_stack_key][0]
# y_pred = predict_model(estimator=stack_estimator, data=X_test)

# print(f'* Stack model rmse: {mean_squared_error(y_test, y_pred.Label.values, squared=False)}')


# model_path = '/kaggle/working/stack_estimator'
# model, path = save_model(stack_estimator, model_path)

In [None]:
# model_path = '/kaggle/working/best_estimator'
# model, path = save_model(finalized_models[0], model_path)

In [None]:
# Separate the training table into predictors and target; both are
# independent copies, so `train_df` itself is left untouched.
y_train = train_df['target'].copy(deep=True)
X_train = train_df.drop(columns=['target'])

print(f"* X_Train shape: {X_train.shape} y_train shape: {y_train.shape}")

def cross_val_score(X, Y, Z, n_splits=5, random_state=SEED):
    """Mean per-fold RMSE of the linear blend ``X*m1 + Y*m2 + Z*m3``.

    Operates on the module-level ``X_train`` / ``y_train``. Nothing is fitted:
    the weights are fixed, so only the held-out part of each KFold split is
    used, and the function returns the average of the per-fold RMSE values.

    Args:
        X: Weight applied to column ``m1``.
        Y: Weight applied to column ``m2``.
        Z: Weight applied to column ``m3``.
        n_splits: Number of KFold partitions (default 5).
        random_state: Seed for the shuffled KFold (default ``SEED``).

    Returns:
        The mean of the per-fold RMSE values.
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # The blend does not depend on the fold, so compute it once for all rows
    # and slice it per fold instead of recomputing it inside the loop.
    blended = (X_train['m1'] * X) + (X_train['m2'] * Y) + (X_train['m3'] * Z)

    scores = []
    for _, test_index in cv.split(X_train, y_train):
        y_true = y_train.iloc[test_index]
        y_pred = blended.iloc[test_index]
        # sqrt(MSE) rather than `squared=False`: that keyword is deprecated
        # and absent from newer scikit-learn releases.
        scores.append(np.sqrt(mean_squared_error(y_true, y_pred)))
    return np.array(scores).mean()

def hyperopt_train_test(X, Y, Z):
    """Score one weight triple by forwarding it to ``cross_val_score``.

    Args:
        X: Weight for ``m1``.
        Y: Weight for ``m2``.
        Z: Weight for ``m3``.

    Returns:
        The value returned by ``cross_val_score(X, Y, Z)``.
    """
    score = cross_val_score(X, Y, Z)
    return score

def optimise(params):
    """Objective passed to hyperopt's ``fmin``.

    Args:
        params: Mapping holding the sampled weights under the keys
            ``'X'``, ``'Y'`` and ``'Z'``.

    Returns:
        A dict with the score under ``'loss'`` and ``STATUS_OK`` under
        ``'status'``.
    """
    weight_x, weight_y, weight_z = (params.get(key) for key in ('X', 'Y', 'Z'))
    loss = hyperopt_train_test(weight_x, weight_y, weight_z)
    return {'loss': loss, 'status': STATUS_OK}


# Search space: each of the three blend weights is sampled log-uniformly
# between 0.0001 and 0.9.
param_space = {
    weight_name: hp.loguniform(weight_name, np.log(0.0001), np.log(0.9))
    for weight_name in ('X', 'Y', 'Z')
}

# Search for the blend weights that minimise the cross-validated RMSE.
# NOTE(review): newer hyperopt releases may expect a `np.random.Generator`
# for `rstate` instead of a `RandomState` -- confirm against the installed version.
trials = Trials()
best = fmin(optimise, param_space, algo=tpe.suggest, max_evals=20000, trials=trials, rstate=random_state)
print(best)

# Score the selected weights on the held-out fold.
X_test = test_df.drop(columns=['target'])
y_test = test_df['target'].copy(deep=True)

y_pred = (X_test['m1'] * best.get('X')) + (X_test['m2'] * best.get('Y')) + (X_test['m3'] * best.get('Z'))

# sqrt(MSE) rather than `squared=False`: that keyword is deprecated and absent
# from newer scikit-learn releases.
print(f'* best rmse: {np.sqrt(mean_squared_error(y_test, y_pred))}')

# * X_Train shape: (2266, 3) y_train shape: (2266,)
# 100%|██████████| 20000/20000 [1:45:05<00:00,  3.17trial/s, best loss: 0.08548933236554794]
# {'X': 0.00015000387489372483, 'Y': 0.1143697711814775, 'Z': 0.8894452627596913}
# * best rmse: 0.09701197982575435

In [None]:
# error = predict_model(estimator=finalized_models[0], data=pd.DataFrame({'X': train['avg'][:10].values}))
# pred = train['avg'][:10].values + error.Label.values
# mean_squared_error(train['target'][:10].values, pred, squared=False)

In [None]:
# mean_squared_error(train['target'][:10].values, train['avg'][:10].values, squared=False)