# Check meta-model training

In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

from pathlib import Path
import sys
sys.path.insert(0, Path(".").absolute().parent.as_posix())

from PIL import Image

import pandas as pd

import numpy as np
from common.dataset import FilesFromCsvDataset, TransformedDataset
from common.meta import get_metafeatures, get_imsize_and_targets

In [2]:
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns

sns.set_style('darkgrid')
sns.set(rc={'figure.figsize':(12, 10)})

## Prepare data

In [3]:
def create_topk_df(df, k):
    topk_values = np.argsort(df.values, axis=1)[:, -k:]
    cols = ["top_{}".format(k - i) for i in range(k)]
    topk_df = pd.DataFrame(topk_values, index=df.index, columns=cols)
    return topk_df


def get_metafeatures(prediction_files):
    dfs = [pd.read_csv(f, index_col='id') for f in prediction_files]
    for i, df in enumerate(dfs):
        df.columns = ["f{}_{}".format(i, c) for c in df.columns]
    meta_features = pd.concat([df for df in dfs], axis=1)
    return meta_features


def get_topk_metafeatures(prediction_files, k=5):
    dfs = [pd.read_csv(f, index_col='id') for f in prediction_files]
    dfs = [create_topk_df(df, k=k) for df in dfs]
    for i, df in enumerate(dfs):
        df.columns = ["f{}_{}".format(i, c) for c in df.columns]
    meta_features = pd.concat([df for df in dfs], axis=1)
    return meta_features, dfs

In [7]:
meta_features_path = Path("../output")
meta_features_list = [
    meta_features_path / "val_probas_inceptionresnetv2_350_resized_crop" / "20180428_1622" / "probas.csv",
    meta_features_path / "val_probas_inceptionv4_350_resized_crop" / "20180428_1633" / "probas.csv",
    meta_features_path / "val_probas_nasnetalarge_350_resized_crop" / "20180428_1654" / "probas.csv",
]
# meta_features, dfs = get_topk_metafeatures(meta_features_list)
meta_features = get_metafeatures(meta_features_list)

In [8]:
# meta_features.loc[6, ['f0_c124', 'f0_c46', 'f1_c124', 'f1_c46']]

In [9]:
meta_features.head()

Unnamed: 0_level_0,f0_c0,f0_c1,f0_c2,f0_c3,f0_c4,f0_c5,f0_c6,f0_c7,f0_c8,f0_c9,...,f2_c118,f2_c119,f2_c120,f2_c121,f2_c122,f2_c123,f2_c124,f2_c125,f2_c126,f2_c127
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6302,1.373452e-11,1.510409e-10,1.260431e-12,4.921817e-12,1.109817e-10,2.23659e-12,5.07353e-11,2.922072e-12,2.468959e-12,5.981444e-12,...,1.210329e-07,8.096192e-08,1.443976e-07,2.385026e-08,7.237251e-08,9.203653e-08,2.834459e-08,3.299012e-07,4.945206e-07,1.290316e-07
3349,6.708128e-07,5.170703e-07,1.846702e-07,4.019679e-07,2.481107e-07,3.374197e-07,2.376433e-06,9.245439e-07,0.004313464,3.201822e-07,...,4.495173e-06,6.100723e-06,2.764779e-05,3.801815e-06,0.0005896711,5.913841e-06,2.2019e-06,1.681049e-05,4.249565e-06,2.703849e-06
484,1.588944e-07,3.46054e-06,1.504418e-05,3.487897e-06,3.223231e-08,1.037788e-06,5.816736e-07,1.317494e-08,5.965195e-07,3.312266e-07,...,0.9882467,3.446758e-05,8.426102e-06,5.871575e-06,4.653336e-06,1.017057e-05,5.020957e-06,7.487863e-06,3.937877e-05,6.771946e-06
2677,5.284062e-11,1.166769e-08,2.62324e-11,6.023646e-10,3.024778e-07,3.700055e-10,2.080192e-10,8.870108e-11,1.285723e-10,1.011352e-10,...,2.152033e-08,6.093003e-07,1.255923e-07,2.138172e-08,3.975764e-08,1.062255e-07,1.688745e-06,5.186873e-07,1.641713e-06,2.073703e-07
1517,0.0001060053,2.394826e-07,4.06639e-06,1.615211e-05,1.085549e-07,6.640157e-05,1.039728e-06,5.748253e-08,1.454809e-06,2.930471e-05,...,0.0004788781,0.06484089,4.440381e-06,1.617641e-06,3.059961e-06,2.804265e-06,1.50074e-05,4.778179e-06,5.958621e-06,8.954229e-05


In [10]:
meta_features.shape

(6300, 384)

In [11]:
dataset = FilesFromCsvDataset("../output/filtered_val_dataset.csv")
dataset = TransformedDataset(dataset,
                             transforms=lambda x: (x, Image.open(x).size),
                             target_transforms=lambda l: l - 1)
df_imsize_targets = get_imsize_and_targets(dataset)

In [12]:
print(df_imsize_targets.shape)
df_imsize_targets.head()

(6291, 3)


Unnamed: 0,height,target,width
6302,800,47,800
3349,550,78,730
484,800,118,800
2677,480,26,640
1517,800,21,800


In [13]:
def min_max_scale(df, col_name):
    m1 = df[col_name].min()
    m2 = df[col_name].max()
    df.loc[:, col_name] = (df[col_name] - m1) / (m2 - m1 + 1e-10)

In [14]:
df_imsize_targets.loc[:, 'size'] = df_imsize_targets['width'] * df_imsize_targets['height']

min_max_scale(df_imsize_targets, 'width')
min_max_scale(df_imsize_targets, 'height')
min_max_scale(df_imsize_targets, 'size')

In [15]:
X = pd.concat([meta_features, df_imsize_targets[['width', 'height', 'size']]], axis=1)
X.dropna(inplace=True)
y = df_imsize_targets.loc[X.index, 'target']

In [16]:
X.shape, y.shape

((6291, 387), (6291,))

In [17]:
X.head()

Unnamed: 0,f0_c0,f0_c1,f0_c2,f0_c3,f0_c4,f0_c5,f0_c6,f0_c7,f0_c8,f0_c9,...,f2_c121,f2_c122,f2_c123,f2_c124,f2_c125,f2_c126,f2_c127,width,height,size
1,1.786763e-10,2.216359e-09,3.354208e-11,5.480898e-09,1.016345e-10,2.880471e-09,5.597683e-10,1.103374e-08,3.588944e-11,3.318691e-09,...,4.24807e-08,1.542437e-07,1.546837e-07,2.47648e-06,1.771896e-07,3.419645e-07,3.491577e-08,0.00417,0.004338,0.00039
2,1.454174e-06,4.968147e-09,0.0001728579,0.0001411724,9.133302e-09,1.432796e-06,1.634426e-08,7.527779e-06,1.150549e-08,1.903884e-09,...,5.956465e-05,4.131234e-06,1.26395e-05,3.168242e-05,0.0006963891,7.767742e-06,5.908821e-05,0.085129,0.088576,0.014782
3,7.776277e-09,4.009195e-08,8.143656e-09,4.890069e-09,1.291318e-07,9.818061e-09,3.529312e-07,2.192131e-08,1.592583e-07,7.115647e-08,...,1.951818e-07,3.182044e-07,5.872605e-05,2.711008e-07,2.351123e-06,1.549696e-07,7.264244e-07,0.067755,0.027657,0.005999
4,9.465966e-08,9.050931e-09,4.340031e-09,0.00126972,9.529219e-09,4.22553e-09,1.333823e-08,0.0005796793,2.795894e-09,1.915915e-10,...,3.195381e-06,2.326123e-06,1.075147e-05,7.18811e-06,0.9196233,4.219503e-06,3.570874e-06,0.085129,0.043565,0.009204
5,1.250429e-07,2.871629e-11,2.409671e-09,1.104301e-05,2.534062e-10,3.698222e-07,1.303015e-09,5.900128e-08,1.484111e-10,1.620004e-09,...,5.987777e-09,2.220414e-08,3.893851e-08,1.038806e-08,1.222228e-08,9.131732e-09,4.092041e-08,0.093815,0.097614,0.017097


In [282]:
y.head()

1     37
2     62
3     32
4    125
5     17
Name: target, dtype: int64

In [283]:
# meta_features.loc[6302, :], df_imsize_targets.loc[6302, 'width'], y.loc[6302]

In [284]:
# dataset[0]

In [285]:
# sns.heatmap(pd.concat([X, y], axis=1).corr(), linewidths=.5);
# plt.yticks(rotation=0);
# plt.xticks(rotation=30);
# sns.set(font_scale=2)

In [18]:
misclassifed = {1: {'recall': 0.84, 'wrong_classes': [(87, 4)]},
 3: {'recall': 0.5625, 'wrong_classes': [(2, 7), (28, 5)]},
 14: {'recall': 0.3, 'wrong_classes': [(3, 4), (28, 5), (62, 8), (125, 6)]},
 18: {'recall': 0.66, 'wrong_classes': [(127, 7)]},
 21: {'recall': 0.7872340425531915, 'wrong_classes': [(16, 4)]},
 22: {'recall': 0.7551020408163265, 'wrong_classes': [(62, 6)]},
 26: {'recall': 0.6938775510204082, 'wrong_classes': [(111, 9)]},
 27: {'recall': 0.8979591836734694, 'wrong_classes': [(23, 4)]},
 30: {'recall': 0.7916666666666666, 'wrong_classes': [(69, 6)]},
 34: {'recall': 0.7916666666666666, 'wrong_classes': [(12, 4), (69, 4)]},
 38: {'recall': 0.68, 'wrong_classes': [(86, 11), (108, 5)]},
 48: {'recall': 0.7346938775510204, 'wrong_classes': [(124, 5)]},
 49: {'recall': 0.6530612244897959, 'wrong_classes': [(19, 4), (53, 12)]},
 50: {'recall': 0.75, 'wrong_classes': [(52, 4)]},
 53: {'recall': 0.7755102040816326, 'wrong_classes': [(19, 4)]},
 57: {'recall': 0.8775510204081632, 'wrong_classes': [(2, 4)]},
 58: {'recall': 0.86, 'wrong_classes': [(41, 4)]},
 62: {'recall': 0.3, 'wrong_classes': [(14, 6), (22, 5), (25, 8), (28, 9)]},
 65: {'recall': 0.48, 'wrong_classes': [(31, 5), (39, 7), (56, 6), (101, 4)]},
 66: {'recall': 0.8541666666666666, 'wrong_classes': [(112, 5)]},
 69: {'recall': 0.7959183673469388, 'wrong_classes': [(116, 4)]},
 81: {'recall': 0.88, 'wrong_classes': [(126, 4)]},
 86: {'recall': 0.9, 'wrong_classes': [(38, 4)]},
 87: {'recall': 0.74, 'wrong_classes': [(1, 4), (53, 5)]},
 96: {'recall': 0.8, 'wrong_classes': [(88, 4)]},
 99: {'recall': 0.7959183673469388, 'wrong_classes': [(19, 7)]},
 104: {'recall': 0.6875, 'wrong_classes': [(59, 12)]},
 107: {'recall': 0.86, 'wrong_classes': [(4, 4)]},
 108: {'recall': 0.875, 'wrong_classes': [(38, 6)]},
 111: {'recall': 0.8, 'wrong_classes': [(26, 7)]},
 112: {'recall': 0.9, 'wrong_classes': [(66, 4)]},
 113: {'recall': 0.78, 'wrong_classes': [(81, 8)]},
 114: {'recall': 0.8163265306122449, 'wrong_classes': [(120, 5)]},
 123: {'recall': 0.6122448979591837, 'wrong_classes': [(64, 17)]},
 126: {'recall': 0.8125, 'wrong_classes': [(81, 7)]},
 127: {'recall': 0.8125, 'wrong_classes': [(18, 5)]}}

## Meta-features learning, 128 models

In [286]:
# Xy = pd.concat([X, y], axis=1)
# meta_features.loc[53, 'f0_c41']

In [19]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import recall_score, precision_score, accuracy_score

class K (0, ..., 127)

In [72]:
n_classes = 128
seed = 555
n = len(meta_features_list)


for class_index in [14, 62]:

    print("-- class : ", class_index)    
    cols = ['f{}_c{}'.format(i, class_index) for i in range(n)] 
    for c, _ in misclassifed[class_index]['wrong_classes']:        
        cols += ['f{}_c{}'.format(i, c) for i in range(n)] 
    cols += ['size', 'width', 'height']
    _X = X[cols].values
#     _X = X.values
    _y = (y == class_index).values.astype(np.int)

    # clip probabilities:
    # _X = np.clip(_X, 0.00001, 0.99999)
    
    splt = StratifiedShuffleSplit(n_splits=7, test_size=0.25, random_state=seed)
    train_index, test_index = next(splt.split(_X, _y))

    _X_train = _X[train_index, :]
    _X_test = _X[test_index, :]
    _y_train = _y[train_index]
    _y_test = _y[test_index]
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    log_reg = LogisticRegression(random_state=seed)
    params = {
        "C": np.logspace(0, 4, 20),
        "penalty": ["l2", ],
    }
    gs = GridSearchCV(log_reg, params, scoring="neg_log_loss", cv=cv, n_jobs=10)    
    gs.fit(_X_train, _y_train)
    print(gs.best_params_, gs.best_score_)
    # _y_probas = gs.best_estimator_.predict_proba(_X_test)
    _y_pred = gs.best_estimator_.predict(_X_test)
    
    _y_pred_base = (_X_test[:, :n].mean(axis=1) > 0.5).astype(np.int)
    
    print("Recall: {} vs {}".format(recall_score(_y_test, _y_pred), recall_score(_y_test, _y_pred_base)))
    print("Precision: {} vs {}".format(precision_score(_y_test, _y_pred), precision_score(_y_test, _y_pred_base)))
    print("Accuracy: {} vs {}".format(accuracy_score(_y_test, _y_pred), accuracy_score(_y_test, _y_pred_base)))

-- class :  14
{'C': 6.951927961775605, 'penalty': 'l2'} -0.024665787591909554
Recall: 0.38461538461538464 vs 0.3076923076923077
Precision: 0.7142857142857143 vs 0.6666666666666666
Accuracy: 0.9936427209154481 vs 0.993006993006993
-- class :  62
{'C': 4.281332398719393, 'penalty': 'l2'} -0.026060559781892072
Recall: 0.23076923076923078 vs 0.38461538461538464
Precision: 1.0 vs 0.625
Accuracy: 0.9936427209154481 vs 0.993006993006993


In [55]:
class_index = 62
_y = (y == class_index).values.astype(np.int)
cols = ['f{}_c{}'.format(i, class_index) for i in range(n)]
_y_pred_base = (_X[:, np.where(X.columns.isin(cols))[0]].mean(axis=1) > 0.5).astype(np.int)
recall_score(_y, _y_pred_base)

0.24

In [51]:
def beatiful_coef(coefs, feature_names):
    return pd.DataFrame(coefs.transpose(), index=feature_names, columns=['coef']).sort_values('coef', ascending=False)

In [54]:
# beatiful_coef(gs.best_estimator_.coef_, feature_names=X[cols].columns)

In [56]:
from xgboost import XGBClassifier

In [89]:
n_classes = 128
seed = 555
n = len(meta_features_list)


for class_index in [14,]:

    print("-- class : ", class_index)
    
    cols = ['f{}_c{}'.format(i, class_index) for i in range(n)] 
    for c, _ in misclassifed[class_index]['wrong_classes']:        
        cols += ['f{}_c{}'.format(i, c) for i in range(n)] 
    cols += ['size', 'width', 'height']
    _X = X[cols].values
    _y = (y == class_index).values.astype(np.int)

    # clip probabilities:
    # _X = np.clip(_X, 0.00001, 0.99999)
    
    splt = StratifiedShuffleSplit(n_splits=7, test_size=0.25, random_state=seed)
    train_index, test_index = next(splt.split(_X, _y))

    _X_train = _X[train_index, :]
    _X_test = _X[test_index, :]
    _y_train = _y[train_index]
    _y_test = _y[test_index]
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    clf = XGBClassifier(random_state=seed)
    params = {
        "max_depth": [3, 4, 5],
        "learning_rate": [0.01, 0.003, 0.001]
    }
    gs = GridSearchCV(clf, params, scoring="neg_log_loss", cv=cv, n_jobs=10)    
    gs.fit(_X_train, _y_train)
    print(gs.best_params_, gs.best_score_) 
    _y_probas = gs.best_estimator_.predict_proba(_X_test)
#     _y_pred = gs.best_estimator_.predict(_X_test)
    _y_pred = (_y_probas[:, 1] > 0.5).astype(np.int)
    
    _y_pred_base = (_X_test[:, :n].mean(axis=1) > 0.5).astype(np.int)

    print("Recall: {} vs {}".format(recall_score(_y_test, _y_pred), recall_score(_y_test, _y_pred_base)))
    print("Precision: {} vs {}".format(precision_score(_y_test, _y_pred), precision_score(_y_test, _y_pred_base)))
    print("Accuracy: {} vs {}".format(accuracy_score(_y_test, _y_pred), accuracy_score(_y_test, _y_pred_base)))

-- class :  14
{'learning_rate': 0.01, 'max_depth': 4} -0.21471614459463984
Recall: 0.5384615384615384 vs 0.3076923076923077
Precision: 0.5384615384615384 vs 0.6666666666666666
Accuracy: 0.9923712650985378 vs 0.993006993006993


In [90]:
best_model = gs.best_estimator_

In [96]:
["{}: {}".format(c, v) for c, v in zip(X[cols].columns, best_model.feature_importances_)]

['f0_c14: 0.1505376398563385',
 'f1_c14: 0.0',
 'f2_c14: 0.24946236610412598',
 'f0_c3: 0.009677419438958168',
 'f1_c3: 0.0010752688394859433',
 'f2_c3: 0.08602150529623032',
 'f0_c28: 0.10215053707361221',
 'f1_c28: 0.03548387065529823',
 'f2_c28: 0.07741935551166534',
 'f0_c62: 0.0010752688394859433',
 'f1_c62: 0.017204301431775093',
 'f2_c62: 0.0',
 'f0_c125: 0.07956989109516144',
 'f1_c125: 0.13548387587070465',
 'f2_c125: 0.04838709533214569',
 'size: 0.0',
 'width: 0.004301075357943773',
 'height: 0.0021505376789718866']

In [93]:
best_model.feature_importances_

array([0.15053764, 0.        , 0.24946237, 0.00967742, 0.00107527,
       0.08602151, 0.10215054, 0.03548387, 0.07741936, 0.00107527,
       0.0172043 , 0.        , 0.07956989, 0.13548388, 0.0483871 ,
       0.        , 0.00430108, 0.00215054], dtype=float32)

CatBoost

In [50]:
from sklearn.model_selection import StratifiedKFold
import catboost as cat

In [126]:
cat_train = cat.Pool(_X_train, label=_y_train)
cat_test = cat.Pool(_X_test)

In [127]:
params = {
    "iterations": 10,
    "loss_function": "MultiClass",
    "eval_metric": "Accuracy",
    "learning_rate": 0.01,
    "l2_leaf_reg": 3,
    "depth": 4,
    "od_type": "Iter",
    "od_wait": 50,    
}

In [128]:
cv_results = cat.cv(cat_train, params=params, nfold=5)

0:	learn: 0.0571204	test: 0.0379284	best: 0.0379284 (0)	total: 12.2s	remaining: 1m 49s
1:	learn: 0.1171925	test: 0.0910784	best: 0.0910784 (1)	total: 24.8s	remaining: 1m 39s


KeyboardInterrupt: 

In [104]:
best_iterations = np.argmax(cv_results['test-%s-mean' % params['eval_metric']])
params['iterations'] = best_iterations + 1

cat_model = cat.train(params=params, pool=cat_train)

  return getattr(obj, method)(*args, **kwds)


0:	learn: 0.0758574	total: 11.8s	remaining: 1m 34s
1:	learn: 0.1301385	total: 25.5s	remaining: 1m 29s
2:	learn: 0.1739723	total: 40.1s	remaining: 1m 20s
3:	learn: 0.2784465	total: 53.6s	remaining: 1m 7s
4:	learn: 0.3445378	total: 1m 6s	remaining: 53.5s
5:	learn: 0.3967749	total: 1m 20s	remaining: 40.1s
6:	learn: 0.4047241	total: 1m 33s	remaining: 26.7s
7:	learn: 0.4397002	total: 1m 46s	remaining: 13.3s
8:	learn: 0.4635476	total: 1m 59s	remaining: 0us


In [129]:
_y_pred = cat_model.predict(cat_test, prediction_type="Class").ravel().astype(np.int)
accuracy_score(_y_test, _y_pred)

CatboostError: Data cat_features in predict()=[] are not equal data cat_features in fit()=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9].

In [118]:
_y_pred

array([  5, 116,  15, ..., 116,  54, 111])

In [119]:
_y_test

array([65, 52, 38, ..., 40, 19, 65])

xgboost

In [58]:
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

In [87]:
xgb_train = xgb.DMatrix(_X_train, label=_y_train)
xgb_test = xgb.DMatrix(_X_test)

In [93]:
params = {
    "objective": "multi:softmax",
    "booster": "gbtree",
    "eval_metric": "mlogloss",
    "eta": 0.01,
    "gamma": 0.1,
    "max_depth": 4,
    "num_class": 128,
    "colsample_bytree": 0.7,
    "seed": 1272,     
    "subsample": 0.7,
}

In [94]:
cv_results = xgb.cv(params=params, dtrain=xgb_train,
                    num_boost_round=1000, early_stopping_rounds=10,
                    nfold=5, verbose_eval=1)

[0]	train-mlogloss:4.84599+3.4308e-05	test-mlogloss:4.84655+0.000121354
[1]	train-mlogloss:4.83983+0.000200851	test-mlogloss:4.84086+0.000270838
[2]	train-mlogloss:4.83364+0.000273281	test-mlogloss:4.8352+0.000393063
[3]	train-mlogloss:4.82739+0.000368472	test-mlogloss:4.82947+0.000549695
[4]	train-mlogloss:4.8214+0.000370113	test-mlogloss:4.82394+0.000565074
[5]	train-mlogloss:4.81526+0.000428553	test-mlogloss:4.81829+0.000607652
[6]	train-mlogloss:4.80916+0.000552534	test-mlogloss:4.8127+0.000656493
[7]	train-mlogloss:4.80295+0.000538376	test-mlogloss:4.80701+0.000668191
[8]	train-mlogloss:4.79685+0.000636949	test-mlogloss:4.8014+0.000741941
[9]	train-mlogloss:4.79066+0.000667106	test-mlogloss:4.79572+0.000763709
[10]	train-mlogloss:4.7845+0.000608217	test-mlogloss:4.79007+0.000752721
[11]	train-mlogloss:4.77844+0.000554622	test-mlogloss:4.78457+0.000704097
[12]	train-mlogloss:4.77237+0.000668272	test-mlogloss:4.77909+0.000686543
[13]	train-mlogloss:4.76641+0.000615831	test-mlogloss:

KeyboardInterrupt: 

In [89]:
best_num_round = np.argmin(cv_results['test-%s-mean' % params['eval_metric']])

model = xgb.train(params, dtrain=xgb_train, num_boost_round=best_num_round)

  return getattr(obj, method)(*args, **kwds)


In [122]:
_y_pred = model.predict(xgb_test).astype(np.int)
accuracy_score(_y_test, _y_pred)

0.005826271186440678

In [124]:
_y_pred, _y_test

(array([  5, 116,  18, ..., 116,  54, 111]),
 array([65, 52, 38, ..., 40, 19, 65]))

In [125]:
_X_test[0, :]

array([100,  39,  65, 101,  31,  56,  39,  65,  31, 101, 400, 400])

In [130]:
model.get_fscore()

{'f0': 3886,
 'f1': 3644,
 'f10': 2187,
 'f11': 2456,
 'f2': 3394,
 'f3': 4451,
 'f4': 8753,
 'f5': 4269,
 'f6': 3998,
 'f7': 3827,
 'f8': 4336,
 'f9': 7765}

Train_meta framework

In [21]:
from hyperopt import fmin, tpe, hp, tpe, STATUS_OK, Trials

In [109]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

CV_SPLIT = StratifiedKFold(n_splits=5, shuffle=True, random_state=555)
MODEL = Pipeline
SCORINGS = ["neg_log_loss", "accuracy"]

In [120]:
n_trials = 10
scorings = SCORINGS
cv = CV_SPLIT
estimator_cls = MODEL
model_params = {
    "steps": [
        ("scaler", StandardScaler),
        ("log_reg", LogisticRegression)
    ]
}
model_hp_params = {
    "log_reg": {
        "C": hp.loguniform("C", 0, 4),
        "random_state": hp.randint("random_state", 12345)
    }
}
model_hp_params.update(model_params)
fit_params = {}
n_jobs = 10
debug = True

In [126]:
from sklearn.model_selection import cross_validate

def hp_score(model_hp_params):
    if estimator_cls == Pipeline:
        steps=model_hp_params['steps']
        nsteps = []
        for name, fn in steps:
            nsteps.append((name, fn(**model_hp_params[name]) if name in model_hp_params else fn()))
        estimator = estimator_cls(steps=nsteps)
    else:
        estimator = estimator_cls(**model_hp_params)

    scores = cross_validate(estimator, _X, _y, cv=cv, scoring=scorings,
                            fit_params=fit_params, 
                            return_train_score=True,
                            n_jobs=n_jobs, verbose=debug)

    print("CV scores:")
    for scoring in scorings:
        print("{} : \n\t train: {} \n\t test: {}".format(scoring, 
                                                   scores["train_{}".format(scoring)].tolist(),
                                                   scores["test_{}".format(scoring)].tolist()))

    mean_test_loss = np.abs(np.mean(scores["test_{}".format(scorings[0])]))
    return {
        'loss': mean_test_loss,
        'status': STATUS_OK
    }


def hp_optimize(score_fn, params_space, max_evals):
    trials = Trials()
    best_params = fmin(score_fn, params_space, algo=tpe.suggest, trials=trials, max_evals=max_evals, verbose=debug)
    return best_params, trials


In [127]:
best_params, trials = hp_optimize(hp_score, model_hp_params, max_evals=n_trials)
best_params.update(model_params)

print("Best parameters: \n{}".format(best_params))
print("Best trial : \n{}".format(trials.best_trial))

# print("Train meta model on complete dataset")
# estimator = estimator_cls(**best_params)



CV scores:
neg_log_loss : 
	 train: [-3.5480578202532933, -3.573541458420192, -3.556483475390815, -3.5734635082382846, -3.559566390562097] 
	 test: [-3.876761071432662, -3.7987699002087667, -3.861823010894172, -3.8203330980363828, -3.856333740654891]
accuracy : 
	 train: [0.21692276990620635, 0.21612452604270604, 0.2190988835725678, 0.22164948453608246, 0.21369539551357733] 
	 test: [0.15546875, 0.17109375, 0.15137254901960784, 0.15717722534081796, 0.16708023159636062]


[Parallel(n_jobs=10)]: Done   5 out of   5 | elapsed:    3.7s finished


CV scores:
neg_log_loss : 
	 train: [-3.5335622329113594, -3.560580454612384, -3.543369753758897, -3.559195938468275, -3.5462211255181173] 
	 test: [-3.8788090871378316, -3.7920604390463724, -3.8588589626925316, -3.8285753684615313, -3.8568271982794786]
accuracy : 
	 train: [0.21951706246258232, 0.21971662342845738, 0.22149122807017543, 0.22620935765265662, 0.2170405352223534] 
	 test: [0.15390625, 0.175, 0.15294117647058825, 0.16038492381716118, 0.16790736145574855]


[Parallel(n_jobs=10)]: Done   5 out of   5 | elapsed:    3.7s finished


CV scores:
neg_log_loss : 
	 train: [-3.5335107548027653, -3.560536443855174, -3.543325165563332, -3.5591428628079376, -3.5461749022047337] 
	 test: [-3.8787994822713303, -3.792036417279269, -3.8588529614396476, -3.8286617195329233, -3.856841095793991]
accuracy : 
	 train: [0.21951706246258232, 0.21971662342845738, 0.22169059011164274, 0.22620935765265662, 0.2170405352223534] 
	 test: [0.15390625, 0.175, 0.15294117647058825, 0.16038492381716118, 0.16790736145574855]


[Parallel(n_jobs=10)]: Done   5 out of   5 | elapsed:    3.7s finished
[Parallel(n_jobs=10)]: Done   5 out of   5 | elapsed:    3.6s finished


CV scores:
neg_log_loss : 
	 train: [-3.5301695480330655, -3.5577411073066374, -3.5405150856591243, -3.555634485038779, -3.543262366624294] 
	 test: [-3.878288170768221, -3.7906628162536706, -3.858487046629541, -3.836962701899376, -3.8580629924731498]
accuracy : 
	 train: [0.22031530632608262, 0.22051486729195768, 0.22208931419457736, 0.22759714512291832, 0.21881149153876425] 
	 test: [0.15390625, 0.17578125, 0.15372549019607842, 0.161186848436247, 0.1687344913151365]


[Parallel(n_jobs=10)]: Done   5 out of   5 | elapsed:    3.3s finished


CV scores:
neg_log_loss : 
	 train: [-3.6412149405083256, -3.6623541350780595, -3.647051159054922, -3.6636754148795325, -3.649798346226116] 
	 test: [-3.910411087285746, -3.8537241934283726, -3.9015714957865724, -3.8591563945811527, -3.8921377188584545]
accuracy : 
	 train: [0.2021552584314508, 0.19976052684094991, 0.2031499202551834, 0.20796986518636004, 0.20070838252656434] 
	 test: [0.14609375, 0.15703125, 0.1419607843137255, 0.14194065757818766, 0.1555004135649297]




CV scores:
neg_log_loss : 
	 train: [-3.5418475465750525, -3.5678841565703174, -3.550749534889018, -3.567418335967986, -3.5537636996286] 
	 test: [-3.877359549333575, -3.7957722507117437, -3.8603290674897295, -3.821554630797394, -3.855913665578232]
accuracy : 
	 train: [0.21772101376970665, 0.2179205747355817, 0.2204944178628389, 0.22303727200634418, 0.2154663518299882] 
	 test: [0.15390625, 0.171875, 0.15058823529411763, 0.15797914995990378, 0.16708023159636062]


[Parallel(n_jobs=10)]: Done   5 out of   5 | elapsed:    3.6s finished
[Parallel(n_jobs=10)]: Done   5 out of   5 | elapsed:    3.6s finished


CV scores:
neg_log_loss : 
	 train: [-3.5302253370252634, -3.557786554505024, -3.5405603840559934, -3.555695370667215, -3.543309809502242] 
	 test: [-3.878294492633208, -3.7906850542229513, -3.858491684502919, -3.8367612337592343, -3.8580357675958092]
accuracy : 
	 train: [0.22051486729195768, 0.22051486729195768, 0.22208931419457736, 0.22759714512291832, 0.21861471861471862] 
	 test: [0.15390625, 0.17578125, 0.15372549019607842, 0.161186848436247, 0.1687344913151365]




CV scores:
neg_log_loss : 
	 train: [-3.530103692287255, -3.557686402848865, -3.5404602807250956, -3.555561235270802, -3.543205117480558] 
	 test: [-3.8782802025325758, -3.790635995336726, -3.858481555953372, -3.837212224952376, -3.858094539802838]
accuracy : 
	 train: [0.22031530632608262, 0.22051486729195768, 0.22208931419457736, 0.22759714512291832, 0.21861471861471862] 
	 test: [0.15390625, 0.17578125, 0.15372549019607842, 0.161186848436247, 0.1687344913151365]


[Parallel(n_jobs=10)]: Done   5 out of   5 | elapsed:    3.7s finished


CV scores:
neg_log_loss : 
	 train: [-3.5328783101332215, -3.559998474759919, -3.542783448043214, -3.558495327145858, -3.5456159604034503] 
	 test: [-3.8786859094869968, -3.791770079503877, -3.858772218786317, -3.8297887195309777, -3.8570205853319]
accuracy : 
	 train: [0.21971662342845738, 0.21991618439433247, 0.22188995215311005, 0.22660586835844568, 0.21802439984258165] 
	 test: [0.15390625, 0.175, 0.15294117647058825, 0.16038492381716118, 0.16790736145574855]


[Parallel(n_jobs=10)]: Done   5 out of   5 | elapsed:    3.6s finished


CV scores:
neg_log_loss : 
	 train: [-3.53383311165198, -3.5608121551076777, -3.5436031694436383, -3.5594717026322664, -3.546461708470491] 
	 test: [-3.8788592191898728, -3.7921754444658866, -3.858895201865167, -3.828140446035383, -3.8567572498683806]
accuracy : 
	 train: [0.21931750149670726, 0.21971662342845738, 0.22129186602870812, 0.22620935765265662, 0.2170405352223534] 
	 test: [0.15390625, 0.17421875, 0.15294117647058825, 0.1595829991980754, 0.16790736145574855]
Best parameters: 
{'steps': [('scaler', <class 'sklearn.preprocessing.data.StandardScaler'>), ('log_reg', <class 'sklearn.linear_model.logistic.LogisticRegression'>)], 'random_state': 7824, 'C': 10.739394454022523}
Best trial : 
{'result': {'status': 'ok', 'loss': 3.8421858327821345}, 'exp_key': None, 'owner': None, 'state': 2, 'tid': 5, 'misc': {'tid': 5, 'idxs': {'random_state': [5], 'C': [5]}, 'cmd': ('domain_attachment', 'FMinIter_Domain'), 'vals': {'random_state': [7824], 'C': [10.739394454022523]}, 'workdir': None}

[Parallel(n_jobs=10)]: Done   5 out of   5 | elapsed:    3.6s finished


In [128]:
best_params

{'C': 10.739394454022523,
 'random_state': 7824,
 'steps': [('scaler', sklearn.preprocessing.data.StandardScaler),
  ('log_reg', sklearn.linear_model.logistic.LogisticRegression)]}

In [39]:
best_params.update(model_params)
estimator = estimator_cls(**best_params)

CatBoost

In [99]:
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier

CV_SPLIT = StratifiedKFold(n_splits=5, shuffle=True, random_state=555)
MODEL = CatBoostClassifier
SCORINGS = ["neg_log_loss", ] #"precision_macro", "recall_macro"]

In [100]:
n_trials = 2
scorings = SCORINGS
cv = CV_SPLIT
estimator_cls = MODEL
model_params = {
    "iterations": 5,
    "loss_function": "MultiClass",
    "od_type": "Iter",
    "od_wait": 50,
    "bootstrap_type": "Bernoulli",
    "task_type": "CPU",
    "verbose": True,
    "metric_period": 1
}
model_hp_params = {
    "depth": 2 + hp.randint("depth", 5),
    "learning_rate": hp.quniform("learning_rate", 0.001, 0.5, 0.005),
    "l2_leaf_reg": 2 + hp.randint("l2_leaf_reg", 2),
    "random_seed": hp.randint("random_seed", 12345),
    "subsample": hp.quniform("subsample", 0.5, 1.0, 0.01)
}
model_hp_params.update(model_params)
fit_params = {}
n_jobs = 10
debug = True

In [101]:
_X = X.values
_y = y.values

In [102]:
from sklearn.model_selection import cross_validate

def hp_score(model_hp_params):
    
    estimator = estimator_cls(**model_hp_params)

    scores = cross_validate(estimator, _X, _y, cv=cv, scoring=scorings,
                            fit_params=fit_params,
                            return_train_score=True,                            
                            n_jobs=n_jobs)

    print("CV scores:")
    for scoring in scorings:
        print("{} : \n\t train: {} \n\t test: {}".format(scoring, 
                                                   scores["train_{}".format(scoring)].tolist(),
                                                   scores["test_{}".format(scoring)].tolist()))

    mean_test_loss = np.abs(np.mean(scores["test_{}".format(scoring)]))
    return {
        'loss': mean_test_loss,
        'status': STATUS_OK
    }


def hp_optimize(score_fn, params_space, max_evals):
    trials = Trials()
    best_params = fmin(score_fn, params_space, algo=tpe.suggest, trials=trials, max_evals=max_evals, verbose=debug)
    return best_params, trials


In [103]:
best_params, trials = hp_optimize(hp_score, model_hp_params, max_evals=n_trials)
best_params.update(model_params)

print("Best parameters: \n{}".format(best_params))
print("Best trial : \n{}".format(trials.best_trial))

0:	learn: -4.7117042	total: 20.6s	remaining: 1m 22s
0:	learn: -4.7280812	total: 20.7s	remaining: 1m 22s
0:	learn: -4.7249822	total: 20.9s	remaining: 1m 23s
0:	learn: -4.6774325	total: 22.2s	remaining: 1m 28s
0:	learn: -4.7068981	total: 23.7s	remaining: 1m 34s
1:	learn: -4.5168791	total: 41.5s	remaining: 1m 2s
1:	learn: -4.5250657	total: 41.7s	remaining: 1m 2s
1:	learn: -4.6246349	total: 41.8s	remaining: 1m 2s
1:	learn: -4.6276355	total: 42.6s	remaining: 1m 3s
1:	learn: -4.5363106	total: 46.8s	remaining: 1m 10s
2:	learn: -4.4961720	total: 1m 2s	remaining: 41.9s
2:	learn: -4.3414154	total: 1m 3s	remaining: 42.2s
2:	learn: -4.4109650	total: 1m 3s	remaining: 42.3s
2:	learn: -4.4711836	total: 1m 5s	remaining: 43.8s
2:	learn: -4.4313898	total: 1m 7s	remaining: 44.9s
3:	learn: -4.3371345	total: 1m 24s	remaining: 21s
3:	learn: -4.3623223	total: 1m 25s	remaining: 21.3s
3:	learn: -4.2483936	total: 1m 26s	remaining: 21.6s
3:	learn: -4.3404164	total: 1m 27s	remaining: 21.8s
3:	learn: -4.2623334	to

Process ForkPoolWorker-477:
Process ForkPoolWorker-476:
Process ForkPoolWorker-478:
Traceback (most recent call last):
Process ForkPoolWorker-480:
Traceback (most recent call last):
Process ForkPoolWorker-479:
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*se

KeyboardInterrupt: 