# Check meta-model training as binary predictor

In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

from pathlib import Path
import sys
sys.path.insert(0, Path(".").absolute().parent.as_posix())

from PIL import Image

import pandas as pd

import numpy as np
from common.dataset import FilesFromCsvDataset, TransformedDataset
from common.meta import get_metafeatures, get_imsize_and_targets

In [2]:
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns

# Plot styling shared by every figure in the notebook.
sns.set_style('darkgrid')
# Default figure size passed to matplotlib's rc settings.
sns.set(rc={'figure.figsize':(12, 10)})

In [3]:
# Root folder holding the validation-set probability dumps of each base model.
meta_features_path = Path("../output")

# (run folder, run timestamp) of every base model whose predictions are blended.
_prediction_runs = [
    ("val_probas_inceptionresnetv2_350_resized_crop", "20180428_1622"),
    ("val_probas_inceptionv4_350_resized_crop", "20180428_1633"),
    ("val_probas_nasnetalarge_350_resized_crop", "20180428_1654"),
]
meta_features_list = [
    meta_features_path / run_name / run_stamp / "probas.csv"
    for run_name, run_stamp in _prediction_runs
]

# One probability frame per model, indexed by the sample id.
dfs = [pd.read_csv(csv_path, index_col='id') for csv_path in meta_features_list]

# Stack the models row-wise, then average all rows that share an id:
# the result is the mean probability over the models for every sample.
df_probas = pd.concat(dfs, axis=0)
y_probas = df_probas.groupby('id').mean()

In [4]:
# Sanity check of the averaged predictions: shape first, then the first rows.
print(y_probas.shape)
y_probas.head()

(6300, 128)


Unnamed: 0_level_0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c118,c119,c120,c121,c122,c123,c124,c125,c126,c127
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.46379e-08,8.065056e-07,7.792222e-08,7.491578e-08,2.797733e-08,1.451325e-07,2.913413e-08,6.184602e-07,1.00651e-08,2.935796e-06,...,2.227966e-07,2.282861e-08,1.963591e-08,1.422318e-08,6.270509e-08,5.168149e-08,8.267714e-07,6.070628e-07,1.14039e-07,1.168854e-08
2,9.347801e-06,1.624991e-06,0.0003003126,0.001182464,3.153259e-06,2.066257e-05,4.393156e-06,8.210427e-05,1.200259e-06,2.793317e-06,...,9.80263e-07,0.0003333903,5.266618e-06,2.500858e-05,1.383422e-06,4.214561e-06,1.071219e-05,0.0002327887,2.591772e-06,1.976304e-05
3,4.416102e-08,4.241639e-07,1.351062e-07,4.784809e-08,3.953332e-07,4.317448e-08,2.568201e-06,1.424658e-07,1.684594e-07,2.076342e-07,...,2.14685e-07,1.545074e-05,6.385568e-05,6.052857e-06,2.460202e-07,0.0008062696,3.916902e-07,1.67832e-06,9.834748e-08,2.041568e-06
4,3.394759e-06,2.958466e-06,1.998878e-06,0.001193497,1.526749e-06,2.524314e-06,7.697208e-07,0.0005268663,6.972285e-07,3.750449e-07,...,6.772507e-07,5.826728e-06,2.263593e-06,1.069762e-06,7.759116e-07,3.597562e-06,2.399399e-06,0.956977,1.40773e-06,1.207639e-06
5,1.737058e-07,1.281176e-09,4.902731e-09,6.457448e-06,6.476485e-09,8.372518e-07,3.624372e-08,3.835375e-08,3.213217e-09,8.829736e-08,...,3.043351e-09,2.643031e-07,1.48227e-08,3.292611e-09,7.460706e-09,1.318289e-08,5.479758e-09,8.172043e-09,3.189449e-09,1.807494e-08


## Prepare data

In [4]:
def create_topk_with_probas_df(df, k):
    """Extract the k highest-valued columns of every row.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, one column per class; values are the scores
        (probabilities) to rank.
    k : int
        Number of top entries to keep, ``1 <= k <= df.shape[1]``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``. The integer columns ``top_k, ..., top_1`` hold the
        positional column indices of the best entries (``top_1`` is the
        highest-valued one); the float columns ``top_k_proba, ..., top_1_proba``
        hold the matching values.

    Raises
    ------
    ValueError
        If ``k`` is not in ``[1, df.shape[1]]``.
    """
    n_classes = df.shape[1]
    if not 1 <= k <= n_classes:
        # k == 0 would silently select *all* columns through the `-k:` slice.
        raise ValueError("k must be in [1, {}], got {}".format(n_classes, k))

    df_values = df.values
    # argsort is ascending, so the last k positions are the top-k entries,
    # ordered from the k-th best up to the best one.
    topk_values = np.argsort(df_values, axis=1)[:, -k:]
    # Gather the matching values row by row without a Python loop.
    # (`np.float` / `np.int` were removed in NumPy 1.24; the builtins are equivalent.)
    topk_probas = np.take_along_axis(df_values, topk_values, axis=1).astype(float)

    cols1 = ["top_{}".format(k - i) for i in range(k)]
    cols2 = ["top_{}_proba".format(k - i) for i in range(k)]

    # Build the two halves separately so the class indices never take a
    # round trip through float.
    indices_df = pd.DataFrame(topk_values.astype(int), index=df.index, columns=cols1)
    probas_df = pd.DataFrame(topk_probas, index=df.index, columns=cols2)
    topk_df = pd.concat([indices_df, probas_df], axis=1)
    return topk_df


def get_topk_with_probas_metafeatures(prediction_files, k=5):
    """Build top-k meta-features from several prediction CSV files.

    Every file in ``prediction_files`` is read with its ``id`` column as index
    and reduced to its top-k table by ``create_topk_with_probas_df``. The
    columns coming from the i-th file are prefixed with ``f{i}_`` and all the
    tables are joined side by side.

    Returns the resulting wide DataFrame.
    """
    per_model_frames = []
    for model_idx, csv_file in enumerate(prediction_files):
        probas = pd.read_csv(csv_file, index_col='id')
        topk = create_topk_with_probas_df(probas, k=k)
        # Tag each column with the position of its source file.
        topk.columns = ["f{}_{}".format(model_idx, name) for name in topk.columns]
        per_model_frames.append(topk)
    meta_features = pd.concat(per_model_frames, axis=1)
    return meta_features

In [6]:
# Top-5 class indices and their averaged probabilities for every sample.
y_topk_probas = create_topk_with_probas_df(y_probas, k=5)

In [7]:
# Preview: top_1 is the most probable class, top_5 the fifth most probable.
y_topk_probas.head()

Unnamed: 0_level_0,top_5,top_4,top_3,top_2,top_1,top_5_proba,top_4_proba,top_3_proba,top_2_proba,top_1_proba
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,69,100,91,28,37,4e-06,8e-06,3e-05,3.2e-05,0.999899
2,103,102,14,28,62,0.010689,0.039366,0.080131,0.218062,0.629119
3,65,64,27,77,32,0.000908,0.000961,0.001034,0.006351,0.986752
4,41,3,62,14,125,0.000866,0.001193,0.011768,0.026672,0.956977
5,12,3,21,20,17,4e-06,6e-06,6.3e-05,6.4e-05,0.999848


In [8]:
def _path_and_image_size(path):
    """Return ``(path, image.size)`` without leaving the image file open."""
    # Image.open is lazy and keeps the file handle alive; the context manager
    # releases it as soon as the size has been read.
    with Image.open(path) as img:
        return path, img.size


dataset = FilesFromCsvDataset("../output/filtered_val_dataset.csv")
# Each sample becomes (path, size); targets are shifted down by one
# (presumably 1-based labels in the CSV -> 0-based class indices; verify).
dataset = TransformedDataset(dataset,
                             transforms=_path_and_image_size,
                             target_transforms=lambda l: l - 1)
df_imsize_targets = get_imsize_and_targets(dataset)

In [9]:
# Inspect the image-size / target table built from the filtered validation set.
print(df_imsize_targets.shape)
df_imsize_targets.head()

(6291, 3)


Unnamed: 0,height,target,width
6302,800,47,800
3349,550,78,730
484,800,118,800
2677,480,26,640
1517,800,21,800


In [11]:
# Ground-truth class of every sample of the filtered validation set.
y_true = df_imsize_targets['target']
# Keep only the samples that have a label, in the same order as y_true, so
# that predictions and labels share one index.
y_topk_probas = y_topk_probas.loc[y_true.index, :]

In [12]:
# The series name becomes the column name once it is concatenated below.
y_true.name = 'y_true'

In [13]:
# Top-k features and the ground truth side by side, one row per sample.
meta_trainval = pd.concat([y_topk_probas, y_true], axis=1)

In [14]:
# Spot-check the last rows: y_true should often, but not always, equal top_1.
meta_trainval.tail(10)

Unnamed: 0,top_5,top_4,top_3,top_2,top_1,top_5_proba,top_4_proba,top_3_proba,top_2_proba,top_1_proba,y_true
1047,31,60,77,33,70,0.001407,0.00156,0.003367,0.009249,0.97633,70
5362,97,124,53,87,1,1e-06,1e-06,5.5e-05,0.001591,0.99835,1
5185,50,102,79,18,127,0.000139,0.00118,0.005357,0.485176,0.507393,18
3007,18,119,93,102,95,0.00193,0.002708,0.018558,0.051776,0.918205,95
1666,93,20,3,95,21,0.008808,0.015458,0.019624,0.148286,0.774092,21
3589,6,127,79,66,112,0.001421,0.001738,0.006424,0.142786,0.846757,112
280,109,113,122,126,81,0.000161,0.000184,0.000192,0.430901,0.567797,126
2053,6,123,77,76,51,0.000571,0.000791,0.011284,0.089868,0.896155,76
1648,127,125,102,18,79,8e-06,1.6e-05,0.000139,0.000163,0.999639,79
3945,76,98,23,27,110,1.7e-05,2.4e-05,2.8e-05,3.2e-05,0.999821,110


In [15]:
# pos_class_id is the candidate class whose correctness is later used as the
# binary target; start with the top-1 prediction as candidate.
meta_trainval.loc[:, "pos_class_id"] = meta_trainval["top_1"]

In [16]:
# Preview the frame with the new pos_class_id column.
meta_trainval.head()

Unnamed: 0,top_5,top_4,top_3,top_2,top_1,top_5_proba,top_4_proba,top_3_proba,top_2_proba,top_1_proba,y_true,pos_class_id
6302,81,13,83,52,47,4e-06,4e-06,5e-06,2.5e-05,0.999943,47,47
3349,56,71,8,106,78,0.003627,0.004472,0.004776,0.047875,0.938301,78,78
484,2,20,44,21,118,7.7e-05,0.00012,0.000996,0.001776,0.995987,118,118
2677,48,81,99,111,26,7e-06,1e-05,3.8e-05,0.00056,0.999356,26,26
1517,44,20,21,119,16,0.002835,0.010127,0.023157,0.059811,0.897837,21,16


In [43]:
# Number of candidate classes generated per sample.
k = 5

# First copy of every sample: the candidate is the top-1 prediction.
meta_trainval.loc[:, "pos_class_id"] = meta_trainval["top_1"]
data = [meta_trainval]

# One additional copy per remaining rank (top_2 .. top_k); only the candidate
# column differs between the copies.
data.extend(
    meta_trainval.assign(pos_class_id=meta_trainval["top_{}".format(rank)])
    for rank in range(2, k + 1)
)

# Long format: k rows per sample, all sharing the sample's index label.
meta_trainval_ = pd.concat(data, axis=0)

In [44]:
# All candidate rows of sample 100: identical features, one pos_class_id per rank.
meta_trainval_.loc[100, :]

Unnamed: 0,top_5,top_4,top_3,top_2,top_1,top_5_proba,top_4_proba,top_3_proba,top_2_proba,top_1_proba,y_true,pos_class_id
100,80,10,32,51,76,0.00182,0.002202,0.009819,0.012623,0.973327,76,76
100,80,10,32,51,76,0.00182,0.002202,0.009819,0.012623,0.973327,76,51
100,80,10,32,51,76,0.00182,0.002202,0.009819,0.012623,0.973327,76,32
100,80,10,32,51,76,0.00182,0.002202,0.009819,0.012623,0.973327,76,10
100,80,10,32,51,76,0.00182,0.002202,0.009819,0.012623,0.973327,76,80


In [45]:
# Binary target: 1 when the candidate class of the row is the true class, else 0.
# The builtin `int` replaces `np.int`, an alias that was removed in NumPy 1.24.
meta_trainval_.loc[:, 'target'] = (meta_trainval_['y_true'] == meta_trainval_['pos_class_id']).astype(int)

In [105]:
# Check the long frame and one sample whose true class is not the top-1 prediction.
print(meta_trainval_.shape)
meta_trainval_.loc[3333, :]

(31455, 13)


Unnamed: 0,top_5,top_4,top_3,top_2,top_1,top_5_proba,top_4_proba,top_3_proba,top_2_proba,top_1_proba,y_true,pos_class_id,target
3333,72,86,76,6,51,0.060209,0.103765,0.119072,0.130597,0.463619,76,51,0
3333,72,86,76,6,51,0.060209,0.103765,0.119072,0.130597,0.463619,76,6,0
3333,72,86,76,6,51,0.060209,0.103765,0.119072,0.130597,0.463619,76,76,1
3333,72,86,76,6,51,0.060209,0.103765,0.119072,0.130597,0.463619,76,86,0
3333,72,86,76,6,51,0.060209,0.103765,0.119072,0.130597,0.463619,76,72,0


In [47]:
# Features = every column except the ground-truth class and the binary target.
train_columns = list(meta_trainval_.columns)
for excluded in ('y_true', 'target'):
    train_columns.remove(excluded)

In [120]:
# Number of distinct samples: every sample is repeated once per candidate (k = 5).
len(meta_trainval_) / 5

6291.0

In [123]:
# Share of samples whose true class is the top-1 prediction.
# The mask is built here so the cell does not rely on a variable that is only
# defined by a later cell (it failed on a fresh kernel).
mask_top = (meta_trainval_['pos_class_id'] == meta_trainval_['top_1']) & (meta_trainval_['target'] == 1)
meta_trainval_[mask_top].shape[0] / (len(meta_trainval_) / 5)

0.8772850103322206

In [124]:
# Rows where the true class is the candidate AND the candidate is the top-1 prediction.
mask_top = meta_trainval_['target'].eq(1) & meta_trainval_['pos_class_id'].eq(meta_trainval_['top_1'])
top_hits = meta_trainval_.loc[mask_top]
# Shape of the selection and its share among the distinct samples (5 rows per sample).
print(top_hits.shape, len(top_hits) / (len(meta_trainval_) / 5))
top_hits.head()

(5519, 13) 0.8772850103322206


Unnamed: 0,top_5,top_4,top_3,top_2,top_1,top_5_proba,top_4_proba,top_3_proba,top_2_proba,top_1_proba,y_true,pos_class_id,target
6302,81,13,83,52,47,4e-06,4e-06,5e-06,2.5e-05,0.999943,47,47,1
3349,56,71,8,106,78,0.003627,0.004472,0.004776,0.047875,0.938301,78,78,1
484,2,20,44,21,118,7.7e-05,0.00012,0.000996,0.001776,0.995987,118,118,1
2677,48,81,99,111,26,7e-06,1e-05,3.8e-05,0.00056,0.999356,26,26,1
5908,85,37,100,69,116,6.5e-05,9.6e-05,0.000111,0.014535,0.984703,116,116,1


In [125]:
# Rows where the true class is the candidate AND the candidate is the top-2 prediction.
mask_top = meta_trainval_['target'].eq(1) & meta_trainval_['pos_class_id'].eq(meta_trainval_['top_2'])
top_hits = meta_trainval_.loc[mask_top]
# Shape of the selection and its share among the distinct samples (5 rows per sample).
print(top_hits.shape, len(top_hits) / (len(meta_trainval_) / 5))
top_hits.head()

(480, 13) 0.07629947544110634


Unnamed: 0,top_5,top_4,top_3,top_2,top_1,top_5_proba,top_4_proba,top_3_proba,top_2_proba,top_1_proba,y_true,pos_class_id,target
916,43,101,31,65,39,0.02959798,0.04417373,0.2398183,0.289503,0.381428,65,65,1
4853,24,45,78,56,106,0.0006592789,0.001644349,0.005020495,0.105461,0.885488,56,56,1
3126,39,11,88,29,36,9.423055e-08,1.046994e-07,4.866039e-07,1e-06,0.999998,29,29,1
3175,2,3,25,22,62,0.03258823,0.05591428,0.08045318,0.314523,0.466907,22,22,1
638,119,104,127,84,59,0.00181104,0.002163361,0.003847145,0.128065,0.862018,84,84,1


In [126]:
# Rows where the true class is the candidate AND the candidate is the top-3 prediction.
mask_top = meta_trainval_['target'].eq(1) & meta_trainval_['pos_class_id'].eq(meta_trainval_['top_3'])
top_hits = meta_trainval_.loc[mask_top]
# Shape of the selection and its share among the distinct samples (5 rows per sample).
print(top_hits.shape, len(top_hits) / (len(meta_trainval_) / 5))
top_hits.head()

(140, 13) 0.022254013670322682


Unnamed: 0,top_5,top_4,top_3,top_2,top_1,top_5_proba,top_4_proba,top_3_proba,top_2_proba,top_1_proba,y_true,pos_class_id,target
1517,44,20,21,119,16,0.002835,0.010127,0.023157,0.059811,0.897837,21,21,1
347,108,117,107,86,38,0.007261,0.043711,0.091113,0.110699,0.746043,107,107,1
4584,127,95,7,18,102,0.004364,0.027536,0.037916,0.052563,0.875009,7,7,1
1142,64,121,120,60,61,0.053782,0.070356,0.095659,0.127539,0.48064,120,120,1
125,95,127,102,18,7,0.022071,0.048307,0.073341,0.24155,0.604248,102,102,1


In [127]:
# Rows where the true class is the candidate AND the candidate is the top-4 prediction.
mask_top = meta_trainval_['target'].eq(1) & meta_trainval_['pos_class_id'].eq(meta_trainval_['top_4'])
top_hits = meta_trainval_.loc[mask_top]
# Shape of the selection and its share among the distinct samples (5 rows per sample).
print(top_hits.shape, len(top_hits) / (len(meta_trainval_) / 5))
top_hits.head()

(59, 13) 0.009378477189635987


Unnamed: 0,top_5,top_4,top_3,top_2,top_1,top_5_proba,top_4_proba,top_3_proba,top_2_proba,top_1_proba,y_true,pos_class_id,target
4998,12,127,79,18,7,0.007701,0.07428,0.103385,0.185535,0.61436,127,127,1
1637,3,14,28,22,62,0.058805,0.085025,0.156178,0.204352,0.486785,14,14,1
3121,22,90,34,21,44,0.003845,0.008967,0.00949,0.160142,0.806794,90,90,1
4255,97,49,53,87,1,0.002789,0.007316,0.057496,0.252311,0.679545,49,49,1
2080,97,49,1,87,53,0.00163,0.038874,0.047671,0.054677,0.856167,49,49,1


In [128]:
# Rows where the true class is the candidate AND the candidate is the top-5 prediction.
mask_top = meta_trainval_['target'].eq(1) & meta_trainval_['pos_class_id'].eq(meta_trainval_['top_5'])
top_hits = meta_trainval_.loc[mask_top]
# Shape of the selection and its share among the distinct samples (5 rows per sample).
print(top_hits.shape, len(top_hits) / (len(meta_trainval_) / 5))
top_hits.head()

(21, 13) 0.0033381020505484026


Unnamed: 0,top_5,top_4,top_3,top_2,top_1,top_5_proba,top_4_proba,top_3_proba,top_2_proba,top_1_proba,y_true,pos_class_id,target
5102,116,69,74,114,41,0.072114,0.083198,0.090794,0.114956,0.510313,116,116,1
3871,29,50,88,52,96,0.0075,0.010138,0.038395,0.441316,0.498914,29,29,1
5292,26,111,124,126,81,0.043802,0.044589,0.067075,0.304749,0.524564,26,26,1
895,58,18,102,7,125,0.021013,0.058851,0.079689,0.202766,0.509786,58,58,1
606,88,50,29,96,52,0.005366,0.009776,0.044885,0.124369,0.810559,88,88,1


In [74]:
from sklearn.metrics import accuracy_score

# Baseline: accuracy of the plain top-1 prediction. Every sample appears the
# same number of times in meta_trainval_, so the repetition does not change it.
accuracy_score(meta_trainval_['y_true'], meta_trainval_['top_1'])

0.8772850103322206

## Limit the candidates to the top 2 predictions

In [264]:
# Only the two most probable classes are used as candidates from here on.
k = 2
# First copy of every sample: the candidate is the top-1 prediction.
meta_trainval.loc[:, "pos_class_id"] = meta_trainval["top_1"]
data = [meta_trainval, ] 
# One additional copy per remaining rank (top_2 .. top_k); only the candidate differs.
for i in range(2, k + 1):
    d = meta_trainval.copy()
    d.loc[:, "pos_class_id"] = d["top_{}".format(i)]
    data.append(d)

# Long format: k rows per sample, all sharing the sample's index label.
meta_trainval_ = pd.concat(data, axis=0)

# Binary target: 1 when the candidate class is the true class, else 0.
# The builtin `int` replaces `np.int`, an alias that was removed in NumPy 1.24.
meta_trainval_.loc[:, 'target'] = (meta_trainval_['y_true'] == meta_trainval_['pos_class_id']).astype(int)

In [265]:
# Preview of the top-2 long frame with its binary target.
meta_trainval_.head()

Unnamed: 0,top_5,top_4,top_3,top_2,top_1,top_5_proba,top_4_proba,top_3_proba,top_2_proba,top_1_proba,y_true,pos_class_id,target
6302,81,13,83,52,47,4e-06,4e-06,5e-06,2.5e-05,0.999943,47,47,1
3349,56,71,8,106,78,0.003627,0.004472,0.004776,0.047875,0.938301,78,78,1
484,2,20,44,21,118,7.7e-05,0.00012,0.000996,0.001776,0.995987,118,118,1
2677,48,81,99,111,26,7e-06,1e-05,3.8e-05,0.00056,0.999356,26,26,1
1517,44,20,21,119,16,0.002835,0.010127,0.023157,0.059811,0.897837,21,16,0


In [341]:
# One-hot encode the candidate class. Recent pandas versions return boolean
# dummy columns; mixed with the int/float features they would turn the
# `.values` matrix used for training into an object array, so the dummies are
# cast to uint8 (a no-op on older pandas, where uint8 is already the default).
pos_class_dummies = pd.get_dummies(meta_trainval_['pos_class_id'], prefix='pos_class_id').astype(np.uint8)
meta_trainval_2 = pd.concat([meta_trainval_, pos_class_dummies], axis=1)

In [342]:
# Features = every column except the ground-truth class and the binary target.
train_columns = list(meta_trainval_2.columns)
for excluded in ('y_true', 'target'):
    train_columns.remove(excluded)

## Trainval / test split

In [343]:
from sklearn.model_selection import StratifiedShuffleSplit

In [344]:
# Fixed seed so the split is reproducible.
seed = 31500
# Stratified random split holding out 30% of the rows; the number of splits is
# left at the library default.
split = StratifiedShuffleSplit(random_state=seed, test_size=0.3)

In [345]:
# Feature matrix in train_columns order: the top-k class indices first, then
# the top-k probabilities, then pos_class_id, then its one-hot columns.
x_total = meta_trainval_2[train_columns].values
# Binary target: is the candidate class the true class?
y_total = meta_trainval_2['target'].values

In [346]:
# Only the first of the generated splits is used.
# NOTE(review): the split is done on rows, and every image contributes one row
# per candidate class, so rows of the same image can land on both sides of the
# split. Consider a group-aware split keyed on the sample id.
train_index, test_index = next(split.split(x_total, y_total))

In [347]:

# Materialise the train/validation part and the held-out test part of the split.
x_trainval, y_trainval = x_total[train_index], y_total[train_index]
x_test, y_test = x_total[test_index], y_total[test_index]

## Gradient boosting (LightGBM) as meta-model

In [348]:
import lightgbm as lgb

In [349]:
# LightGBM dataset built from the train/validation part only; the test part stays held out.
lgb_trainval = lgb.Dataset(x_trainval, label=y_trainval)

In [367]:
# LightGBM configuration of the binary meta-model.
params = dict(
    # Boosting setup and objective.
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    # Tree size limits.
    num_leaves=31,
    max_depth=15,
    # Small step size, meant to be paired with many boosting rounds.
    learning_rate=0.005,
    # Column / row subsampling; bagging is re-drawn every `bagging_freq` rounds.
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
)

In [368]:
# Upper bound on boosting rounds; early stopping in the CV below decides the actual number.
num_boost_round = 2500

In [369]:
# 5-fold cross-validation to choose the number of boosting rounds.
# The CV reports 'binary_error' (passed via `metrics`), not the
# 'binary_logloss' listed in params; stops after 100 rounds without improvement.
# NOTE(review): recent LightGBM releases expect early stopping / logging to be
# passed as callbacks instead of these keyword arguments - confirm against the
# installed version.
cv_results = lgb.cv(params, lgb_trainval, num_boost_round=num_boost_round, nfold=5, 
                    metrics='binary_error',                    
                    early_stopping_rounds=100, verbose_eval=50)

[50]	cv_agg's binary_error: 0.152153 + 0.00606558
[100]	cv_agg's binary_error: 0.146816 + 0.00688565
[150]	cv_agg's binary_error: 0.13864 + 0.00705357
[200]	cv_agg's binary_error: 0.135916 + 0.00789785
[250]	cv_agg's binary_error: 0.133644 + 0.00631278
[300]	cv_agg's binary_error: 0.133531 + 0.0083558
[350]	cv_agg's binary_error: 0.131146 + 0.00943185
[400]	cv_agg's binary_error: 0.13001 + 0.00876638
[450]	cv_agg's binary_error: 0.128421 + 0.00840288
[500]	cv_agg's binary_error: 0.12774 + 0.00673413
[550]	cv_agg's binary_error: 0.126604 + 0.00747943
[600]	cv_agg's binary_error: 0.126263 + 0.00753783
[650]	cv_agg's binary_error: 0.127626 + 0.00759669
[700]	cv_agg's binary_error: 0.12808 + 0.00751153


In [370]:
# cv_results holds one mean score per boosting round, so entry i is the score
# after i + 1 rounds. argmin returns a 0-based position; add 1 to turn it into
# the number of rounds to train (the original was one round short).
best_boost_round = int(np.argmin(cv_results['binary_error-mean'])) + 1

In [371]:
# Final model: refit on the whole train/validation part with the selected number of rounds.
gbm = lgb.train(params, lgb_trainval, num_boost_round=best_boost_round)

In [372]:
# Predicted probability, per held-out row, that the candidate class is the true class.
y_test_probas = gbm.predict(x_test)

In [373]:
# Binarise the probabilities. The 0.45 cut-off is hand-picked
# (NOTE(review): not derived anywhere in this notebook).
# The builtin `int` replaces `np.int`, an alias that was removed in NumPy 1.24.
y_test_pred = (y_test_probas > 0.45).astype(int)

In [374]:
# Predictions and labels must have the same length.
y_test_pred.shape, y_test.shape

((3775,), (3775,))

In [375]:
from sklearn.metrics import accuracy_score

# Accuracy of the binary decision "is the candidate class the true class?" on the held-out rows.
accuracy_score(y_test, y_test_pred)

0.8603973509933774

In [376]:
# True class id of every held-out row. meta_trainval_2 was built by adding
# columns to meta_trainval_, so both frames share the same row order and
# test_index applies to either.
y_test_true = meta_trainval_['y_true'].values[test_index]

In [380]:
# Recover the candidate class and the top-1 class of the held-out rows from the
# feature matrix. The column positions are looked up in train_columns instead
# of being hard-coded (10 and 4), so they stay right if the feature set changes.
# The builtin `int` replaces `np.int`, an alias that was removed in NumPy 1.24.
pos_class_id_test = x_test[:, train_columns.index('pos_class_id')].astype(int)
top1_pred_test = x_test[:, train_columns.index('top_1')].astype(int)

In [381]:
# Rows for which the meta-model accepted the candidate class as correct.
mask = y_test_pred > 0

In [382]:
# Accepted candidate classes next to the true classes of the same rows.
pos_class_id_test[mask], y_test_true[mask]

(array([ 79,  35, 125, ...,  76,  59,  12]),
 array([ 79,  35, 125, ...,  76,  59,  12]))

In [383]:
# Accuracy of the accepted candidate class on the accepted rows.
accuracy_score(y_test_true[mask], pos_class_id_test[mask])

0.8275862068965517

In [384]:
# Reference: accuracy of the plain top-1 prediction on the same accepted rows.
accuracy_score(y_test_true[mask], top1_pred_test[mask])

0.8991250643335049