# Compare prediction probabilities on validation

In [1]:
from pathlib import Path
import sys
# Put the parent of the current working directory first on the import path
# (presumably so the project-local `common` package used below can be imported).
_repo_root = Path.cwd().parent
sys.path.insert(0, _repo_root.as_posix())

In [3]:
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns

import numpy as np
import pandas as pd

# Folder with one sub-directory of validation-set probabilities per model.
OUTPUT_PATH = Path.cwd().parent.joinpath("output", "val_probas")

In [11]:
# Get true values
from PIL import Image
from common.dataset import FilesFromCsvDataset, TransformedDataset
from common.meta import get_metafeatures, get_imsize_and_targets


def _path_and_image_size(path):
    """Return ``(path, size)`` for the image file at ``path``.

    ``size`` is PIL's ``Image.size`` tuple. The image is opened inside a
    ``with`` block so its file handle is released as soon as the size has been
    read, instead of staying open until the image object is garbage collected.
    """
    with Image.open(path) as img:
        return path, img.size


# Ground truth for the filtered validation set.  Every sample is mapped to
# (file path, image size) and every label is shifted down by one
# (presumably 1-based in the CSV -> 0-based to line up with arg-max indices).
dataset = FilesFromCsvDataset("../output/filtered_val_dataset.csv")
dataset = TransformedDataset(dataset,
                             transforms=_path_and_image_size,
                             target_transforms=lambda l: l - 1)

df_imsize_targets = get_imsize_and_targets(dataset)
# Only the 'target' column is needed; its index is used later to align predictions.
y_true = df_imsize_targets['target']

## Validation result from a single model

In [42]:
!ls {OUTPUT_PATH}

val_probas_inceptionresnetv2_350_resized_crop
val_probas_inceptionresnetv2_350_weighted_sampler2_resized_crop
val_probas_inceptionv4_350_cls_wts_adam_resized_crop
val_probas_inceptionv4_350_fc_random_resized_crop
val_probas_inceptionv4_350_resized_crop
val_probas_nasnetalarge_350_random_resized_crop
val_probas_nasnetalarge_350_resized_crop
val_probas_squeezenet_350


In [72]:
!ls {OUTPUT_PATH}/val_probas_inceptionv4_350_fc_random_resized_crop/

20180520_2032


In [68]:
# Probabilities of the single run to evaluate; change the model folder /
# timestamp to score a different run.
prediction_file = OUTPUT_PATH / "val_probas_nasnetalarge_350_resized_crop" / "20180428_1654" / "probas.csv"

In [69]:
# Load the run's probability table, using its `id` column as the index.
df = pd.read_csv(prediction_file, index_col='id')

In [70]:
# Average all rows that share an id, reorder the result to the ground-truth
# index, then take the column position with the highest mean as the prediction.
id_means = df.groupby('id').mean()
y_probas = id_means.loc[y_true.index, :]
y_pred = y_probas.values.argmax(axis=1)

In [71]:
from sklearn.metrics import accuracy_score

# Validation error rate = 1 - accuracy (fraction of misclassified samples).
1.0 - accuracy_score(y_true, y_pred)

0.1311397234144015

### Results:

| Model | Val Error | Test 30% Error |
| --- | --- | --- |
| inceptionresnetv2_350_resized_crop/20180428_1622 | 0.13829279923700522 | 0.15468 |
| inceptionresnetv2_350_weighted_sampler2_resized_crop/20180501_1710 | 0.1702432045779685 | 0.17890 |
| inceptionv4_350_cls_wts_adam_resized_crop/20180520_2026 | 0.14592274678111583 |  | 
| inceptionv4_350_fc_random_resized_crop/20180520_2032 | 0.14353838817358133 | 0.16614 |
| inceptionv4_350_resized_crop/20180428_1633 | 0.13718009855348912 | 0.15755 | 
| nasnetalarge_350_random_resized_crop/20180514_2202 | 0.1351136544269591 | 0.15026 |
| nasnetalarge_350_random_resized_crop/20180514_2232 | 0.14147194404705132 | 0.15442 |
| nasnetalarge_350_resized_crop/20180428_1654 | 0.1311397234144015 | 0.14635 |
| squeezenet_350/20180428_1447 | 0.24845016690510258 |  |



## Mean probability of multiple predictions

In [181]:
# (model folder, run timestamp) of every validation run in the ensemble.
# The order matters: later cells address the runs positionally as w_0 .. w_7.
val_runs = [
    ("val_probas_inceptionv4_350_resized_crop", "20180428_1633"),
    ("val_probas_inceptionv4_350_cls_wts_adam_resized_crop", "20180520_2026"),
    ("val_probas_inceptionv4_350_fc_random_resized_crop", "20180520_2032"),

    ("val_probas_inceptionresnetv2_350_resized_crop", "20180428_1622"),
    ("val_probas_inceptionresnetv2_350_weighted_sampler2_resized_crop", "20180501_1710"),

    ("val_probas_nasnetalarge_350_resized_crop", "20180428_1654"),
    ("val_probas_nasnetalarge_350_random_resized_crop", "20180514_2232"),
    ("val_probas_nasnetalarge_350_random_resized_crop", "20180514_2202"),
]
prediction_files = [
    OUTPUT_PATH.joinpath(folder, run, "probas.csv") for folder, run in val_runs
]

In [184]:
# Read each run's probability table (indexed by `id`), in the same order as
# `prediction_files`.
dfs = list(map(lambda f: pd.read_csv(f, index_col='id'), prediction_files))
assert len(prediction_files) == len(dfs)

In [185]:
# Unweighted ensemble: stack every run's rows, average all rows sharing an id,
# align to the ground-truth index and pick the arg-max column per sample.
df_probas = pd.concat(dfs, axis=0)
y_probas = df_probas.groupby('id').mean().loc[y_true.index, :]
y_pred = y_probas.values.argmax(axis=1)

In [186]:
from sklearn.metrics import accuracy_score

# Validation error rate of the mean-probability ensemble.
1.0 - accuracy_score(y_true, y_pred)

0.11937688761723098

### Geometric mean probability

In [109]:
def gmean(arr):
    """Return the geometric mean of the values in ``arr``.

    The result is computed as ``exp(mean(log(arr)))`` instead of
    ``prod(arr) ** (1 / n)``: the direct product of many small probabilities
    underflows to 0.0 (and of large values overflows to inf), while the
    average of the logarithms stays well inside floating-point range.

    Parameters
    ----------
    arr : array-like
        Non-negative numbers (a list, NumPy array or pandas Series).  A zero
        entry makes the result exactly 0.0.

    Returns
    -------
    float
        The geometric mean of ``arr``.

    Raises
    ------
    ValueError
        If ``arr`` is empty.
    """
    values = np.asarray(arr, dtype=float)
    if values.size == 0:
        raise ValueError("gmean() requires at least one value")
    # log(0) == -inf is the desired intermediate for a zero entry (it yields a
    # geometric mean of 0.0), so silence the divide-by-zero warning it triggers.
    with np.errstate(divide="ignore"):
        return np.exp(np.log(values).mean())

In [112]:
# y_probas_gmean = df_probas.groupby('id').agg(gmean)
# y_probas_gmean = y_probas_gmean.loc[y_true.index, :]
# y_pred_gmean = np.argmax(y_probas_gmean.values, axis=1)
# y_pred = np.argmax(y_probas.values, axis=1)

# 1.0 - accuracy_score(y_true, y_pred)

### Search of ensemble weights (hyperopt TPE)

In [113]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [182]:
from sklearn.model_selection import StratifiedKFold

# No `random_state` is passed: the splitter does not shuffle, so a seed has no
# effect on the folds, and recent scikit-learn releases raise a ValueError when
# `random_state` is combined with the default `shuffle=False`.
splt = StratifiedKFold(n_splits=5)
# Per-run probability tables (indexed by `id`), in the order of `prediction_files`.
dfs = [pd.read_csv(f, index_col='id') for f in prediction_files]

In [206]:
def score(params):
    """Hyperopt objective: error of the weighted-sum ensemble given by ``params``.

    ``params`` maps ``"w_<i>"`` to the weight applied to the i-th table of the
    global ``dfs``.  Each table is scaled by its weight, rows sharing an id are
    summed, and the arg-max column becomes the predicted class.  Predictions
    are fixed before the folds of ``splt`` are drawn; the folds only select the
    samples on which each error is measured.

    Returns a dict with ``loss`` (mean error over the train-side indices of
    the folds), ``status`` and ``val_score`` (mean error over the held-out
    indices).  Progress is printed on one line per trial.
    """
    weights = [params["w_{}".format(idx)] for idx in range(len(dfs))]
    print("Trial: weights: {}".format(weights), end=" -> ")

    # Weighted sum of the probability tables, aligned to the ground truth.
    scaled_frames = [weight * frame for weight, frame in zip(weights, dfs)]
    summed = pd.concat(scaled_frames, axis=0).groupby('id').sum()
    predicted = np.argmax(summed.loc[y_true.index, :].values, axis=1)

    targets = y_true.values
    train_errors, heldout_errors = [], []
    for train_index, test_index in splt.split(targets[:, None], targets):
        train_errors.append(
            1.0 - accuracy_score(targets[train_index], predicted[train_index]))
        heldout_errors.append(
            1.0 - accuracy_score(targets[test_index], predicted[test_index]))

    mean_train_error = np.mean(train_errors)
    mean_heldout_error = np.mean(heldout_errors)
    print("Score: cv={} | test={}".format(mean_train_error, mean_heldout_error))
    return {'loss': mean_train_error, 'status': STATUS_OK, 'val_score': mean_heldout_error}
    
    
def get_uniform(name, low=0.10, high=0.90):
    """Return a hyperopt uniform search dimension labelled ``name``.

    Parameters
    ----------
    name : str
        Label of the hyperparameter inside the search space.
    low, high : float, optional
        Bounds of the uniform distribution.  The defaults keep the original
        range of 0.10 to 0.90.
    """
    return hp.uniform(name, low, high)
    
    
def optimize(trials, max_evals):
    """Search for ensemble weights that minimise ``score`` using hyperopt TPE.

    One uniform dimension ``"w_<i>"`` is created per entry of the global
    ``prediction_files``.  ``trials`` is handed to ``fmin`` together with the
    evaluation budget ``max_evals``; the value returned by ``fmin`` is passed
    back to the caller unchanged.
    """
    labels = ["w_{}".format(i) for i in range(len(prediction_files))]
    search_space = {label: get_uniform(label) for label in labels}
    return fmin(score, search_space, algo=tpe.suggest, trials=trials, max_evals=max_evals)


# Run the weight search with a budget of 500 evaluations.  `trials` is passed
# to the optimiser and inspected afterwards (see `trials.best_trial` below).
trials = Trials()
best_weights = optimize(trials, max_evals=500)
# Show the weights returned by the search.
best_weights

In [157]:
# Show the selected weights next to the full record of the best trial.
best_weights, trials.best_trial

({'w_0': 0.8654605129896079,
  'w_1': 0.5321224537280229,
  'w_2': 0.1886420238665844,
  'w_3': 0.7444695911930883,
  'w_4': 0.24655462551634644,
  'w_5': 0.5547166610070141,
  'w_6': 0.4298804200059071,
  'w_7': 0.3334794430497531},
 {'book_time': datetime.datetime(2018, 5, 20, 21, 41, 59, 34000),
  'exp_key': None,
  'misc': {'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'idxs': {'w_0': [161],
    'w_1': [161],
    'w_2': [161],
    'w_3': [161],
    'w_4': [161],
    'w_5': [161],
    'w_6': [161],
    'w_7': [161]},
   'tid': 161,
   'vals': {'w_0': [0.8654605129896079],
    'w_1': [0.5321224537280229],
    'w_2': [0.1886420238665844],
    'w_3': [0.7444695911930883],
    'w_4': [0.24655462551634644],
    'w_5': [0.5547166610070141],
    'w_6': [0.4298804200059071],
    'w_7': [0.3334794430497531]},
   'workdir': None},
  'owner': None,
  'refresh_time': datetime.datetime(2018, 5, 20, 21, 41, 59, 124000),
  'result': {'loss': 0.11698940006708616,
   'status': 'ok',
   'val_sc

In [190]:
# Re-score the whole validation set with the weights found by the search:
# scale each run's table, sum all rows sharing an id, align to the ground
# truth and take the arg-max column as the predicted class.
weights = [best_weights["w_{}".format(i)] for i, _ in enumerate(dfs)]
weighted_dfs = [w * frame for w, frame in zip(weights, dfs)]

df_probas = pd.concat(weighted_dfs, axis=0)
y_probas = df_probas.groupby('id').sum().loc[y_true.index, :]
y_pred = y_probas.values.argmax(axis=1)

1.0 - accuracy_score(y_true.values, y_pred)

0.11699252900969637

### Compute test probabilities

In [191]:
# Weights selected by the search above, pinned as literals so the test-set
# cells below do not require re-running the optimisation.
best_weights = dict(
    w_0=0.8654605129896079,
    w_1=0.5321224537280229,
    w_2=0.1886420238665844,
    w_3=0.7444695911930883,
    w_4=0.24655462551634644,
    w_5=0.5547166610070141,
    w_6=0.4298804200059071,
    w_7=0.3334794430497531,
)

In [193]:
# From here on OUTPUT_PATH points at the test-set probabilities.
# NOTE: this rebinds the name used for the validation folder earlier in the
# notebook, so re-running earlier cells after this one reads from the wrong place.
OUTPUT_PATH = Path.cwd().parent.joinpath("output", "test_probas")

In [197]:
!ls {OUTPUT_PATH}/test_probas_inceptionv4_350_fc_random_resized_crop

20180520_2151


In [198]:
# (model folder, run timestamp) of every test-set run in the ensemble.
# The weights w_0 .. w_7 are applied by position, so this list must follow the
# same model order as the validation `prediction_files`.
# NOTE(review): confirm that the two nasnetalarge_350_random_resized_crop runs
# are listed in the order matching their validation counterparts.
test_runs = [
    ("test_probas_inceptionv4_350_resized_crop", "20180429_1303"),
    ("test_probas_inceptionv4_350_cls_wts_adam_resized_crop", "20180520_2151"),
    ("test_probas_inceptionv4_350_fc_random_resized_crop", "20180520_2151"),

    ("test_probas_inceptionresnetv2_350_resized_crop", "20180429_1242"),
    ("test_probas_inceptionresnetv2_350_weighted_sampler2_resized_crop", "20180501_1725"),

    ("test_probas_nasnetalarge_350_resized_crop", "20180429_1406"),
    ("test_probas_nasnetalarge_350_random_resized_crop", "20180514_0821"),
    ("test_probas_nasnetalarge_350_random_resized_crop", "20180514_1034"),
]
test_prediction_files = [
    OUTPUT_PATH.joinpath(folder, run, "probas.csv") for folder, run in test_runs
]

In [208]:
# One weight per test run (by position), then each run's probability table
# (indexed by `id`) scaled by its weight.
weights = [best_weights["w_{}".format(i)] for i, _ in enumerate(test_prediction_files)]
test_dfs = [
    weight * pd.read_csv(path, index_col='id')
    for weight, path in zip(weights, test_prediction_files)
]

In [209]:
# Sample submission, indexed by `id`; used below as the template for the output file.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# the dataset is mounted at this exact location.
sample_predicitions_df = pd.read_csv("/home/fast_storage/imaterialist-challenge-furniture-2018/sample_submission_randomlabel.csv", index_col='id')

In [210]:
# Peek at the first rows to check the submission layout (`id` index, `predicted` column).
sample_predicitions_df.head()

Unnamed: 0_level_0,predicted
id,Unnamed: 1_level_1
1,57
2,74
3,52
4,54
5,39


In [215]:
# Weighted ensemble on the test set: stack the already-weighted tables, sum all
# rows sharing an id and take the arg-max column as the predicted class.
df_test_probas = pd.concat(test_dfs, axis=0)
y_test_probas = df_test_probas.groupby('id').sum()
y_test_pred = y_test_probas.values.argmax(axis=1)

# Start from a copy of the sample submission and overwrite the rows we have
# predictions for; +1 shifts the arg-max position back to the label numbering
# used in the submission file.  The CSV is written to the working directory.
df = sample_predicitions_df.copy()
df.loc[y_test_probas.index, 'predicted'] = 1 + y_test_pred
df.to_csv("weighted_proba_8models.csv")

In [216]:
!cd ../ && python3 utils/update_test_predictions.py notebooks/weighted_proba_8models.csv notebooks/test_with_labels.csv notebooks/

In [217]:
!head -10 fixed_weighted_proba_8models.csv

id,predicted
1,12
2,71
3,91
4,54
5,126
6,76
7,94
8,8
9,127


```
id,predicted
1,12
2,71
3,91
4,54
5,126
6,76
7,94
8,8
9,127
```