# Compare predictions

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd

In [3]:
OUTPUT_PATH = Path(".").absolute().parent / "output" / "test"

In [4]:
# !head -50 ../output/inference_FurnitureInceptionV4_350_20180425_2258/predict.log

In [5]:
!ls -all ../output/test/test_retinanet_cls_only/20180514_2037/predictions.csv

-rw-r--r-- 1 1000 1000 106081 May 14 20:51 ../output/test/test_retinanet_cls_only/20180514_2037/predictions.csv


In [6]:
# !head -50 ../output/inference_FurnitureInceptionResNet299_20180426_1440/predict.log

In [7]:
prediction_files = [
    OUTPUT_PATH / "inference_FurnitureNASNetALarge_20180418_0635" / "predictions.csv",
    OUTPUT_PATH / "test_nasnetalarge_350_resized_crop" / "20180428_1455" / "predictions.csv",
    
    OUTPUT_PATH / "inference_FurnitureInceptionResNet299_20180426_1440" / "predictions.csv",
    
#     OUTPUT_PATH / "inference_FurnitureInceptionV4_350_20180425_2258" / "predictions.csv",
#     OUTPUT_PATH / "inference_FurnitureInceptionV4_350_20180428_0808" / "predictions.csv",    
# #     OUTPUT_PATH / "test_inceptionv4_350_fc_random_resized_crop" / "20180507_1116" / "predictions.csv",    
# #     OUTPUT_PATH / "test_resnet101_350_finetune_random_resized_crop" / "20180509_0937" / "predictions.csv",
    OUTPUT_PATH / "test_nasnetalarge_350_random_resized_crop" / "20180514_0917" / "predictions.csv",
    OUTPUT_PATH / "test_nasnetalarge_350_random_resized_crop" / "20180514_1126" / "predictions.csv",    
]

# prediction_files = [
#     "kaggle_maj_votes_2nasnet_2incv4_incv4rc.csv",
#     OUTPUT_PATH / "test_retinanet_cls_only" / "20180514_2037" / "predictions.csv",
# ]

In [8]:
names = [f.parent.name.replace("inference_", "") for f in prediction_files]
names
# names = ["kaggle_maj_votes_2nasnet_2incv4_incv4rc", "20180514_2037"]

['FurnitureNASNetALarge_20180418_0635',
 '20180428_1455',
 'FurnitureInceptionResNet299_20180426_1440',
 '20180514_0917',
 '20180514_1126']

In [9]:
dfs = [pd.read_csv(f, index_col='id') for f in prediction_files]

merged_df = pd.concat([df for df in dfs], axis=1)
merged_df.columns = names

In [10]:
merged_df.head(10)

Unnamed: 0_level_0,FurnitureNASNetALarge_20180418_0635,20180428_1455,FurnitureInceptionResNet299_20180426_1440,20180514_0917,20180514_1126
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,12,12,12,12,12
2,71,71,71,61,71
3,91,91,91,91,91
4,54,54,54,54,54
5,126,126,126,126,126
6,76,76,76,76,76
7,94,94,94,94,94
8,8,8,8,8,8
9,127,127,127,127,127
10,117,117,117,117,117


In [11]:
disagreement_mask = merged_df.mean(axis=1).apply(lambda x: int(x) != x)
print(disagreement_mask.sum(), disagreement_mask.shape[0])

1739 12800


In [12]:
merged_df[disagreement_mask].head(10)

Unnamed: 0_level_0,FurnitureNASNetALarge_20180418_0635,20180428_1455,FurnitureInceptionResNet299_20180426_1440,20180514_0917,20180514_1126
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12,128,19,19,19,19
14,77,77,45,77,77
20,108,108,27,49,49
24,15,63,63,15,26
26,89,89,89,97,97
38,75,61,61,61,61
47,82,127,43,43,43
48,15,15,63,15,15
52,127,127,127,127,110
76,14,30,30,30,30


In [13]:
merged_df[disagreement_mask].tail(10)

Unnamed: 0_level_0,FurnitureNASNetALarge_20180418_0635,20180428_1455,FurnitureInceptionResNet299_20180426_1440,20180514_0917,20180514_1126
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12723,30,30,30,37,37
12729,77,81,77,77,77
12739,105,105,73,100,100
12741,84,84,84,84,51
12747,42,113,13,113,113
12766,60,106,106,106,106
12776,79,115,37,79,79
12787,54,54,54,54,50
12795,2,2,88,2,88
12797,75,75,75,61,61


In [63]:
def get_decision_fn(weights):    
    def fn(row):
        votes = np.zeros(128 + 1, dtype=np.int)
        for r, w in zip(row, weights):
            votes[r] += w
        return np.argmax(votes)
    return fn

In [64]:
merged_df.loc[:, 'MajVote'] = merged_df.apply(get_decision_fn(weights=[1] * merged_df.shape[1]), axis=1)

In [217]:
merged_df[disagreement_mask].head(10)

Unnamed: 0_level_0,FurnitureNASNetALarge_20180418_0635,20180428_1455,FurnitureInceptionResNet299_20180426_1440,20180514_0917,20180514_1126,MajVote
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,71,71,71,61,71,71
12,128,19,19,19,19,19
14,77,77,45,77,77,77
24,15,63,63,15,26,15
26,89,89,89,97,97,89
38,75,61,61,61,61,61
43,3,58,58,58,58,58
47,82,127,43,43,43,43
52,127,127,127,127,110,127
61,27,112,27,112,112,112


In [34]:
merged_df[disagreement_mask].tail(10)

Unnamed: 0_level_0,FurnitureNASNetALarge_20180418_0635,20180428_1455,FurnitureInceptionResNet299_20180426_1440,20180514_0917,20180514_1126,MajVote
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12723,30,30,30,37,37,30
12729,77,81,77,77,77,77
12739,105,105,73,100,100,100
12741,84,84,84,84,51,84
12747,42,113,13,113,113,113
12766,60,106,106,106,106,106
12776,79,115,37,79,79,79
12787,54,54,54,54,50,54
12795,2,2,88,2,88,2
12797,75,75,75,61,61,75


In [35]:
merged_df['MajVote'].to_csv("maj_votes_4nasnet_incv4rc.csv", header=["predicted"])

In [36]:
!head maj_votes_4nasnet_incv4rc.csv

id,predicted
1,12
2,71
3,91
4,54
5,126
6,76
7,94
8,8
9,127


In [37]:
!cd ../ && python3 utils/update_test_predictions.py notebooks/maj_votes_4nasnet_incv4rc.csv notebooks/test_with_labels.csv notebooks/

## Probas on validation

In [14]:
from pathlib import Path
import sys
sys.path.insert(0, Path(".").absolute().parent.as_posix())

In [15]:
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns


In [16]:
import numpy as np
import pandas as pd


OUTPUT_PATH = Path(".").absolute().parent / "output" / "val_probas"

In [17]:
# !head -50 ../output/val_probas/val_probas_nasnetalarge_350_random_resized_crop/20180514_2232/predict.log

In [18]:
# prediction_files = [
#     OUTPUT_PATH / "val_probas_inceptionresnetv2_350_resized_crop" / "20180428_1622" / "probas.csv",
#     OUTPUT_PATH / "val_probas_inceptionv4_350_resized_crop" / "20180428_1633" / "probas.csv",
#     OUTPUT_PATH / "val_probas_nasnetalarge_350_resized_crop" / "20180428_1654" / "probas.csv",    
# #     OUTPUT_PATH / "val_probas_inceptionresnetv2_350_weighted_sampler2_resized_crop" / "20180501_1710" / "probas.csv"
# ]

In [19]:
prediction_files = [
    OUTPUT_PATH / "val_probas_inceptionv4_350_resized_crop" / "20180428_1633" / "probas.csv",
    
    OUTPUT_PATH / "val_probas_inceptionresnetv2_350_resized_crop" / "20180428_1622" / "probas.csv",
    OUTPUT_PATH / "val_probas_inceptionresnetv2_350_weighted_sampler2_resized_crop" / "20180501_1710" / "probas.csv",
    
    OUTPUT_PATH / "val_probas_nasnetalarge_350_resized_crop" / "20180428_1654" / "probas.csv",
    OUTPUT_PATH / "val_probas_nasnetalarge_350_random_resized_crop" / "20180514_2232" / "probas.csv",
    OUTPUT_PATH / "val_probas_nasnetalarge_350_random_resized_crop" / "20180514_2202" / "probas.csv",

]

In [20]:
dfs = [pd.read_csv(f, index_col='id') for f in prediction_files]
assert len(dfs) == len(prediction_files)

# merged_df = pd.concat([df for df in dfs], axis=1)
# merged_df.columns = names

In [21]:
df_probas = pd.concat(dfs, axis=0)

### Probability mean/gmean value

In [22]:
y_probas = df_probas.groupby('id').mean()
y_probas.head()

Unnamed: 0_level_0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c118,c119,c120,c121,c122,c123,c124,c125,c126,c127
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,8.099996e-08,4.26587e-07,4.541907e-08,5.151554e-08,3.870617e-08,3.217309e-07,7.076326e-08,9.465426e-07,1.436254e-08,2.117107e-06,...,1.25328e-07,4.684346e-08,3.235977e-08,1.542368e-07,3.320246e-07,4.47544e-08,6.428985e-07,3.916781e-07,7.666353e-08,1.286044e-08
2,2.484259e-05,9.986498e-06,0.001597136,0.001278619,6.304915e-06,4.26703e-05,9.965587e-06,0.0002885103,5.956633e-06,5.569256e-06,...,2.196419e-06,0.000906416,1.769369e-05,0.001486118,1.520937e-05,7.971722e-06,2.111096e-05,0.0001613956,9.314521e-06,0.000215033
3,2.587277e-07,3.339431e-06,4.295129e-07,1.654437e-06,1.168454e-06,3.593112e-07,3.280693e-06,6.895329e-07,1.011343e-06,7.19398e-07,...,3.013027e-07,1.655807e-05,3.919396e-05,5.053013e-06,1.049569e-06,0.001979417,7.044605e-07,2.336234e-06,1.239963e-06,2.398526e-06
4,8.503485e-06,2.650881e-06,3.532965e-06,0.0021965,2.384373e-06,2.206019e-06,2.224744e-06,0.001680988,2.378237e-06,9.128945e-07,...,6.558547e-07,1.942276e-05,3.471601e-06,2.19829e-06,1.364051e-06,6.047988e-06,2.447377e-06,0.9053854,1.835148e-06,5.252214e-06
5,7.039414e-06,2.159147e-08,7.138873e-07,7.225547e-05,4.930302e-08,1.265603e-05,4.930162e-08,3.251224e-07,2.168924e-08,1.0765e-07,...,3.082958e-07,1.943459e-05,1.187292e-07,1.533542e-07,1.623822e-08,1.870446e-08,2.737252e-08,2.558466e-08,1.295808e-08,1.162813e-07


In [25]:
def gmean(arr):
    l = len(arr)
    prod = np.prod(arr)
    return np.power(prod, 1.0/l)

In [26]:
from PIL import Image
from common.dataset import FilesFromCsvDataset, TransformedDataset
from common.meta import get_metafeatures, get_imsize_and_targets


dataset = FilesFromCsvDataset("../output/filtered_val_dataset.csv")
dataset = TransformedDataset(dataset,
                             transforms=lambda x: (x, Image.open(x).size),
                             target_transforms=lambda l: l - 1)

df_imsize_targets = get_imsize_and_targets(dataset)

y_true = df_imsize_targets['target']
y_probas = y_probas.loc[y_true.index, :]

In [None]:
y_probas_gmean = df_probas.groupby('id').agg(gmean)
y_probas_gmean = y_probas_gmean.loc[y_true.index, :]
y_pred_gmean = np.argmax(y_probas_gmean.values, axis=1)

In [125]:
y_pred = np.argmax(y_probas.values, axis=1)

In [126]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score

In [127]:
# 1.0 - accuracy_score(y_true, y_pred), 1.0 - accuracy_score(y_true, y_pred_gmean)

In [128]:
1.0 - accuracy_score(y_true, y_pred)

0.12096646002225397

Random search on weights

In [129]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [130]:
y_true = df_imsize_targets['target']


def score(params):
    weights = [params["w_{}".format(i)] for i in range(len(prediction_files))]
    
    print("Trial: weights: {}".format(weights), end=" -> ")
    
    dfs = [w * pd.read_csv(f, index_col='id') for w, f in zip(weights, prediction_files)]
    df_probas = pd.concat(dfs, axis=0)
    y_probas = df_probas.groupby('id').sum()
    y_probas = y_probas.loc[y_true.index, :]
    y_pred = np.argmax(y_probas.values, axis=1)
    
    res = 1.0 - accuracy_score(y_true, y_pred)
    print("Score: {}".format(res))
    return {'loss': res, 'status': STATUS_OK}
    
    
def get_uniform(name):
    return hp.uniform(name, 0.2, 0.8)
    
    
def optimize(trials):
    params = {}
    for i in range(len(prediction_files)):
        n = "w_{}".format(i)
        params[n] = get_uniform(n)   
    best = fmin(score, params, algo=tpe.suggest, trials=trials, max_evals=200)    
    return best


trials = Trials()
best_weights = optimize(trials)
best_weights

Trial: weights: [0.32038462851493993, 0.2895648090774444, 0.6053060906857075, 0.7142163821747616, 0.6150732458829893, 0.741551417945066] -> Score: 0.12144333174376087
Trial: weights: [0.6381176956987362, 0.25575777365144703, 0.4571436705424128, 0.6608323475744745, 0.7305738020670691, 0.263884341049077] -> Score: 0.11985375933873788
Trial: weights: [0.3616935203835524, 0.5932277442209484, 0.33563145344531725, 0.4025890697792883, 0.29975323748563976, 0.6297866394444784] -> Score: 0.12080750278175167
Trial: weights: [0.7776524172570742, 0.7721033006083982, 0.5112949406515839, 0.2833970395171603, 0.2274055891956006, 0.761332626613455] -> Score: 0.12017167381974247
Trial: weights: [0.49777197153778535, 0.31034521110679136, 0.25964578249999926, 0.2641075935126929, 0.3700624414101603, 0.733391745284188] -> Score: 0.11937688761723098
Trial: weights: [0.4207096401520402, 0.5812018365091522, 0.6912874498012802, 0.28945969735721105, 0.5298130904587948, 0.7508375820979252] -> Score: 0.121920203465

{'w_0': 0.7390826642363332,
 'w_1': 0.47540518110214636,
 'w_2': 0.6590564068735124,
 'w_3': 0.7898424107198876,
 'w_4': 0.378873115681332,
 'w_5': 0.3868429981334225}

In [131]:
best_weights, trials.best_trial

({'w_0': 0.7390826642363332,
  'w_1': 0.47540518110214636,
  'w_2': 0.6590564068735124,
  'w_3': 0.7898424107198876,
  'w_4': 0.378873115681332,
  'w_5': 0.3868429981334225},
 {'book_time': datetime.datetime(2018, 5, 15, 21, 14, 28, 977000),
  'exp_key': None,
  'misc': {'cmd': ('domain_attachment', 'FMinIter_Domain'),
   'idxs': {'w_0': [191],
    'w_1': [191],
    'w_2': [191],
    'w_3': [191],
    'w_4': [191],
    'w_5': [191]},
   'tid': 191,
   'vals': {'w_0': [0.7390826642363332],
    'w_1': [0.47540518110214636],
    'w_2': [0.6590564068735124],
    'w_3': [0.7898424107198876],
    'w_4': [0.378873115681332],
    'w_5': [0.3868429981334225]},
   'workdir': None},
  'owner': None,
  'refresh_time': datetime.datetime(2018, 5, 15, 21, 14, 29, 974000),
  'result': {'loss': 0.11762835797170557, 'status': 'ok'},
  'spec': None,
  'state': 2,
  'tid': 191,
  'version': 0})

In [132]:
weights = [best_weights["w_{}".format(i)] for i in range(len(prediction_files))]
# weights = [0.7, 0.25, 0.65, 0.42, 0.48]

dfs = [w * pd.read_csv(f, index_col='id') for w, f in zip(weights, prediction_files)]
assert len(dfs) == len(prediction_files)

# merged_df = pd.concat([df for df in dfs], axis=1)
# merged_df.columns = names

In [133]:
df_probas = pd.concat(dfs, axis=0)

y_probas = df_probas.groupby('id').sum()


y_probas = y_probas.loc[y_true.index, :]
y_pred = np.argmax(y_probas.values, axis=1)

In [134]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score

In [135]:
1.0 - accuracy_score(y_true, y_pred)

0.11762835797170557

In [137]:
best_weights

{'w_0': 0.7390826642363332,
 'w_1': 0.47540518110214636,
 'w_2': 0.6590564068735124,
 'w_3': 0.7898424107198876,
 'w_4': 0.378873115681332,
 'w_5': 0.3868429981334225}

Results:

    OUTPUT_PATH / "val_probas_inceptionv4_350_resized_crop" / "20180428_1633" / "probas.csv",
    
    OUTPUT_PATH / "val_probas_inceptionresnetv2_350_resized_crop" / "20180428_1622" / "probas.csv",
    
    OUTPUT_PATH / "val_probas_nasnetalarge_350_resized_crop" / "20180428_1654" / "probas.csv",
    OUTPUT_PATH / "val_probas_nasnetalarge_350_random_resized_crop" / "20180514_2232" / "probas.csv",
    OUTPUT_PATH / "val_probas_nasnetalarge_350_random_resized_crop" / "20180514_2202" / "probas.csv",
    
{'w_0': 0.6952457891587356,     'w_1': 0.24113876273518597,     'w_2': 0.6427090350030666,     'w_3': 0.41338501244162207,     'w_4': 0.4838561104971754}  -> 0.11858210141471948



    OUTPUT_PATH / "val_probas_inceptionv4_350_resized_crop" / "20180428_1633" / "probas.csv",
    
    OUTPUT_PATH / "val_probas_inceptionresnetv2_350_resized_crop" / "20180428_1622" / "probas.csv",
    OUTPUT_PATH / "val_probas_inceptionresnetv2_350_weighted_sampler2_resized_crop" / "20180501_1710" / "probas.csv",
    
    OUTPUT_PATH / "val_probas_nasnetalarge_350_resized_crop" / "20180428_1654" / "probas.csv",
    OUTPUT_PATH / "val_probas_nasnetalarge_350_random_resized_crop" / "20180514_2232" / "probas.csv",
    OUTPUT_PATH / "val_probas_nasnetalarge_350_random_resized_crop" / "20180514_2202" / "probas.csv",


{'w_0': 0.7390826642363332, 'w_1': 0.47540518110214636, 'w_2': 0.6590564068735124, 'w_3': 0.7898424107198876, 'w_4': 0.378873115681332, 'w_5': 0.3868429981334225} -> 
0.11762835797170557

### Learn weights between models

In [23]:
import os
os.environ['JOBLIB_TEMP_FOLDER']="/tmp"

In [27]:
df_probas_aligned = pd.concat(dfs, axis=1)
df_probas_aligned = df_probas_aligned.loc[y_true.index, :]
df_probas_aligned.shape

(6291, 768)

In [281]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

seed = 15

holdout = StratifiedShuffleSplit(test_size=0.3, random_state=seed)

x_all = df_probas_aligned.values
y_all = y_true.values
train_index, test_index = next(holdout.split(x_all, y_all))

x_trainval = x_all[train_index]
x_test = x_all[test_index]

y_trainval = y_all[train_index]
y_test = y_all[test_index]

In [286]:
split = StratifiedKFold(n_splits=4, random_state=seed)

log_reg_cv = LogisticRegressionCV(cv=split, n_jobs=5, max_iter=750, verbose=True)

In [287]:
log_reg_cv.fit(x_trainval, y_trainval)

[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   41.7s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:  3.4min
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:  8.2min
[Parallel(n_jobs=5)]: Done 512 out of 512 | elapsed: 10.0min finished


LogisticRegressionCV(Cs=10, class_weight=None,
           cv=StratifiedKFold(n_splits=4, random_state=15, shuffle=False),
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=750, multi_class='ovr', n_jobs=5, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=True)

In [290]:
[np.mean(log_reg_cv.scores_[i], axis=0).max() for i in range(10)]

[0.9986282735399113,
 0.9977577016097718,
 0.9980007060364007,
 0.9943301553171191,
 0.9995659722222222,
 0.9988450434509585,
 0.9984216165563875,
 0.9988984047026512,
 0.9991034375321204,
 0.9990751745976383]

In [288]:
y_test_pred = log_reg_cv.predict(x_test)

In [289]:
1.0 - accuracy_score(y_test, y_test_pred)

0.13082627118644063

In [153]:
from sklearn.externals import joblib

joblib.dump(log_reg_cv, "trained_log_reg_cv.pkl")

['trained_log_reg_cv.pkl']

By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
is equal to the number of observations known to be in group :math:`i` but
predicted to be in group :math:`j`.

In [56]:
cm = confusion_matrix(y_true, y_pred)
recall_per_class = recall_score(y_true, y_pred, average=None)

In [57]:
recall_per_class

array([0.88      , 0.84      , 0.92      , 0.52083333, 0.96      ,
       0.91489362, 0.89361702, 0.92      , 0.9375    , 0.9787234 ,
       0.96      , 0.97959184, 0.9375    , 0.88      , 0.32      ,
       1.        , 0.89583333, 0.9375    , 0.68      , 0.96      ,
       0.87234043, 0.78723404, 0.81632653, 0.95918367, 0.81632653,
       0.9375    , 0.69387755, 0.89795918, 0.85714286, 0.82      ,
       0.79166667, 0.95918367, 0.98      , 0.9375    , 0.77083333,
       0.95833333, 0.95918367, 0.97959184, 0.72      , 0.98      ,
       0.98      , 0.95833333, 1.        , 1.        , 0.89583333,
       1.        , 0.82      , 0.89583333, 0.71428571, 0.67346939,
       0.75      , 0.89795918, 0.84      , 0.7755102 , 1.        ,
       0.98      , 0.85714286, 0.87755102, 0.88      , 0.93877551,
       0.89583333, 0.88      , 0.28      , 0.97959184, 0.83333333,
       0.5       , 0.85416667, 1.        , 0.97916667, 0.79591837,
       0.97916667, 0.93877551, 0.98      , 0.81632653, 0.96   

In [58]:
idx = np.arange(len(cm))
mcm = cm - cm[idx, idx] * np.eye(len(cm), dtype=np.int)

In [59]:
misclassifed = {}

for i in range(128):
    classes = np.where(mcm[i, :] >= 4)[0]
    values = mcm[i, classes]
    if len(classes) > 0:
        misclassifed[i] = {
            'recall': recall_per_class[i],
            'wrong_classes':[(c, v) for c, v in zip(classes, values)]
        }

In [60]:
misclassifed

{1: {'recall': 0.84, 'wrong_classes': [(87, 5)]},
 3: {'recall': 0.5208333333333334, 'wrong_classes': [(2, 7), (28, 5)]},
 14: {'recall': 0.32, 'wrong_classes': [(3, 5), (28, 6), (62, 7), (125, 6)]},
 18: {'recall': 0.68, 'wrong_classes': [(79, 4), (127, 6)]},
 21: {'recall': 0.7872340425531915, 'wrong_classes': [(16, 4)]},
 26: {'recall': 0.6938775510204082, 'wrong_classes': [(111, 10)]},
 27: {'recall': 0.8979591836734694, 'wrong_classes': [(23, 4)]},
 30: {'recall': 0.7916666666666666, 'wrong_classes': [(69, 5)]},
 34: {'recall': 0.7708333333333334, 'wrong_classes': [(12, 4), (69, 5)]},
 38: {'recall': 0.72, 'wrong_classes': [(86, 10), (108, 4)]},
 48: {'recall': 0.7142857142857143, 'wrong_classes': [(124, 6)]},
 49: {'recall': 0.673469387755102, 'wrong_classes': [(19, 4), (53, 10)]},
 50: {'recall': 0.75, 'wrong_classes': [(52, 4)]},
 53: {'recall': 0.7755102040816326, 'wrong_classes': [(19, 5)]},
 57: {'recall': 0.8775510204081632, 'wrong_classes': [(2, 4)]},
 58: {'recall': 0.88,

In [25]:
misclassifed_t = {}

for i in range(128):
    classes = np.where(cm[:, i] >= 4)[0]
    values = cm[classes, i]
    if len(classes) > 1:
        misclassifed_t[i] = {
            'true_classes':[(c, v) for c, v in zip(classes, values)]
        }

In [61]:
# misclassifed_t

In [195]:
# class_index = 62
# cols = ['c{}'.format(c) for c, _ in misclassifed[class_index]['wrong_classes']] + ['c{}'.format(class_index)]

In [14]:
# df_probas.loc[y_true[y_true == class_index].index[:10], cols]

In [40]:
y_probas_mean = df_probas.groupby('id').mean()
y_probas_mean = y_probas_mean.loc[y_true.index, :]

y_pred_mean = np.argmax(y_probas_mean.values, axis=1)

from sklearn.metrics import recall_score, precision_score

recall_score(y_true, y_pred_mean, average='micro'), precision_score(y_true, y_pred_mean, average='micro')

(0.8761723096487045, 0.8761723096487045)

In [41]:
def gmean(arr):
    l = len(arr)
    prod = np.prod(arr)
    return np.power(prod, 1.0/l)

In [42]:
y_probas_gmean = df_probas.groupby('id').agg(gmean)
y_probas_gmean = y_probas_gmean.loc[y_true.index, :]

y_pred_gmean = np.argmax(y_probas_gmean.values, axis=1)

recall_score(y_true, y_pred_gmean, average='micro'), precision_score(y_true, y_pred_gmean, average='micro')

(0.8761723096487045, 0.8761723096487045)

In [45]:
y_probas_gmean.loc[1, 'c0'], y_probas_mean.loc[1, 'c0']

(6.305511012639352e-09, 1.1421323731959247e-07)

## Predict on test data

In [138]:
from pathlib import Path
import sys
sys.path.insert(0, Path(".").absolute().parent.as_posix())

In [139]:
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns

In [140]:
import numpy as np
import pandas as pd


OUTPUT_PATH = Path(".").absolute().parent / "output" / "test_probas"

In [102]:
# prediction_files = [
#     OUTPUT_PATH / "val_probas_inceptionv4_350_resized_crop" / "20180428_1633" / "probas.csv",
    
#     OUTPUT_PATH / "val_probas_inceptionresnetv2_350_resized_crop" / "20180428_1622" / "probas.csv",
    
#     OUTPUT_PATH / "val_probas_nasnetalarge_350_resized_crop" / "20180428_1654" / "probas.csv",
#     OUTPUT_PATH / "val_probas_nasnetalarge_350_random_resized_crop" / "20180514_2202" / "probas.csv",
#     OUTPUT_PATH / "val_probas_nasnetalarge_350_random_resized_crop" / "20180514_2232" / "probas.csv",    
# ]

In [103]:
# !ls ../output/test_probas/test_probas_nasnetalarge_350_random_resized_crop/20180514_1034/

In [142]:
test_prediction_files = [
    OUTPUT_PATH / "test_probas_inceptionv4_350_resized_crop" / "20180429_1303" / "probas.csv",
    
    OUTPUT_PATH / "test_probas_inceptionresnetv2_350_resized_crop" / "20180429_1242" / "probas.csv",
    OUTPUT_PATH / "test_probas_inceptionresnetv2_350_weighted_sampler2_resized_crop" / "20180501_1725" / "probas.csv",
    
    OUTPUT_PATH / "test_probas_nasnetalarge_350_resized_crop" / "20180429_1406" / "probas.csv",
    OUTPUT_PATH / "test_probas_nasnetalarge_350_random_resized_crop" / "20180514_0821" / "probas.csv",
    OUTPUT_PATH / "test_probas_nasnetalarge_350_random_resized_crop" / "20180514_1034" / "probas.csv",    
]

In [143]:
test_dfs = [pd.read_csv(f, index_col='id') for f in test_prediction_files]

# merged_df = pd.concat([df for df in dfs], axis=1)
# merged_df.columns = names

In [144]:
df_test_probas = pd.concat(test_dfs, axis=0)

y_test_probas = df_test_probas.groupby('id').mean()
# y_test_probas.head()

In [145]:
# Add mising values:
missing_indices = list(set(merged_df['MajVote'].index) - set(y_test_probas.index))
missing_values = merged_df.loc[missing_indices, 'MajVote'].values
for idx, v in zip(missing_indices, missing_values):
    y_test_probas.loc[idx, :] = 0.01
    col = y_test_probas.columns[v - 1]
    y_test_probas.loc[idx, col] = 0.99
    
y_test_pred = np.argmax(y_test_probas.values, axis=1) + 1   
print(y_test_pred.shape)

df = pd.DataFrame({"predicted": y_test_pred}, index=y_test_probas.index)
df.to_csv("mean_proba_incv4_incv4rc_3nasnet.csv")

(12800,)


In [225]:
!cd ../ && python3 utils/update_test_predictions.py notebooks/mean_proba_incv4_incv4rc_3nasnet.csv notebooks/test_with_labels.csv notebooks/

In [226]:
!head -10 fixed_mean_proba_incv4_incv4rc_3nasnet.csv

id,predicted
1,12
2,71
3,91
4,54
5,126
6,76
7,94
8,8
9,127


In [227]:
!head -10 fixed_maj_votes_4nasnet_incv4rc.csv

id,predicted
1,12
2,71
3,91
4,54
5,126
6,76
7,94
8,8
9,127


### Weighted probas 

In [146]:
best_weights

{'w_0': 0.7390826642363332,
 'w_1': 0.47540518110214636,
 'w_2': 0.6590564068735124,
 'w_3': 0.7898424107198876,
 'w_4': 0.378873115681332,
 'w_5': 0.3868429981334225}

In [147]:
weights = [best_weights["w_{}".format(i)] for i in range(len(test_prediction_files))]
test_dfs = [w * pd.read_csv(f, index_col='id') for w, f in zip(weights, test_prediction_files)]

In [148]:
df_test_probas = pd.concat(test_dfs, axis=0)
y_test_probas = df_test_probas.groupby('id').mean()

In [150]:
# Add mising values:
missing_indices = list(set(merged_df['MajVote'].index) - set(y_test_probas.index))
missing_values = merged_df.loc[missing_indices, 'MajVote'].values
for idx, v in zip(missing_indices, missing_values):
    y_test_probas.loc[idx, :] = 0.01
    col = y_test_probas.columns[v - 1]
    y_test_probas.loc[idx, col] = 0.99
    
y_test_pred = np.argmax(y_test_probas.values, axis=1) + 1   
print(y_test_pred.shape)

df = pd.DataFrame({"predicted": y_test_pred}, index=y_test_probas.index)
df.to_csv("weighted_proba_incv4_2incv4rc_3nasnet.csv")

(12800,)


In [151]:
!cd ../ && python3 utils/update_test_predictions.py notebooks/weighted_proba_incv4_2incv4rc_3nasnet.csv notebooks/test_with_labels.csv notebooks/

In [152]:
!head -10 fixed_weighted_proba_incv4_2incv4rc_3nasnet.csv

id,predicted
1,12
2,71
3,91
4,54
5,126
6,76
7,94
8,8
9,127


### Predict with trained model:

In [257]:
# To the same indices:
index = dfs[0].index
for i in range(1, len(prediction_files)):
    dfs[i] = dfs[i].loc[index, :]

In [259]:
df_test_probas_aligned = pd.concat(dfs, axis=1)
df_test_probas_aligned.shape

(12639, 640)

In [260]:
log_reg_cv = joblib.load("trained_log_reg_cv.pkl")

In [269]:
y_test_probas = log_reg_cv.predict_proba(df_test_probas_aligned.values)
y_test_probas_df = pd.DataFrame(y_test_probas, index=df_test_probas_aligned.index)

In [277]:
# Add mising values:
missing_indices = list(set(merged_df['MajVote'].index) - set(y_test_probas_df.index))
missing_values = merged_df.loc[missing_indices, 'MajVote'].values
for idx, v in zip(missing_indices, missing_values):    
    y_test_probas_df.loc[idx, :] = 0.01
    col = y_test_probas_df.columns[v - 1]
    y_test_probas_df.loc[idx, col] = 0.99
    
y_test_pred = np.argmax(y_test_probas_df.values, axis=1) + 1   
print(y_test_pred.shape)

df = pd.DataFrame({"predicted": y_test_pred}, index=y_test_probas_df.index).sort_index()
df.to_csv("log_reg_proba_incv4_incv4rc_3nasnet.csv")

(12800,)


In [278]:
!cd ../ && python3 utils/update_test_predictions.py notebooks/log_reg_proba_incv4_incv4rc_3nasnet.csv notebooks/test_with_labels.csv notebooks/

In [279]:
!head -10 fixed_log_reg_proba_incv4_incv4rc_3nasnet.csv

id,predicted
1,12
2,71
3,91
4,54
5,126
6,76
7,94
8,8
9,127


In [280]:
!head -10 fixed_maj_votes_4nasnet_incv4rc.csv

id,predicted
1,12
2,71
3,91
4,54
5,126
6,76
7,94
8,8
9,127


In [220]:
y_probas = y_probas.sort_index()
y_probas.tail()

Unnamed: 0_level_0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c118,c119,c120,c121,c122,c123,c124,c125,c126,c127
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12796,3.740855e-08,7.699472e-07,3.621496e-07,3.964026e-08,1.351679e-08,6.045437e-08,2.199541e-07,6.69059e-09,3.975444e-07,6.771315e-09,...,5.479484e-09,6.028931e-07,1.559961e-07,6.559622e-08,8.916581e-08,1.017504e-07,8.690805e-08,2.232733e-08,2.711755e-09,1.691902e-09
12797,5.812775e-06,4.587319e-06,2.474041e-05,2.48818e-05,1.054539e-05,7.284781e-06,6.714315e-06,2.000873e-05,4.21314e-06,3.69569e-06,...,4.620517e-06,2.139555e-05,2.764587e-05,1.579153e-05,7.022183e-06,0.004496449,1.263526e-05,3.531694e-05,1.338969e-05,7.176692e-06
12798,5.18635e-05,2.736125e-06,0.2193131,0.0009613796,1.518391e-06,6.302892e-06,1.375831e-05,5.459569e-06,3.268561e-06,9.933181e-07,...,3.199144e-06,2.90024e-05,2.855623e-06,6.058239e-06,2.601914e-06,7.006069e-07,2.281592e-06,8.103623e-06,1.303272e-06,1.23996e-06
12799,0.8749835,8.248669e-06,0.0007530487,0.001940677,1.398804e-05,0.003930042,0.000124398,1.117339e-05,4.665101e-05,3.284426e-05,...,0.0003624489,0.004786405,0.0001282868,0.002173255,6.709419e-05,4.579863e-06,9.111378e-06,7.063308e-05,1.432265e-05,7.423627e-05
12800,4.885252e-08,2.691364e-08,8.78699e-08,1.913607e-07,3.794453e-08,8.208963e-08,9.52156e-08,2.03238e-08,8.779747e-08,6.181915e-09,...,1.766298e-08,1.143649e-07,5.289379e-08,4.477025e-08,2.463165e-07,2.244294e-07,1.63868e-07,9.434708e-09,2.923246e-08,6.561676e-07


In [221]:
y_pred = np.argmax(y_probas.values, axis=1) + 1

In [222]:
y_maj_votes = merged_df['MajVote'].values

In [223]:
(y_maj_votes == y_pred).sum(), (y_maj_votes != y_pred).sum(), len(y_maj_votes)

(12318, 482, 12800)

In [225]:
y_pred[y_maj_votes != y_pred]

array([ 45, 110,  91, 124,   4, 100, 102,  21,  90,  97,   1,   6,  27,
        15,  97,  50,  30,  24, 115,  49,  45,   4,  14,  50,  16,  80,
        42,  63,   4,  51,  51,  87, 128, 116,  71, 108,  98,  66,  35,
        63, 100,  81,  63,  45,   4,  21,  27, 112,  87,  45,  54,  22,
       124,  30, 126,  62, 116,  44, 116,   1,  89,  18,  18, 108,  96,
       112, 127,  69,  51,  31,  90,  47,   1, 104,  85,  98,  15,  15,
       109,  50,  63, 103,  66,   4,   3,  84,  43,  61, 127, 118,  83,
        93, 126,  37, 125,  10, 103,   3,  21, 112, 110, 124,  94,   8,
        39,  84,  69, 121, 107, 124,  27,  76,   1,  42, 106,  54,  82,
        83,  31,  73, 103,  18,  91, 117,  92, 115,  16,  86,  19,  92,
        97,   2,   8,  66,  65,  53,  71,  35,  74, 104,   5, 110,   4,
        27,  59,  70, 113, 118,   4,  21, 105,  63, 117,  97,  63,  61,
        66,  69,   7,  98, 110,  67,  15,   4, 115,  45,  66, 103,  45,
        63,  42,  61,  66,  52,  20,  11,  35,  27, 124,  96,  1

In [228]:
df = pd.DataFrame({"predicted": y_pred}, index=y_probas.index)
df.to_csv("mean_proba_nasnet_incv4_incv4rc_ws2.csv")

In [229]:
!head -10 mean_proba_nasnet_incv4_incv4rc_ws2.csv

id,predicted
1,12
2,71
3,91
4,54
5,126
6,76
7,94
8,8
9,127
