# Compare predictions

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
OUTPUT_PATH = Path(".").absolute().parent / "output"

In [3]:
# !head -50 ../output/inference_FurnitureInceptionV4_350_20180425_2258/predict.log

In [62]:
!ls -all ../output/test_inceptionv4_350_fc_random_resized_crop/20180506_2029/

total 124
drwxr-xr-x 3 root root   4096 May  6 20:44 .
drwxr-xr-x 5 root root   4096 May  6 20:29 ..
-rw-r--r-- 1 root root 106141 May  6 20:44 predictions.csv
-rw-r--r-- 1 root root   2860 May  6 20:44 predict.log
drwxr-xr-x 3 root root   4096 May  6 20:29 tensorboard
-rw-r--r-- 1 root root   1161 May  6 20:29 test_inceptionv4_350_fc_random_resized_crop.py


In [63]:
prediction_files = [
#     OUTPUT_PATH / "inference_FurnitureNASNetALarge_20180418_0635" / "predictions.csv",
    OUTPUT_PATH / "test_nasnetalarge_350_resized_crop" / "20180428_1455" / "predictions.csv",
#     OUTPUT_PATH / "inference_FurnitureInceptionResNet299_20180426_1440" / "predictions.csv",
#     OUTPUT_PATH / "inference_FurnitureInceptionV4_350_20180425_2258" / "predictions.csv",
#     OUTPUT_PATH / "inference_FurnitureInceptionV4_350_20180428_0808" / "predictions.csv",
#     OUTPUT_PATH / "test_inceptionresnetv2_350_weighted_sampler2_resized_crop" / "20180501_1750" / "predictions.csv",    
#     OUTPUT_PATH / "test_resnet152_350_resized_crop" / "20180502_2043" / "predictions.csv",
#     OUTPUT_PATH / "test_inceptionv4_350_fc_random_resized_crop" / "20180506_1914" / "predictions.csv",
    OUTPUT_PATH / "test_inceptionv4_350_fc_random_resized_crop" / "20180506_2029" / "predictions.csv",    
]

# prediction_files = [
#     "maj_votes_2nasnet_2incv4_incv4rc_resnet.csv",
#     OUTPUT_PATH / "test_inceptionv4_350_fc_random_resized_crop" / "20180506_1914" / "predictions.csv",
# ]

In [64]:
names = [f.parent.name.replace("inference_", "") for f in prediction_files]
names
# names = ["maj_votes_2nasnet_2incv4_incv4rc_resnet", "20180506_1914"]

['20180428_1455', '20180506_2029']

In [65]:
dfs = [pd.read_csv(f, index_col='id') for f in prediction_files]

merged_df = pd.concat([df for df in dfs], axis=1)
merged_df.columns = names

In [66]:
merged_df.head(10)

Unnamed: 0_level_0,20180428_1455,20180506_2029
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,12,12
2,71,71
3,91,91
4,54,54
5,126,126
6,76,76
7,94,94
8,8,8
9,127,127
10,117,117


In [67]:
disagreement_mask = merged_df.mean(axis=1).apply(lambda x: int(x) != x)
print(disagreement_mask.sum(), disagreement_mask.shape[0])

1057 12800


In [68]:
merged_df[disagreement_mask].head(10)

Unnamed: 0_level_0,20180428_1455,20180506_2029
id,Unnamed: 1_level_1,Unnamed: 2_level_1
20,108,125
39,8,103
50,90,37
89,17,120
100,116,55
101,113,80
104,53,84
129,8,89
133,56,53
143,105,20


In [61]:
merged_df[disagreement_mask].tail(10)

Unnamed: 0_level_0,20180428_1455,20180506_1959
id,Unnamed: 1_level_1,Unnamed: 2_level_1
12566,17,116
12570,113,78
12599,53,112
12638,14,65
12643,100,49
12652,111,14
12664,117,90
12667,45,32
12673,123,38
12756,17,38


In [60]:
def get_decision_fn(weights):    
    def fn(row):
        votes = np.zeros(128 + 1, dtype=np.int)
        for r, w in zip(row, weights):
            votes[r] += w
        return np.argmax(votes)
    return fn

In [39]:
merged_df.loc[:, 'MajVote'] = merged_df.apply(get_decision_fn(weights=[1] * merged_df.shape[1]), axis=1)

In [40]:
merged_df[disagreement_mask].head(10)

Unnamed: 0_level_0,FurnitureNASNetALarge_20180418_0635,20180428_1455,FurnitureInceptionResNet299_20180426_1440,FurnitureInceptionV4_350_20180425_2258,FurnitureInceptionV4_350_20180428_0808,20180501_1750,20180502_2043,20180506_1914,MajVote
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5,126,126,126,126,126,63,126,126,126
12,128,19,19,19,19,19,128,19,19
29,47,47,47,47,47,37,47,47,47
38,75,61,61,61,104,61,61,83,61
39,8,8,8,8,8,8,8,89,8
43,3,58,58,58,58,58,58,58,58
45,97,97,97,97,97,53,97,97,97
46,90,90,90,90,90,8,90,90,90
47,82,127,43,43,43,43,43,43,43
56,115,115,115,115,115,115,121,115,115


In [41]:
merged_df[disagreement_mask].tail(10)

Unnamed: 0_level_0,FurnitureNASNetALarge_20180418_0635,20180428_1455,FurnitureInceptionResNet299_20180426_1440,FurnitureInceptionV4_350_20180425_2258,FurnitureInceptionV4_350_20180428_0808,20180501_1750,20180502_2043,20180506_1914,MajVote
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12774,4,4,4,4,4,4,4,37,4
12776,79,115,37,121,121,71,121,121,121
12777,118,118,118,118,118,118,118,108,118
12779,17,17,17,17,17,17,17,38,17
12780,102,102,102,57,57,102,102,57,102
12781,23,4,4,4,23,15,4,15,4
12788,113,113,113,113,113,113,113,67,113
12791,27,112,27,112,27,27,112,27,27
12795,2,2,88,2,2,2,2,88,2
12798,3,58,58,58,58,58,58,58,58


In [42]:
merged_df['MajVote'].to_csv("maj_votes_2nasnet_2incv4_3incv4rc_resnet.csv", header=["predicted"])

In [43]:
!head maj_votes_2nasnet_2incv4_3incv4rc_resnet.csv

id,predicted
1,12
2,71
3,91
4,54
5,126
6,76
7,94
8,8
9,127


## Probas on validation

In [180]:
from pathlib import Path
import sys
sys.path.insert(0, Path(".").absolute().parent.as_posix())

In [181]:
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns


In [182]:
import numpy as np
import pandas as pd

from PIL import Image


OUTPUT_PATH = Path(".").absolute().parent / "output"

In [185]:
# !ls ../output/val_probas_inceptionresnetv2_350_weighted_sampler2_resized_crop/

In [193]:
prediction_files = [
    OUTPUT_PATH / "val_probas_inceptionresnetv2_350_resized_crop" / "20180428_1622" / "probas.csv",
    OUTPUT_PATH / "val_probas_inceptionv4_350_resized_crop" / "20180428_1633" / "probas.csv",
    OUTPUT_PATH / "val_probas_nasnetalarge_350_resized_crop" / "20180428_1654" / "probas.csv",    
]

In [202]:
prediction_files = [
    OUTPUT_PATH / "val_probas_inceptionresnetv2_350_resized_crop" / "20180428_1622" / "probas.csv",
    OUTPUT_PATH / "val_probas_inceptionv4_350_resized_crop" / "20180428_1633" / "probas.csv",
    OUTPUT_PATH / "val_probas_nasnetalarge_350_resized_crop" / "20180428_1654" / "probas.csv",    
    OUTPUT_PATH / "val_probas_inceptionresnetv2_350_weighted_sampler2_resized_crop" / "20180501_1710" / "probas.csv",        
]

In [203]:
dfs = [pd.read_csv(f, index_col='id') for f in prediction_files]

# merged_df = pd.concat([df for df in dfs], axis=1)
# merged_df.columns = names

In [204]:
df_probas = pd.concat(dfs, axis=0)

In [205]:
y_probas = df_probas.groupby('id').mean()
y_probas.head()

Unnamed: 0_level_0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c118,c119,c120,c121,c122,c123,c124,c125,c126,c127
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.142132e-07,6.172438e-07,5.954983e-08,6.790467e-08,4.27405e-08,3.353087e-07,8.198711e-08,1.401706e-06,1.613855e-08,3.158318e-06,...,1.74553e-07,6.696046e-08,4.561732e-08,2.210249e-07,4.079582e-07,6.002456e-08,9.546675e-07,5.366485e-07,9.529454e-08,1.795009e-08
2,2.378047e-05,1.911221e-06,0.0003813768,0.0008873822,4.337529e-06,5.411072e-05,4.765262e-06,0.0002086249,2.246416e-06,4.256942e-06,...,1.12008e-06,0.001173155,1.83163e-05,0.002207372,1.008246e-05,4.819439e-06,2.082802e-05,0.0002006639,2.958735e-06,0.0002942054
3,1.432289e-07,4.822773e-07,1.426817e-07,3.659403e-08,1.371402e-06,1.994741e-07,4.667346e-06,3.050844e-07,6.060177e-07,3.001461e-07,...,2.584748e-07,2.388654e-05,5.779009e-05,6.963761e-06,8.794139e-07,0.002958808,6.51916e-07,1.788477e-06,4.593745e-07,3.342676e-06
4,1.083368e-05,2.481374e-06,2.585387e-06,0.002635155,1.39547e-06,1.997643e-06,1.36741e-06,0.002393932,7.990591e-07,3.199877e-07,...,5.415177e-07,2.36261e-05,3.23882e-06,1.18037e-06,8.352326e-07,4.082013e-06,1.97831e-06,0.9389314,1.304322e-06,5.513055e-06
5,1.040423e-05,1.855698e-08,1.063067e-06,0.0001078275,7.015116e-08,1.894737e-05,7.047569e-08,4.792833e-07,2.67692e-08,1.558628e-07,...,4.621694e-07,2.913073e-05,1.556012e-07,2.278507e-07,1.007028e-08,2.325471e-08,3.807204e-08,3.752913e-08,1.506875e-08,1.69187e-07


In [206]:
from common.dataset import FilesFromCsvDataset, TransformedDataset
from common.meta import get_metafeatures, get_imsize_and_targets


dataset = FilesFromCsvDataset("../output/filtered_val_dataset.csv")
dataset = TransformedDataset(dataset,
                             transforms=lambda x: (x, Image.open(x).size),
                             target_transforms=lambda l: l - 1)

df_imsize_targets = get_imsize_and_targets(dataset)

y_true = df_imsize_targets['target']
y_probas = y_probas.loc[y_true.index, :]

In [207]:
y_pred = np.argmax(y_probas.values, axis=1)

In [208]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score

By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
is equal to the number of observations known to be in group :math:`i` but
predicted to be in group :math:`j`.

In [209]:
cm = confusion_matrix(y_true, y_pred)
recall_per_class = recall_score(y_true, y_pred, average=None)

In [210]:
recall_per_class

array([0.86      , 0.82      , 0.92      , 0.5625    , 0.96      ,
       0.91489362, 0.93617021, 0.92      , 0.89583333, 0.95744681,
       0.96      , 0.97959184, 0.91666667, 0.9       , 0.4       ,
       1.        , 0.875     , 0.9375    , 0.72      , 0.94      ,
       0.87234043, 0.76595745, 0.71428571, 0.93877551, 0.81632653,
       0.875     , 0.7755102 , 0.89795918, 0.75510204, 0.8       ,
       0.79166667, 0.97959184, 0.98      , 0.95833333, 0.77083333,
       0.97916667, 0.93877551, 0.97959184, 0.72      , 0.98      ,
       0.98      , 0.95833333, 0.98      , 1.        , 0.875     ,
       1.        , 0.84      , 0.875     , 0.71428571, 0.67346939,
       0.75      , 0.89795918, 0.86      , 0.75510204, 1.        ,
       0.96      , 0.79591837, 0.87755102, 0.86      , 0.91836735,
       0.91666667, 0.86      , 0.32      , 0.97959184, 0.83333333,
       0.54      , 0.85416667, 1.        , 0.95833333, 0.7755102 ,
       0.97916667, 0.95918367, 0.98      , 0.83673469, 0.96   

In [201]:
recall_per_class

array([0.86      , 0.84      , 0.9       , 0.5625    , 0.98      ,
       0.93617021, 0.89361702, 0.92      , 0.9375    , 0.93617021,
       0.96      , 0.97959184, 0.91666667, 0.9       , 0.3       ,
       1.        , 0.89583333, 0.9375    , 0.66      , 0.92      ,
       0.87234043, 0.78723404, 0.75510204, 0.95918367, 0.81632653,
       0.91666667, 0.69387755, 0.89795918, 0.83673469, 0.8       ,
       0.79166667, 0.97959184, 0.98      , 0.95833333, 0.79166667,
       0.97916667, 0.93877551, 0.97959184, 0.68      , 0.98      ,
       0.98      , 0.97916667, 1.        , 1.        , 0.875     ,
       1.        , 0.84      , 0.89583333, 0.73469388, 0.65306122,
       0.75      , 0.91836735, 0.86      , 0.7755102 , 1.        ,
       0.96      , 0.81632653, 0.87755102, 0.86      , 0.91836735,
       0.91666667, 0.88      , 0.3       , 0.97959184, 0.875     ,
       0.48      , 0.85416667, 1.        , 0.97916667, 0.79591837,
       0.95833333, 0.93877551, 0.98      , 0.81632653, 0.94   

In [211]:
idx = np.arange(len(cm))
mcm = cm - cm[idx, idx] * np.eye(len(cm), dtype=np.int)

In [212]:
misclassifed = {}

for i in range(128):
    classes = np.where(mcm[i, :] >= 4)[0]
    values = mcm[i, classes]
    if len(classes) > 0:
        misclassifed[i] = {
            'recall': recall_per_class[i],
            'wrong_classes':[(c, v) for c, v in zip(classes, values)]
        }

In [213]:
misclassifed

{1: {'recall': 0.82, 'wrong_classes': [(87, 4)]},
 3: {'recall': 0.5625, 'wrong_classes': [(2, 7), (28, 4)]},
 14: {'recall': 0.4, 'wrong_classes': [(3, 4), (62, 7), (125, 6)]},
 18: {'recall': 0.72, 'wrong_classes': [(127, 5)]},
 21: {'recall': 0.7659574468085106, 'wrong_classes': [(16, 4)]},
 22: {'recall': 0.7142857142857143, 'wrong_classes': [(62, 8)]},
 25: {'recall': 0.875, 'wrong_classes': [(62, 4)]},
 26: {'recall': 0.7755102040816326, 'wrong_classes': [(111, 6)]},
 27: {'recall': 0.8979591836734694, 'wrong_classes': [(23, 4)]},
 28: {'recall': 0.7551020408163265, 'wrong_classes': [(3, 6)]},
 30: {'recall': 0.7916666666666666, 'wrong_classes': [(69, 6)]},
 34: {'recall': 0.7708333333333334, 'wrong_classes': [(12, 4), (69, 4)]},
 38: {'recall': 0.72, 'wrong_classes': [(86, 10), (108, 4)]},
 48: {'recall': 0.7142857142857143, 'wrong_classes': [(124, 6)]},
 49: {'recall': 0.673469387755102, 'wrong_classes': [(19, 4), (53, 11)]},
 50: {'recall': 0.75, 'wrong_classes': [(52, 4)]},
 

In [179]:
misclassifed

{1: {'recall': 0.84, 'wrong_classes': [(87, 4)]},
 3: {'recall': 0.5625, 'wrong_classes': [(2, 7), (28, 5)]},
 14: {'recall': 0.3, 'wrong_classes': [(3, 4), (28, 5), (62, 8), (125, 6)]},
 18: {'recall': 0.66, 'wrong_classes': [(127, 7)]},
 21: {'recall': 0.7872340425531915, 'wrong_classes': [(16, 4)]},
 22: {'recall': 0.7551020408163265, 'wrong_classes': [(62, 6)]},
 26: {'recall': 0.6938775510204082, 'wrong_classes': [(111, 9)]},
 27: {'recall': 0.8979591836734694, 'wrong_classes': [(23, 4)]},
 30: {'recall': 0.7916666666666666, 'wrong_classes': [(69, 6)]},
 34: {'recall': 0.7916666666666666, 'wrong_classes': [(12, 4), (69, 4)]},
 38: {'recall': 0.68, 'wrong_classes': [(86, 11), (108, 5)]},
 48: {'recall': 0.7346938775510204, 'wrong_classes': [(124, 5)]},
 49: {'recall': 0.6530612244897959, 'wrong_classes': [(19, 4), (53, 12)]},
 50: {'recall': 0.75, 'wrong_classes': [(52, 4)]},
 53: {'recall': 0.7755102040816326, 'wrong_classes': [(19, 4)]},
 57: {'recall': 0.8775510204081632, 'wrong

In [25]:
misclassifed_t = {}

for i in range(128):
    classes = np.where(cm[:, i] >= 4)[0]
    values = cm[classes, i]
    if len(classes) > 1:
        misclassifed_t[i] = {
            'true_classes':[(c, v) for c, v in zip(classes, values)]
        }

In [26]:
misclassifed_t

{1: {'true_classes': [(1, 42), (87, 4)]},
 2: {'true_classes': [(2, 45), (3, 7), (57, 4)]},
 3: {'true_classes': [(3, 27), (14, 4)]},
 4: {'true_classes': [(4, 49), (107, 4)]},
 12: {'true_classes': [(12, 44), (34, 4)]},
 14: {'true_classes': [(14, 15), (62, 6)]},
 16: {'true_classes': [(16, 43), (21, 4)]},
 18: {'true_classes': [(18, 33), (127, 5)]},
 19: {'true_classes': [(19, 46), (49, 4), (53, 4), (99, 7)]},
 22: {'true_classes': [(22, 37), (62, 5)]},
 23: {'true_classes': [(23, 47), (27, 4)]},
 25: {'true_classes': [(25, 44), (62, 8)]},
 26: {'true_classes': [(26, 34), (111, 7)]},
 28: {'true_classes': [(3, 5), (14, 5), (28, 41), (62, 9)]},
 31: {'true_classes': [(31, 48), (65, 5)]},
 38: {'true_classes': [(38, 34), (86, 4), (108, 6)]},
 39: {'true_classes': [(39, 49), (65, 7)]},
 41: {'true_classes': [(41, 47), (58, 4)]},
 52: {'true_classes': [(50, 4), (52, 43)]},
 53: {'true_classes': [(49, 12), (53, 38), (87, 5)]},
 56: {'true_classes': [(56, 40), (65, 6)]},
 59: {'true_classe

In [170]:
df_probas.head()

Unnamed: 0_level_0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c118,c119,c120,c121,c122,c123,c124,c125,c126,c127
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6302,1.373452e-11,1.510409e-10,1.260431e-12,4.921817e-12,1.109817e-10,2.23659e-12,5.07353e-11,2.922072e-12,2.468959e-12,5.981444e-12,...,8.672143e-13,2.229347e-13,1.81405e-11,7.032357e-14,1.983395e-10,6.569913e-12,1.698414e-11,5.340068e-11,4.467111e-11,6.92629e-13
3349,6.708128e-07,5.170703e-07,1.846702e-07,4.019679e-07,2.481107e-07,3.374197e-07,2.376433e-06,9.245439e-07,0.004313464,3.201822e-07,...,2.315184e-07,2.930983e-07,1.619783e-05,1.609546e-06,0.0003666026,6.810326e-07,1.135143e-06,4.54206e-07,6.519358e-07,7.396901e-07
484,1.588944e-07,3.46054e-06,1.504418e-05,3.487897e-06,3.223231e-08,1.037788e-06,5.816736e-07,1.317494e-08,5.965195e-07,3.312266e-07,...,0.9997158,2.687145e-08,7.48398e-10,1.66301e-09,1.542122e-07,3.481257e-09,2.401089e-08,2.147751e-09,9.340559e-07,7.470094e-09
2677,5.284062e-11,1.166769e-08,2.62324e-11,6.023646e-10,3.024778e-07,3.700055e-10,2.080192e-10,8.870108e-11,1.285723e-10,1.011352e-10,...,1.989998e-11,6.330309e-10,2.516007e-08,1.031392e-10,9.538543e-10,4.859603e-09,1.651155e-07,9.416713e-10,1.15199e-08,9.996565e-11
1517,0.0001060053,2.394826e-07,4.06639e-06,1.615211e-05,1.085549e-07,6.640157e-05,1.039728e-06,5.748253e-08,1.454809e-06,2.930471e-05,...,1.193192e-05,0.1042851,2.138914e-07,1.049197e-07,1.285039e-07,2.57296e-08,4.993359e-07,7.153245e-08,4.385286e-08,1.651578e-06


In [195]:
class_index = 62
cols = ['c{}'.format(c) for c, _ in misclassifed[class_index]['wrong_classes']] + ['c{}'.format(class_index)]

In [27]:
df_probas.loc[y_true[y_true == class_index].index[:10], cols]

NameError: name 'class_index' is not defined

Same on test data

In [28]:
from pathlib import Path
import sys
sys.path.insert(0, Path(".").absolute().parent.as_posix())

In [30]:
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns

In [31]:
import numpy as np
import pandas as pd

from PIL import Image


OUTPUT_PATH = Path(".").absolute().parent / "output"

In [215]:
prediction_files = [
    OUTPUT_PATH / "test_probas_inceptionresnetv2_350_resized_crop" / "20180429_1242" / "probas.csv",
    OUTPUT_PATH / "test_probas_inceptionv4_350_resized_crop" / "20180429_1303" / "probas.csv",
    OUTPUT_PATH / "test_probas_nasnetalarge_350_resized_crop" / "20180429_1406" / "probas.csv",
    OUTPUT_PATH / "test_probas_inceptionresnetv2_350_weighted_sampler2_resized_crop" / "20180501_1725" / "probas.csv",    
]

In [216]:
dfs = [pd.read_csv(f, index_col='id') for f in prediction_files]

# merged_df = pd.concat([df for df in dfs], axis=1)
# merged_df.columns = names

In [217]:
df_probas = pd.concat(dfs, axis=0)

In [218]:
y_probas = df_probas.groupby('id').mean()
y_probas.head()

Unnamed: 0_level_0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c118,c119,c120,c121,c122,c123,c124,c125,c126,c127
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,8.912504e-07,7.503365e-07,1.738389e-06,2e-06,4.381432e-07,7.707925e-07,9.45705e-07,7.333056e-07,1.624929e-07,7.409995e-07,...,1.646692e-07,5.56959e-07,2.156962e-06,4.956204e-07,4.15683e-07,2.661459e-07,5.785049e-07,4e-06,1.891047e-07,3.028284e-07
2,1.316189e-05,1.908036e-06,1.089418e-05,5.9e-05,1.443225e-05,1.348888e-05,2.944533e-05,9.746005e-05,2.568036e-06,3.107332e-06,...,3.381718e-06,0.0004205738,4.857768e-05,0.000123385,1.109794e-05,0.0001441601,0.0001078523,0.000301,5.269051e-05,9.692375e-05
3,1.602098e-06,5.981606e-08,1.21784e-08,2e-06,2.168214e-08,2.372393e-07,4.328313e-08,2.529006e-07,1.024756e-08,1.071161e-08,...,4.610815e-08,7.397583e-06,2.685228e-07,1.268696e-07,1.803629e-08,9.695865e-08,8.556419e-09,4e-06,1.318076e-08,1.603795e-07
4,0.002428594,0.02710631,0.008021799,0.000949,0.0001091015,0.0005171085,4.652084e-05,7.215493e-06,0.003152695,4.071674e-06,...,0.0004102423,0.0001497937,1.057597e-05,0.000195499,4.858759e-05,1.048289e-05,0.0008812576,2.1e-05,4.311096e-06,1.563177e-05
5,9.091457e-06,3.020426e-06,5.647092e-06,0.001673,2.00259e-06,2.349683e-06,3.458457e-06,0.0002757767,8.478133e-07,2.789346e-07,...,4.153027e-07,5.10582e-05,1.36059e-05,7.114927e-06,1.735761e-06,1.769032e-05,4.047222e-06,0.685021,1.195989e-06,8.61616e-05


In [219]:
missing_indices = list(set(merged_df['MajVote'].index) - set(y_probas.index))
missing_values = merged_df.loc[missing_indices, 'MajVote'].values
for idx, v in zip(missing_indices, missing_values):
    y_probas.loc[idx, :] = 0.01
    col = y_probas.columns[v - 1]
    y_probas.loc[idx, col] = 0.99

In [220]:
y_probas = y_probas.sort_index()
y_probas.tail()

Unnamed: 0_level_0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c118,c119,c120,c121,c122,c123,c124,c125,c126,c127
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12796,3.740855e-08,7.699472e-07,3.621496e-07,3.964026e-08,1.351679e-08,6.045437e-08,2.199541e-07,6.69059e-09,3.975444e-07,6.771315e-09,...,5.479484e-09,6.028931e-07,1.559961e-07,6.559622e-08,8.916581e-08,1.017504e-07,8.690805e-08,2.232733e-08,2.711755e-09,1.691902e-09
12797,5.812775e-06,4.587319e-06,2.474041e-05,2.48818e-05,1.054539e-05,7.284781e-06,6.714315e-06,2.000873e-05,4.21314e-06,3.69569e-06,...,4.620517e-06,2.139555e-05,2.764587e-05,1.579153e-05,7.022183e-06,0.004496449,1.263526e-05,3.531694e-05,1.338969e-05,7.176692e-06
12798,5.18635e-05,2.736125e-06,0.2193131,0.0009613796,1.518391e-06,6.302892e-06,1.375831e-05,5.459569e-06,3.268561e-06,9.933181e-07,...,3.199144e-06,2.90024e-05,2.855623e-06,6.058239e-06,2.601914e-06,7.006069e-07,2.281592e-06,8.103623e-06,1.303272e-06,1.23996e-06
12799,0.8749835,8.248669e-06,0.0007530487,0.001940677,1.398804e-05,0.003930042,0.000124398,1.117339e-05,4.665101e-05,3.284426e-05,...,0.0003624489,0.004786405,0.0001282868,0.002173255,6.709419e-05,4.579863e-06,9.111378e-06,7.063308e-05,1.432265e-05,7.423627e-05
12800,4.885252e-08,2.691364e-08,8.78699e-08,1.913607e-07,3.794453e-08,8.208963e-08,9.52156e-08,2.03238e-08,8.779747e-08,6.181915e-09,...,1.766298e-08,1.143649e-07,5.289379e-08,4.477025e-08,2.463165e-07,2.244294e-07,1.63868e-07,9.434708e-09,2.923246e-08,6.561676e-07


In [221]:
y_pred = np.argmax(y_probas.values, axis=1) + 1

In [222]:
y_maj_votes = merged_df['MajVote'].values

In [223]:
(y_maj_votes == y_pred).sum(), (y_maj_votes != y_pred).sum(), len(y_maj_votes)

(12318, 482, 12800)

In [225]:
y_pred[y_maj_votes != y_pred]

array([ 45, 110,  91, 124,   4, 100, 102,  21,  90,  97,   1,   6,  27,
        15,  97,  50,  30,  24, 115,  49,  45,   4,  14,  50,  16,  80,
        42,  63,   4,  51,  51,  87, 128, 116,  71, 108,  98,  66,  35,
        63, 100,  81,  63,  45,   4,  21,  27, 112,  87,  45,  54,  22,
       124,  30, 126,  62, 116,  44, 116,   1,  89,  18,  18, 108,  96,
       112, 127,  69,  51,  31,  90,  47,   1, 104,  85,  98,  15,  15,
       109,  50,  63, 103,  66,   4,   3,  84,  43,  61, 127, 118,  83,
        93, 126,  37, 125,  10, 103,   3,  21, 112, 110, 124,  94,   8,
        39,  84,  69, 121, 107, 124,  27,  76,   1,  42, 106,  54,  82,
        83,  31,  73, 103,  18,  91, 117,  92, 115,  16,  86,  19,  92,
        97,   2,   8,  66,  65,  53,  71,  35,  74, 104,   5, 110,   4,
        27,  59,  70, 113, 118,   4,  21, 105,  63, 117,  97,  63,  61,
        66,  69,   7,  98, 110,  67,  15,   4, 115,  45,  66, 103,  45,
        63,  42,  61,  66,  52,  20,  11,  35,  27, 124,  96,  1

In [228]:
df = pd.DataFrame({"predicted": y_pred}, index=y_probas.index)
df.to_csv("mean_proba_nasnet_incv4_incv4rc_ws2.csv")

In [229]:
!head -10 mean_proba_nasnet_incv4_incv4rc_ws2.csv

id,predicted
1,12
2,71
3,91
4,54
5,126
6,76
7,94
8,8
9,127
