### imports

In [52]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
from sklearn.model_selection import StratifiedShuffleSplit

import env
from utils import KaggleCameraDataset, one_hot_decision_function, unhot

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## useful routines

In [2]:
def get_proba(proba_path):
    """
    Load per-class probabilities from a CSV file.

    Parameters
    ----------
    proba_path : str
        Path to a CSV file that has columns named '0' ... '9';
        any other columns in the file are ignored.

    Returns
    -------
    proba : (n_rows, 10) np.ndarray
        Values of columns '0' ... '9', in that order.
    """
    # `DataFrame.as_matrix` was deprecated and later removed from pandas;
    # selecting the columns and taking `.values` yields the same array on
    # both old and new versions. An explicit list (instead of `map`) keeps
    # the column selection valid on Python 3 as well.
    class_columns = [str(k) for k in range(10)]
    return pd.read_csv(proba_path)[class_columns].values

In [4]:
# Load the probabilities stored in proba1.csv and display the top-left
# 3x3 corner of the resulting array.
P = get_proba('../models/proba1.csv')
P[:3, :3]

array([[  2.30427057e-01,   4.84325022e-01,   5.16182883e-03],
       [  4.63270843e-02,   1.69466971e-03,   1.10656278e-04],
       [  1.59370884e-01,   1.33133292e-01,   2.71781767e-03]])

## #1 average different predictions (from raw probs)

In [3]:
def avg_proba(*proba_paths, **kwargs):
    """
    Blend several probability files with a weighted geometric mean.

    Each file is read with `get_proba`, moved to log space (a 1e-16 offset
    guards against log(0)), averaged across files with `np.average`, and
    mapped back with `exp`. The result is NOT renormalized, so rows will in
    general not sum to one.

    Parameters
    ----------
    *proba_paths : str
        Paths of the probability CSV files to blend; at least one.
    weights : sequence of float, optional (passed via **kwargs)
        One weight per path, forwarded to `np.average`.
        If omitted, all files are weighted equally.

    Returns
    -------
    proba : np.ndarray
        Array of the same shape as a single file's probabilities.

    Raises
    ------
    ValueError
        If no paths are given.
    """
    if not proba_paths:
        raise ValueError('avg_proba() needs at least one probability file')
    weights = kwargs.get('weights', None)
    # Echo the weights as the original did; the parenthesized form prints the
    # same text under Python 2 and is also valid Python 3.
    print(weights)
    log_probas = [np.log(1e-16 + get_proba(path)) for path in proba_paths]
    return np.exp(np.average(log_probas, axis=0, weights=weights))

In [6]:
# Blend two probability files with equal weights; the trailing bare `Q`
# displays the blended array as the cell output.
Q = avg_proba('../models/proba1.csv', 
              '../models/proba_best.csv', 
               weights=[1., 1.])
Q

array([[  1.30430266e-01,   4.46921155e-01,   3.23994139e-02, ...,
          1.20126526e-03,   1.05716349e-02,   6.87616952e-02],
       [  3.24848127e-02,   8.78940028e-02,   4.32265330e-03, ...,
          8.78389790e-04,   6.59955412e-01,   4.10077199e-02],
       [  1.20631762e-01,   1.52186640e-01,   1.57862445e-03, ...,
          1.04656043e-03,   1.19606800e-01,   5.12661040e-01],
       ..., 
       [  1.16247602e-01,   4.08802994e-01,   5.17931608e-02, ...,
          3.80601780e-02,   1.16218922e-02,   1.64976966e-01],
       [  4.96196985e-01,   4.23220114e-03,   2.48291728e-03, ...,
          3.88299840e-01,   1.79937498e-02,   6.91680338e-02],
       [  2.15672876e-01,   3.66537021e-02,   2.02466367e-03, ...,
          1.86312740e-05,   3.95222452e-02,   1.85410858e-02]])

## #2 convert proba to predictions

In [4]:
def proba_to_subm(proba, subm_path):
    """
    Write a submission CSV (columns `fname`, `camera`) for the test set.

    Parameters
    ----------
    proba : (2640, 10) np.ndarray
        Per-class scores; rows are paired, in order, with the test files.
    subm_path : str
        Destination of the CSV file.
    """
    dataset = KaggleCameraDataset('../data/', train=False, lazy=True)

    # keep only the file name of every test item, dropping the directory part
    fnames = []
    for path in dataset.X:
        fnames.append(os.path.split(path)[1])

    # helpers come from utils; presumably this reduces each row to the index
    # of its winning class -- TODO confirm against utils
    class_ids = unhot(one_hot_decision_function(proba))

    # translate class indices into label strings
    cameras = []
    for class_id in class_ids:
        cameras.append(KaggleCameraDataset.target_labels()[int(class_id)])

    subm = pd.DataFrame({'fname': fnames, 'camera': cameras},
                        columns=['fname', 'camera'])
    subm.to_csv(subm_path, index=False)

In [8]:
# Write the blended probabilities `Q` out as a submission file.
proba_to_subm(Q, 'subm.csv')

## #3 how predictions differ

In [5]:
def get_diff(subm_path1, subm_path2):
    """
    Fraction of rows on which two submission files disagree.

    Rows are compared positionally on the `camera` column only, so both
    files must list the same items in the same order.

    Parameters
    ----------
    subm_path1, subm_path2 : str
        Paths of submission CSV files that contain a `camera` column.

    Returns
    -------
    diff : float
        Share of rows (between 0 and 1) whose `camera` values differ.
    """
    df1 = pd.read_csv(subm_path1)
    df2 = pd.read_csv(subm_path2)
    # Use the actual number of rows instead of a hard-coded 2640, so the
    # result is a true fraction for submissions of any length.
    return (df1['camera'] != df2['camera']).mean()

In [10]:
# How often do the first submission and the best one disagree on the camera?
get_diff('../models/submission1.csv', '../models/submission_best.csv')

The minimum supported version is 2.4.6



0.4693181818181818

# average everything

In [27]:
# (weight, submission directory) pairs that go into the final blend.
# NOTE(review): the weights printed in this cell's saved output do not match
# the list below -- the cell was apparently edited after its last run, so
# re-run it before trusting '../submissions/averaged.csv'.
proba_dirs = [
    # DenseNet, no augmentation (512x512 crops)
    (0.1, '0.579-#15'),
    # DenseNet, random horizontal flips only
    (0.25, '0.691-#23-tta-horiz'),
    # DenseNet, random crops + augmentation + TTA x 10 + LR restart
    (0.25, '0.732-#23-tta10'),
    (1., '0.750-#25'),
    # ... same, but trained on 256x256 crops
    (1., '0.742-#36'),
    # ... same, but trained with hinge loss (512x512)
    (1., '0.750-#30'),
    # CNN_Small, 1 FC layer, no augmentation
    (0.25, '0.665-#39'),
    # CNN_Small, 2 FC layers (using best TTA) + SGD with momentum
    (0.1, '0.517-#48'),
    #
    ##
    (2., '0.859-d5-#95'),  # ema.9 0.885
    (1.5, 'dh5-#98'),      # ema.9 0.841
    (1.5, 'r3-#100'),      # ema.9 0.845
    (2., 'R3-#103'),       # ema.9 0.877
    (1., 'c6-#105'),       # ema.9 0.791
]
weights, dirs = zip(*proba_dirs)
# every directory holds its probabilities in proba.csv
P = avg_proba(*('../submissions/{0}/proba.csv'.format(d) for d in dirs),
              weights=list(weights))
proba_to_subm(P, '../submissions/averaged.csv')

[0.25, 0.5, 0.5, 1.0, 1.0, 1.0, 0.5, 0.25, 1.5, 1.25, 1.25, 1.5, 1.25]


## pseudo-labels

In [32]:
# P from 0.908-#109
# NOTE(review): per the comment above, `P` is expected to hold the
# probabilities of submission 0.908-#109, which is not necessarily the blend
# computed earlier in this notebook -- load it explicitly before re-running.
#
# Row indices of the samples whose top probability exceeds 0.8.
# `np.flatnonzero` derives them from the mask itself, so nothing depends on
# a hard-coded number of rows.
ind = np.flatnonzero(P.max(axis=1) > 0.8)
print(len(ind))

835


In [49]:
# Read the selected test images (rows `ind`) into one uint8 array and take
# the arg-max class of `P` as their pseudo-label.
test_data = KaggleCameraDataset('../data/', train=False)
# assumes every selected image is 512x512 RGB; a different size makes the
# assignment below fail
X = np.zeros((len(ind), 512, 512, 3), dtype=np.uint8)
for pos, i in enumerate(ind):
    # the context manager closes the image file once its pixels are copied,
    # instead of leaving one open handle per image
    with Image.open(test_data.X[i]) as img:
        X[pos, ...] = np.array(img, dtype=np.uint8)
y = np.argmax(P, axis=1)[ind]

### stratified split: 500 samples for training, the remaining ~335 for validation

In [55]:
# Single stratified shuffle split with a fixed seed: 500 samples go to the
# training part, the rest to validation.
sss = StratifiedShuffleSplit(n_splits=1, train_size=500, random_state=1337)
# n_splits=1, so the first (and only) pair of index arrays is all we need
train_ind, val_ind = next(sss.split(X, y))
print(train_ind.shape)

(500,)


In [57]:
# Persist the pseudo-labelled train / validation parts as .npy files.
# Each subset is sliced right before it is written, so only one copy is
# built at a time.
for npy_path, array, subset in (
    ('../data/X_pseudo_train.npy', X, train_ind),
    ('../data/y_pseudo_train.npy', y, train_ind),
    ('../data/X_pseudo_val.npy', X, val_ind),
    ('../data/y_pseudo_val.npy', y, val_ind),
):
    np.save(npy_path, array[subset])