### imports

In [1]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
from sklearn.model_selection import StratifiedShuffleSplit

import env
from utils import (KaggleCameraDataset, one_hot_decision_function, 
                   unhot, softmax, inv_softmax, progress_iter, RNG, float32)

%matplotlib inline
%load_ext autoreload
%autoreload 2

## useful routines

In [2]:
def get_proba(proba_path):
    """
    Load per-class probabilities from a CSV file.

    Parameters
    ----------
    proba_path : str
        Path to a CSV file that has columns named '0', '1', ..., '9'.
        Any other columns are ignored.

    Returns
    -------
    proba : (n_rows, 10) np.ndarray
        Values of columns '0'..'9', in that order.

    Raises
    ------
    KeyError
        If one of the ten class columns is missing.
    """
    # `DataFrame.as_matrix` is deprecated and gone from current pandas;
    # an explicit list also avoids handing pandas a lazy `map` object
    # when this runs under Python 3.
    class_columns = [str(c) for c in range(10)]
    return pd.read_csv(proba_path)[class_columns].values


def avg_proba(*proba_paths, **kwargs):
    """
    Blend several probability files into one prediction matrix.

    Every file is loaded with `get_proba` and passed through `inv_softmax`;
    the results are averaged across files and passed through `softmax`.

    Parameters
    ----------
    *proba_paths : str
        Paths to probability CSV files readable by `get_proba`.
    weights : sequence of float, optional (keyword only)
        One weight per path, forwarded to `np.average`.
        If omitted, all files contribute equally.

    Returns
    -------
    P : np.ndarray
        Output of `softmax` applied to the averaged values.

    Raises
    ------
    TypeError
        If a keyword argument other than `weights` is passed.
    """
    # `**kwargs` only emulates a keyword-only argument after `*proba_paths`;
    # reject anything else so that a misspelt `weights=` is not silently
    # turned into an unweighted average.
    weights = kwargs.pop('weights', None)
    if kwargs:
        raise TypeError('avg_proba() got unexpected keyword argument(s): {0}'
                        .format(', '.join(sorted(kwargs))))
    print(weights)
    # Alternative kept for reference: weighted geometric mean of raw probabilities,
    #   np.exp(np.average([np.log(1e-16 + get_proba(path)) for path in proba_paths],
    #                     axis=0, weights=weights))
    scores = [inv_softmax(get_proba(path)) for path in proba_paths]
    return softmax(np.average(scores, axis=0, weights=weights))


def proba_to_subm(proba, subm_path, data_dir='../data/'):
    """
    Convert class scores into a submission CSV.

    Parameters
    ----------
    proba : (2640, 10) np.ndarray
        One row per test image, in the order of the test dataset's file list.
    subm_path : str
        Destination of the CSV with columns `fname` and `camera`.
    data_dir : str, optional
        Root directory handed to `KaggleCameraDataset`.
    """
    test_dataset = KaggleCameraDataset(data_dir, train=False, lazy=True)
    fnames = [os.path.split(fname)[-1] for fname in test_dataset.X]
    index_pred = unhot(one_hot_decision_function(proba))
    # Fetch the label table once instead of once per predicted row.
    labels = KaggleCameraDataset.target_labels()
    cameras = [labels[int(c)] for c in index_pred]
    df = pd.DataFrame({'fname': fnames, 'camera': cameras},
                      columns=['fname', 'camera'])
    df.to_csv(subm_path, index=False)
    

def get_diff(subm_path1, subm_path2):
    """
    Fraction of rows on which two submission files name a different camera.

    Rows are compared by position, so both files must list the same
    images in the same order.

    Parameters
    ----------
    subm_path1, subm_path2 : str
        Paths to submission CSVs that contain a `camera` column.

    Returns
    -------
    diff : float
        Share of rows (between 0 and 1) whose `camera` values differ.
    """
    cameras1 = pd.read_csv(subm_path1)['camera']
    cameras2 = pd.read_csv(subm_path2)['camera']
    # Normalise by the real number of rows rather than a hard-coded 2640.
    return (cameras1 != cameras2).mean()

In [4]:
# Load one model's probability matrix and peek at its top-left 3x3 corner.
P = get_proba('../models/proba1.csv')
P[:3, :3]

array([[  2.30427057e-01,   4.84325022e-01,   5.16182883e-03],
       [  4.63270843e-02,   1.69466971e-03,   1.10656278e-04],
       [  1.59370884e-01,   1.33133292e-01,   2.71781767e-03]])

## #1 average different predictions (from raw probs)

In [6]:
# Blend two probability files with equal weights.
Q = avg_proba('../models/proba1.csv', 
              '../models/proba_best.csv', 
               weights=[1., 1.])
# Show the blended matrix.
Q

array([[  1.30430266e-01,   4.46921155e-01,   3.23994139e-02, ...,
          1.20126526e-03,   1.05716349e-02,   6.87616952e-02],
       [  3.24848127e-02,   8.78940028e-02,   4.32265330e-03, ...,
          8.78389790e-04,   6.59955412e-01,   4.10077199e-02],
       [  1.20631762e-01,   1.52186640e-01,   1.57862445e-03, ...,
          1.04656043e-03,   1.19606800e-01,   5.12661040e-01],
       ..., 
       [  1.16247602e-01,   4.08802994e-01,   5.17931608e-02, ...,
          3.80601780e-02,   1.16218922e-02,   1.64976966e-01],
       [  4.96196985e-01,   4.23220114e-03,   2.48291728e-03, ...,
          3.88299840e-01,   1.79937498e-02,   6.91680338e-02],
       [  2.15672876e-01,   3.66537021e-02,   2.02466367e-03, ...,
          1.86312740e-05,   3.95222452e-02,   1.85410858e-02]])

## #2 convert proba to predictions

In [8]:
# Write the blended scores `Q` as a submission file in the working directory.
proba_to_subm(Q, 'subm.csv')

## #3 how predictions differ

In [10]:
# Disagreement rate between the `camera` predictions of two submissions.
get_diff('../models/submission1.csv', '../models/submission_best.csv')

The minimum supported version is 2.4.6



0.4693181818181818

# average everything

### generations 1, 2 -> 0.913

In [8]:
# (weight, submission directory) pairs for every model entering the blend.
proba_dirs = [
    #
    ## 1st generation
    ###
    # DenseNet no augmentation (512x512 crops)
    (0.1, '0.579-#15'),
    # DenseNet + random horiz flips only
    (0.25, '0.691-#23-tta-horiz'),
    # DenseNet + random crops + aug + TTA x 10 + LR restart
    (0.25, '0.732-#23-tta10'),
    (1.0, '0.750-#25'),
    # ... but trained on 256x256 crops
    (1.0, '0.742-#36'),
    # ... but trained using Hinge Loss (512x512)
    (1.0, '0.750-#30'),
    # CNN_Small 1-FC no aug
    (0.25, '0.665-#39'),
    # CNN_Small 2-FC (using best TTA) + SGD-m
    (0.1, '0.517-#48'),
    #
    ## 2nd generation
    ###
    (2.0, '0.859-d5-#95'),  # ema.9 0.885 | RETEST WITH NEW TTA256 [+] -> 0.871
    (1.5, 'dh5-#98'),       # ema.9 0.841 | RETEST WITH NEW TTA64  [+]
    (1.5, 'r3-#100'),       # ema.9 0.845 | RETEST WITH NEW TTA64  [-]
    (2.0, 'R3-#103'),       # ema.9 0.877 | RETEST WITH NEW TTA128 [+]
    (1.0, 'c6-#105'),       # ema.9 0.791 | RETEST WITH NEW TTA32  [+]

    #
    ## 3rd generation (disabled for this blend)
    ###
    # (0.75, 'dc1-#131'),   # TTA64 | ema.9 0.9126
    # (1.0, 'dhc1-#132'),   # TTA32 | ema.9 0.8822
    # (1.0, 'D1-#135'),     # TTA32 | ema.9 ~0.89
    # (1.0, 'Rc1-#133'),    # TTA32 | ema.9 0.9004
    # (1.0, 'Z1-#134'),     # TTA32 | ema.9 0.8995
    # (0.5, 'cc1-#137'),    # TTA32 | ema.9 0.7948
]
weights, dirs = zip(*proba_dirs)
# Every submission directory holds its scores in `proba.csv`.
proba_paths = ['../submissions/{0}/proba.csv'.format(d) for d in dirs]
P = avg_proba(*proba_paths, weights=list(weights))
proba_to_subm(P, '../submissions/averaged_gen12.csv')

[0.1, 0.25, 0.25, 1.0, 1.0, 1.0, 0.25, 0.1, 2.0, 1.5, 1.5, 2.0, 1.0, 0.75, 1.0, 1.0, 1.0, 1.0, 0.5]


### 4th generation (blending)

In [55]:
# (weight, submission directory) pairs of the 4th-generation models to blend.
proba_dirs = [
    # D121 256x256 d=0 weighted
    (1.0, '0.960-d-#181'),
    # best from D121 d=? (value missing in the original note)
    (0.5, '0.949-dw-0.3-#194'),
    # D121 512x512 d=0.1 (unw.)
    (0.5, '0.946-d-512-#191'),
    # R50 512x512 d=0.2 (unw.)
    (0.25, 'R-512-#189'),
    # R50 256x256 d=0 (unw.)
    (0.25, 'R-#190'),
    # TODO: C2 * 0.25
]
weights, dirs = zip(*proba_dirs)
# Every submission directory holds its scores in `proba.csv`.
proba_paths = ['../submissions/{0}/proba.csv'.format(d) for d in dirs]
P = avg_proba(*proba_paths, weights=list(weights))
proba_to_subm(P, '../submissions/averaged_gen4.csv')

[1.0, 1.0, 1.0, 1.0, 1.0]


### blending

### average all approaches

In [None]:
# (weight, submission directory) pairs for the overall blend.
proba_dirs = [
    #
    ##
    ### generations 1, 2
    (1.0, '0.913-#109-recomp'),
]

weights, dirs = zip(*proba_dirs)
# Every submission directory holds its scores in `proba.csv`.
proba_paths = ['../submissions/{0}/proba.csv'.format(d) for d in dirs]
P = avg_proba(*proba_paths, weights=list(weights))
proba_to_subm(P, '../submissions/all_averaged.csv')

## pseudo-labels[2]

In [11]:
# Collect, per class, the test images that model '0.960-d-#181' scores above THRESHOLD.
P = get_proba('../submissions/0.960-d-#181/proba.csv')
THRESHOLD = 0.99
pseudo_ind = {}
for c in range(10):
    confident = np.flatnonzero(P[:, c] > THRESHOLD).tolist()
    # Most confident image first; the sort is stable, so ties keep ascending index order.
    pseudo_ind[c] = sorted(confident, key=lambda i: -P[i, c])
# Number of pseudo-labelled images found for each class.
print([len(indices) for indices in pseudo_ind.values()])

[248, 103, 237, 242, 236, 252, 251, 206, 223, 229]


In [12]:
# Test-set indices of the five most confident images for class 0.
pseudo_ind[0][:5]

[1742, 361, 1386, 963, 2436]

In [13]:
# Test-set indices of the three most confident images for class 7.
pseudo_ind[7][:3]

[2138, 2171, 2220]

In [14]:
# Persist the pseudo-label indices; JSON stores the integer class keys as strings.
with open('../data/pseudo_ind.json', 'w') as out_file:
    out_file.write(json.dumps(pseudo_ind, indent=4, sort_keys=True))

In [31]:
# Reload the pseudo-label indices; class keys come back as strings ('0'..'9').
with open('../data/pseudo_ind.json') as in_file:
    pseudo_ind = json.loads(in_file.read())

In [28]:
# After the JSON round trip the class keys are strings; peek at class '0'.
pseudo_ind['0'][:5]

[1742, 361, 1386, 963, 2436]

### load and save most confident images for validation

In [18]:
# Test-set wrapper created without `lazy=True` (unlike in `proba_to_subm`);
# presumably this loads the images eagerly -- TODO confirm in `utils`.
test_dataset = KaggleCameraDataset('../data/', train=False)

In [19]:
N_PSEUDO_VAL_PER_CLASS = 24
N_PSEUDO_VAL = N_PSEUDO_VAL_PER_CLASS * 10
X_pseudo_val = np.zeros((N_PSEUDO_VAL, 512, 512, 3), dtype=np.uint8)

# (class, test index) of the most confident images, listed class by class.
selected = [(c, i)
            for c in range(10)
            for i in pseudo_ind[str(c)][:N_PSEUDO_VAL_PER_CLASS]]

y_pseudo_val = []
manip = []
for pos, (c, i) in enumerate(selected):
    # Each dataset item unpacks as ((image, manip flag), <ignored>).
    (x, m), _ = test_dataset[i]
    X_pseudo_val[pos] = np.asarray(x, dtype=np.uint8)
    manip.append(m)
    y_pseudo_val.append(c)

### merge with existing validation, shuffle and save

In [84]:
# Load the existing validation images and labels from disk.
X_val_val = np.load('../data/X_val_val.npy')
y_val_val = np.load('../data/y_val_val.npy')

In [85]:
# Append the pseudo-labelled images after the existing validation images.
X_val_with_pseudo = np.concatenate([X_val_val, X_pseudo_val], axis=0)
print('{0} {1}'.format(X_val_with_pseudo.shape, X_val_with_pseudo.dtype))

(480, 512, 512, 3) uint8


In [86]:
# Labels in the same order as the images: existing validation first, pseudo-labels after.
pseudo_labels = np.asarray(y_pseudo_val)
y_val_with_pseudo = np.concatenate([y_val_val, pseudo_labels], axis=0)
print('{0} {1}'.format(y_val_with_pseudo.shape, y_val_with_pseudo.dtype))

(480,) int64


In [87]:
# The existing validation images all get manip flag 0; the pseudo-labelled ones keep
# the flags collected while loading them.
zero_flags = np.asarray([float32(0.)] * len(y_val_val))
manip_with_pseudo = np.concatenate([zero_flags, manip], axis=0)
print('{0} {1}'.format(manip_with_pseudo.shape, manip_with_pseudo.dtype))

(480, 1) float32


In [90]:
# Shuffle images, labels and manip flags with one shared permutation so rows stay aligned.
# NOTE: the arrays are re-bound in place, so re-running this cell alone permutes the
# already shuffled data again; re-run the concatenation cells first.
n_val = len(X_val_with_pseudo)
assert len(y_val_with_pseudo) == n_val and len(manip_with_pseudo) == n_val, \
    'images, labels and manip flags must have the same length'
ind = np.arange(n_val)  # derived from the data instead of a hard-coded 480
RNG(seed=1234).shuffle(ind)
X_val_with_pseudo = X_val_with_pseudo[ind]
y_val_with_pseudo = y_val_with_pseudo[ind]
manip_with_pseudo = manip_with_pseudo[ind]
np.save('../data/X_val_with_pseudo.npy', X_val_with_pseudo)
np.save('../data/y_val_with_pseudo.npy', y_val_with_pseudo)
np.save('../data/manip_with_pseudo.npy', manip_with_pseudo)

In [97]:
# Sum of the manip flags over the merged validation set.
sum(manip_with_pseudo)

array([ 68.], dtype=float32)

In [102]:
# Sanity check: `float32(0.)` is indexable and its first element compares like a scalar.
float32(0.)[0] < 0.1

True

### remaining pseudo-labels for training

In [32]:
# Drop the first N_PSEUDO_VAL_PER_CLASS indices of every class (those went into the
# validation set) and switch from the string keys produced by JSON to integer class keys.
# NOTE: this expects the freshly loaded, string-keyed dict; reload the JSON before re-running.
pseudo_ind = {c: pseudo_ind[str(c)][N_PSEUDO_VAL_PER_CLASS:] for c in range(10)}
# Build the sizes by class id: the order of `dict.values()` is not guaranteed to be 0..9,
# yet this list is later indexed by class.
PSEUDO_CLASSES = [len(pseudo_ind[c]) for c in range(10)]
PSEUDO_CLASSES

[224, 79, 213, 218, 212, 228, 227, 182, 199, 205]

#### split into blocks of ~8 images and save them like the regular training set

In [33]:
PSEUDO_BLOCK_SIZE = 8
# Blocks per class: class size / block size rounded to the nearest integer
# (`np.round` sends exact halves to the even neighbour).
N_PSEUDO_BLOCKS = [int(np.round(float(n_images) / PSEUDO_BLOCK_SIZE))
                   for n_images in PSEUDO_CLASSES]
N_PSEUDO_BLOCKS

[28, 10, 27, 27, 26, 28, 28, 23, 25, 26]

In [None]:
for c in range(10):
    n_blocks = N_PSEUDO_BLOCKS[c]
    shuffled = pseudo_ind[c]
    # In-place shuffle with a per-class seed: `pseudo_ind[c]` itself gets reordered.
    RNG(seed=8888 + c).shuffle(shuffled)
    class_blocks = [[] for _ in range(n_blocks)]
    class_manip = [[] for _ in range(n_blocks)]
    # Deal the images round-robin, so block sizes differ by at most one.
    for pos, i in enumerate(progress_iter(shuffled, True)):
        (x, m), _ = test_dataset[i]
        slot = pos % n_blocks
        class_blocks[slot].append(np.asarray(x, dtype=np.uint8))
        class_manip[slot].append(m)
    # One image file and one manip-flag file per (class, block).
    for b in range(n_blocks):
        np.save('../data/X_pseudo_{0}_{1}.npy'.format(c, b), np.asarray(class_blocks[b], dtype=np.uint8))
        np.save('../data/manip_pseudo_{0}_{1}.npy'.format(c, b), np.asarray(class_manip[b], dtype=np.float32))

### manip ratio

In [40]:
# Manip flag of every pseudo-labelled training image, class by class.
# NOTE: this re-binds `manip`, which an earlier cell filled with the validation flags.
manip = [test_dataset[i][0][1] for c in range(10) for i in pseudo_ind[c]]
print(len(manip))
print(sum(manip))

1987
[ 972.]


## pseudo-labels

In [32]:
# P from 0.908-#109
ind = np.arange(2640)[P.max(axis=1) > 0.8]
print len(ind)

835


In [49]:
# Read the confidently predicted test images straight from their files.
test_data = KaggleCameraDataset('../data/', train=False)
X = np.zeros((len(ind), 512, 512, 3), dtype=np.uint8)
for pos, i in enumerate(ind):
    X[pos] = np.array(Image.open(test_data.X[i]), dtype=np.uint8)
# Pseudo-label = highest-scoring class of each selected row.
y = P.argmax(axis=1)[ind]

### stratified split into 500 for training and the remaining ~335 for validation

In [55]:
N_PSEUDO_TRAIN = 500
# Set test_size explicitly: with only train_size given, older scikit-learn releases
# fall back to test_size=0.1 and would keep just ~10% of the samples for validation
# instead of everything that is not used for training.
sss = StratifiedShuffleSplit(n_splits=1,
                             train_size=N_PSEUDO_TRAIN,
                             test_size=len(y) - N_PSEUDO_TRAIN,
                             random_state=1337)
train_ind, val_ind = next(sss.split(X, y))
assert len(train_ind) + len(val_ind) == len(y), 'split must cover every sample'
print(train_ind.shape)

(500,)


In [57]:
# Save images and labels of both parts of the split; each slice is built just before
# it is written, so only one copy is held at a time.
for path, source, rows in (('../data/X_pseudo_train.npy', X, train_ind),
                           ('../data/y_pseudo_train.npy', y, train_ind),
                           ('../data/X_pseudo_val.npy', X, val_ind),
                           ('../data/y_pseudo_val.npy', y, val_ind)):
    np.save(path, source[rows])