## Ensemble Blender

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
from pathlib import Path
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from fastai import *
from fastai.vision import *

from utils import open_4_channel
from resnet_v2 import Resnet4Channel

In [3]:
gpu_device = 0

defaults.device = torch.device(f'cuda:{gpu_device}')
torch.cuda.set_device(gpu_device)

In [4]:
path = Path('/home/haider/data/human_protein_atlas/')

In [5]:
# df = pd.read_csv(path/'train.csv')
# df.head()

In [6]:
np.random.seed(42)
src = (ImageItemList.from_csv(path, 'train.csv', folder='train', suffix='.png')
       .random_split_by_pct(0.2)
       .label_from_df(sep=' ',  classes=[str(i) for i in range(28)]))

In [7]:
src.train.x.create_func = open_4_channel
src.train.x.open = open_4_channel

In [8]:
src.valid.x.create_func = open_4_channel
src.valid.x.open = open_4_channel

In [9]:
test_ids = list(sorted({fname.split('_')[0] for fname in os.listdir(path/'test')}))

In [10]:
test_fnames = [path/'test'/test_id for test_id in test_ids]

In [11]:
test_fnames[:5]

[PosixPath('/home/haider/data/human_protein_atlas/test/00008af0-bad0-11e8-b2b8-ac1f6b6435d0'),
 PosixPath('/home/haider/data/human_protein_atlas/test/0000a892-bacf-11e8-b2b8-ac1f6b6435d0'),
 PosixPath('/home/haider/data/human_protein_atlas/test/0006faa6-bac7-11e8-b2b7-ac1f6b6435d0'),
 PosixPath('/home/haider/data/human_protein_atlas/test/0008baca-bad7-11e8-b2b9-ac1f6b6435d0'),
 PosixPath('/home/haider/data/human_protein_atlas/test/000cce7e-bad4-11e8-b2b8-ac1f6b6435d0')]

In [12]:
src.add_test(test_fnames, label='0');

In [13]:
src.test.x.create_func = open_4_channel
src.test.x.open = open_4_channel

In [14]:
protein_stats = ([0.08069, 0.05258, 0.05487, 0.08282], [0.13704, 0.10145, 0.15313, 0.13814])

In [15]:
trn_tfms,_ = get_transforms(do_flip=True, flip_vert=True, max_rotate=30., max_zoom=1,
                      max_lighting=0.05, max_warp=0.)

In [16]:
NUM_WORKERS = 8
BS = 64//4

In [17]:
data = (src.transform((trn_tfms, _), size=512)
        .databunch(bs=BS, num_workers=NUM_WORKERS).normalize(protein_stats))

#        .normalize(imagenet_stats)

In [18]:
# data.show_batch(rows=3, figsize=(12,9))

In [19]:
def resnet50(pretrained):
    return Resnet4Channel(encoder_depth=50)

In [20]:
# copied from https://github.com/fastai/fastai/blob/master/fastai/vision/learner.py
def _resnet_split(m): 
    # import pdb; pdb.set_trace()  ; m[0] returns the body ; m[0][6] returns the 7th block of the body ; m[1] returns the head; 
    # so it seems this function specifies the points that divides the 3 groups for the differential learning rates
    return (m[0][6],m[1])

In [21]:
f1_score = partial(fbeta, thresh=0.2, beta=1)

In [22]:
learn = create_cnn(
    data,
    resnet50,
    cut=-2,
    split_on=_resnet_split,
    loss_func=F.binary_cross_entropy_with_logits,
    path=path,    
    metrics=[f1_score], 
)

In [23]:
models = ['stage-2-rn50_sz512', 
          'stage-2-rn50_sz512-os', 
          'stage-2-rn50_sz512_both', 
          'stage-2-rn50_sz512-os-from-extrads']


In [24]:

preds = torch.zeros([4, 6214, 28])
y = torch.zeros([5, 6214, 28])
for i,m in enumerate(models):
    learn.load(m)
    preds[i],y[i] = learn.get_preds(DatasetType.Valid)
    
            
    
    
    
    

In [25]:
preds.size()

torch.Size([4, 6214, 28])

In [26]:
preds = preds.mean(0)
preds.size()

torch.Size([6214, 28])

In [27]:
y.size()

torch.Size([5, 6214, 28])

In [28]:
y = y.mean(0)
y.size()

torch.Size([6214, 28])

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import scipy.optimize as opt

# preds_y = model.predict(valid_x) # in his code mine is : preds_y -> preds; valid_y -> y

thresholds = np.linspace(0, 1, 1500)
score = 0.0
best_threshold=0.0
best_val = 0.0
for threshold in thresholds:
    score = f1_score(y > 0.5, preds > threshold, average='macro')
    if score > best_val:
        best_threshold = threshold
        best_val = score
    print("Threshold %0.4f, F1: %0.4f" % (threshold,score))

print("BEST: %0.5f, F1: %0.5f" % (best_threshold,best_val))



Threshold 0.0000, F1: 0.0992
Threshold 0.0007, F1: 0.1362
Threshold 0.0013, F1: 0.1640
Threshold 0.0020, F1: 0.1872
Threshold 0.0027, F1: 0.2074
Threshold 0.0033, F1: 0.2253
Threshold 0.0040, F1: 0.2412
Threshold 0.0047, F1: 0.2555
Threshold 0.0053, F1: 0.2704
Threshold 0.0060, F1: 0.2834
Threshold 0.0067, F1: 0.2963
Threshold 0.0073, F1: 0.3080
Threshold 0.0080, F1: 0.3179
Threshold 0.0087, F1: 0.3286
Threshold 0.0093, F1: 0.3402
Threshold 0.0100, F1: 0.3499
Threshold 0.0107, F1: 0.3624
Threshold 0.0113, F1: 0.3721
Threshold 0.0120, F1: 0.3823
Threshold 0.0127, F1: 0.3912
Threshold 0.0133, F1: 0.4009
Threshold 0.0140, F1: 0.4070
Threshold 0.0147, F1: 0.4156
Threshold 0.0153, F1: 0.4230
Threshold 0.0160, F1: 0.4279
Threshold 0.0167, F1: 0.4352
Threshold 0.0173, F1: 0.4408
Threshold 0.0180, F1: 0.4452
Threshold 0.0187, F1: 0.4545
Threshold 0.0193, F1: 0.4613
Threshold 0.0200, F1: 0.4663
Threshold 0.0207, F1: 0.4709
Threshold 0.0213, F1: 0.4762
Threshold 0.0220, F1: 0.4816
Threshold 0.02

  'precision', 'predicted', average, warn_for)


Threshold 0.4463, F1: 0.7294
Threshold 0.4470, F1: 0.7291
Threshold 0.4476, F1: 0.7285
Threshold 0.4483, F1: 0.7282
Threshold 0.4490, F1: 0.7282
Threshold 0.4496, F1: 0.7280
Threshold 0.4503, F1: 0.7295
Threshold 0.4510, F1: 0.7294
Threshold 0.4516, F1: 0.7291
Threshold 0.4523, F1: 0.7290
Threshold 0.4530, F1: 0.7290
Threshold 0.4536, F1: 0.7290
Threshold 0.4543, F1: 0.7290
Threshold 0.4550, F1: 0.7289
Threshold 0.4556, F1: 0.7290
Threshold 0.4563, F1: 0.7283
Threshold 0.4570, F1: 0.7276
Threshold 0.4576, F1: 0.7277
Threshold 0.4583, F1: 0.7277
Threshold 0.4590, F1: 0.7274
Threshold 0.4596, F1: 0.7274
Threshold 0.4603, F1: 0.7272
Threshold 0.4610, F1: 0.7271
Threshold 0.4616, F1: 0.7272
Threshold 0.4623, F1: 0.7270
Threshold 0.4630, F1: 0.7262
Threshold 0.4636, F1: 0.7262
Threshold 0.4643, F1: 0.7260
Threshold 0.4650, F1: 0.7259
Threshold 0.4656, F1: 0.7258
Threshold 0.4663, F1: 0.7255
Threshold 0.4670, F1: 0.7257
Threshold 0.4676, F1: 0.7256
Threshold 0.4683, F1: 0.7254
Threshold 0.46

In [30]:
preds_np= to_np(preds)

thresholds = np.linspace(0, 1, 100)
score = 0.0
test_threshold=0.5*np.ones(28)
best_threshold=np.zeros(28)
best_val = np.zeros(28)
for i in range(28):
    for threshold in thresholds:
        test_threshold[i] = threshold
        # max_val = np.max(preds)
        # import pdb; pdb.set_trace()
        val_predict = (preds_np > test_threshold)
        score = f1_score(y > 0.5, val_predict, average='macro')
        if score > best_val[i]:
            best_threshold[i] = threshold
            best_val[i] = score

    print("Threshold[%d] %0.6f, F1: %0.6f" % (i,best_threshold[i],best_val[i]))
    test_threshold[i] = best_threshold[i]
print("Best threshold: ")
print(best_threshold)
print("Best f1:")
print(best_val)

Threshold[0] 0.404040, F1: 0.714238
Threshold[1] 0.242424, F1: 0.715214
Threshold[2] 0.313131, F1: 0.716193
Threshold[3] 0.363636, F1: 0.716742
Threshold[4] 0.252525, F1: 0.717982
Threshold[5] 0.303030, F1: 0.719480
Threshold[6] 0.333333, F1: 0.721254
Threshold[7] 0.333333, F1: 0.722005
Threshold[8] 0.090909, F1: 0.732209
Threshold[9] 0.121212, F1: 0.741608
Threshold[10] 0.313131, F1: 0.744355
Threshold[11] 0.262626, F1: 0.744684
Threshold[12] 0.303030, F1: 0.745787
Threshold[13] 0.222222, F1: 0.747507
Threshold[14] 0.484848, F1: 0.747788
Threshold[15] 0.141414, F1: 0.754931
Threshold[16] 0.282828, F1: 0.758583
Threshold[17] 0.424242, F1: 0.762863
Threshold[18] 0.282828, F1: 0.764512
Threshold[19] 0.303030, F1: 0.766572
Threshold[20] 0.242424, F1: 0.771582
Threshold[21] 0.212121, F1: 0.773855
Threshold[22] 0.292929, F1: 0.776705
Threshold[23] 0.303030, F1: 0.777599
Threshold[24] 0.646465, F1: 0.778439
Threshold[25] 0.363636, F1: 0.779286
Threshold[26] 0.272727, F1: 0.785029
Threshold[2

In [31]:
best_threshold

array([0.40404 , 0.242424, 0.313131, 0.363636, 0.252525, 0.30303 , 0.333333, 0.333333, 0.090909, 0.121212, 0.313131,
       0.262626, 0.30303 , 0.222222, 0.484848, 0.141414, 0.282828, 0.424242, 0.282828, 0.30303 , 0.242424, 0.212121,
       0.292929, 0.30303 , 0.646465, 0.363636, 0.272727, 0.030303])

**Submission**

In [32]:
name = 'ensemble_4_mdls_after_training_extrads_in_2_days'

preds_t,_ = learn.get_preds(DatasetType.Test)
pred_labels = [' '.join(list([str(i) for i in np.nonzero(row>best_threshold)[0]])) for row in np.array(preds_t)]
df = pd.DataFrame({'Id':test_ids,'Predicted':pred_labels})
df.to_csv(path/'subs'/f'{name}.csv', header=True, index=False)

In [33]:
!kaggle competitions submit -c human-protein-atlas-image-classification -f {path}/subs/{name}.csv -m "'ensemble_4_mdls_in_rush'"

100%|████████████████████████████████████████| 469k/469k [00:13<00:00, 35.6kB/s]
Successfully submitted to Human Protein Atlas Image Classification