# hc ensemble

## common

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import multiprocessing as mp
from datetime import datetime
import traceback
import sys
sys.path.insert(0, '../../data/siim-pneumothorax')

import torch
import torch.nn as nn
import torch.nn.functional as F
import fastai
from fastai.vision import *
from mask_functions import *
from fastai.callbacks import SaveModelCallback
import gc
from sklearn.model_selection import KFold
from PIL import Image
from mpl_toolkits.mplot3d import Axes3D

fastai.__version__

'1.0.54'

In [2]:
# Directory holding the competition CSVs that later cells read
# (test_ids.csv and sample_submission_leak.csv are joined onto this path).
data_path = '../../data/siim-pneumothorax'

In [3]:
# Load the test image ids; this list later becomes the ImageId column of the
# submission, so its order fixes the row order of the output file.
test_ids_csv = os.path.join(data_path, 'test_ids.csv')
id_df = pd.read_csv(test_ids_csv)
ids = id_df['ImageId'].tolist()

print(len(ids))

1377


In [4]:
def use_leakage(ids, ptt, pt_clean, leak_csv=None):
    """Undo noise removal for every image that the leak file lists more than once.

    The leak CSV is grouped by ``ImageId``; ids that occur in more than one row
    are collected, and for each of them the entry in ``pt_clean`` is overwritten
    with the corresponding entry of ``ptt``.

    Args:
        ids: Image ids, aligned index-for-index with ``ptt`` and ``pt_clean``.
        ptt: Per-image predictions before noise removal.
        pt_clean: Per-image predictions after noise removal. Modified in place.
        leak_csv: Optional path to the leak CSV. When omitted, the file
            ``sample_submission_leak.csv`` under the notebook-level
            ``data_path`` is used, as before.

    Returns:
        ``pt_clean`` -- the same object that was passed in, after restoration.
    """
    if leak_csv is None:
        leak_csv = os.path.join(data_path, "sample_submission_leak.csv")
    leak_sample_df = pd.read_csv(leak_csv)

    # this part was taken from @raddar's kernel: https://www.kaggle.com/raddar/better-sample-submission
    row_counts = leak_sample_df.groupby('ImageId')['ImageId'].count().reset_index(name='N')
    # A set gives constant-time lookups; testing membership against the raw
    # numpy array would rescan every leaked id for each test image.
    multi_mask_ids = set(row_counts.loc[row_counts.N > 1, 'ImageId'])

    # restore preds with multiple masks
    for i, uid in enumerate(ids):
        if uid in multi_mask_ids:
            pt_clean[i] = ptt[i]

    return pt_clean

## common config

In [5]:
# sz scales the noise-removal threshold and, together with tag, names the
# submission file written at the end of the notebook.
sz, tag = 512, 'en2'

# One entry per ensemble member; the three lists are aligned by position.
pts = ['hc_pt_20190729-122311_512_fold0', 'hc_pt_20190729-122311_512_fold1']  # saved prediction tensors
thrs = [0.25, 0.25]  # threshold that binarises each member's predictions
n_thrs = [75, 75]  # minimum mask size (before scaling by sz) kept by noise removal

In [6]:
# Each member needs its own (thr, n_thr) pair. zip() would silently drop the
# unmatched entries, while the majority vote afterwards still uses len(pts).
assert len(pts) == len(thrs) == len(n_thrs), 'pts, thrs and n_thrs must have the same length'
assert len(pts) > 0, 'no predictions to ensemble'

# Running per-pixel vote count; starts empty so a re-run never reuses stale state.
pt_ensemble = None

for i, (pt, thr, n_thr) in enumerate(zip(pts, thrs, n_thrs)):
    print('i: {}, pt: {}, thr: {}, n_thr: {}'.format(i, pt, thr, n_thr))

    # Load onto the CPU: the result is converted with .numpy() further down,
    # which is only possible for CPU tensors.
    ptt = torch.load(pt, map_location='cpu')

    # hard mask
    ptt = (ptt > thr).float()

    pt_mask = ptt.clone()

    # noise removal: blank every image whose positive-pixel count is below the
    # member's threshold, scaled from a 128x128 reference to sz x sz
    min_pixels = n_thr * (sz / 128.0) ** 2
    pixel_counts = pt_mask.view(pt_mask.shape[0], -1).sum(-1)
    pt_mask[pixel_counts < min_pixels, ...] = 0.0

    # leakage (restores, in place, the raw mask for ids the leak file repeats)
    pt_mask = use_leakage(ids, ptt, pt_mask)

    if pt_ensemble is None:
        pt_ensemble = pt_mask
    else:
        pt_ensemble += pt_mask

    gc.collect()
    torch.cuda.empty_cache()

i: 0, pt: hc_pt_20190729-122311_512_fold0, thr: 0.25, n_thr: 75
i: 1, pt: hc_pt_20190729-122311_512_fold1, thr: 0.25, n_thr: 75


In [7]:
# majority vote: keep a pixel only when strictly more than half of the
# ensemble members marked it (a tie counts as background)
vote_threshold = len(pts) / 2.
pt_ensemble = (pt_ensemble > vote_threshold).long()

In [8]:
# Generate rle encodings in parallel (images are first converted to the original size)
mask_size = 1024

def mask_worker(mask):
    """Upscale one binary mask to ``mask_size`` x ``mask_size`` and RLE-encode it.

    Args:
        mask: 2-D numpy array of 0/1 values for a single image.

    Returns:
        Whatever ``mask2rle`` returns for the resized mask (the export cell
        treats an empty string as "no mask").
    """
    # The mask is transposed and scaled to 0/255 before being turned into an image.
    im = PIL.Image.fromarray((mask.T*255).astype(np.uint8))
    # Pin nearest-neighbour resampling: Pillow's default filter for resize()
    # depends on the installed version, and an interpolating filter would
    # introduce values between 0 and 255 along mask edges.
    # NOTE(review): presumably mask2rle expects strictly 0/255 input -- confirm.
    im = im.resize((mask_size, mask_size), resample=PIL.Image.NEAREST)
    im = np.asarray(im)

    return mask2rle(im, mask_size, mask_size)

# Encode every mask in a worker pool. The context manager shuts the worker
# processes down once map() has returned, instead of leaving an unclosed pool
# alive for the rest of the kernel session.
with mp.Pool() as pool:
    rle_list = pool.map(mask_worker, pt_ensemble.numpy())

In [9]:
# ok, export submission csv

sub_df = pd.DataFrame({'ImageId': ids, 'EncodedPixels': rle_list})

# images with an empty encoding are written as '-1'
is_empty = sub_df['EncodedPixels'] == ''
sub_df.loc[is_empty, 'EncodedPixels'] = '-1'

out_name = 'ensemble_sub_{}_{}.csv'.format(tag, sz)
sub_df.to_csv(out_name, index=False)

sub_df.head(20)

Unnamed: 0,ImageId,EncodedPixels
0,1.2.276.0.7230010.3.1.4.8323329.6106.151787519...,-1
1,1.2.276.0.7230010.3.1.4.8323329.6588.151787519...,-1
2,1.2.276.0.7230010.3.1.4.8323329.6014.151787519...,-1
3,1.2.276.0.7230010.3.1.4.8323329.6813.151787520...,-1
4,1.2.276.0.7230010.3.1.4.8323329.699.1517875164...,-1
5,1.2.276.0.7230010.3.1.4.8323329.6236.151787519...,-1
6,1.2.276.0.7230010.3.1.4.8323329.6680.151787519...,-1
7,1.2.276.0.7230010.3.1.4.8323329.6967.151787520...,-1
8,1.2.276.0.7230010.3.1.4.8323329.6923.151787520...,-1
9,1.2.276.0.7230010.3.1.4.8323329.6744.151787519...,-1
