In [60]:
import sys
print(sys.path)
import seaborn as sns
sns.set()
# Make the fastai checkout under /tmp/fastai/old importable (the next cell
# imports fastai.conv_learner / fastai.dataset). Guard the append so that
# re-running this cell does not keep adding the same entry to sys.path.
FASTAI_OLD_PATH = "/tmp/fastai/old"
if FASTAI_OLD_PATH not in sys.path:
    sys.path.append(FASTAI_OLD_PATH)

['', '/opt/conda/lib/python36.zip', '/opt/conda/lib/python3.6', '/opt/conda/lib/python3.6/lib-dynload', '/opt/conda/lib/python3.6/site-packages', '/opt/conda/lib/python3.6/site-packages/torchvision-0.2.1-py3.6.egg', '/opt/conda/lib/python3.6/site-packages/IPython/extensions', '/home/yukimiki/.ipython', '/tmp/fastai/old', '/tmp/fastai/old', '/tmp/fastai/old']


In [61]:
from fastai.conv_learner import *
from fastai.dataset import *

In [62]:
from datetime import datetime
import pandas as pd
import numpy as np
# Seed NumPy's global random generator as soon as NumPy is imported.
np.random.seed(seed=32)
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import scipy.optimize as opt
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and numerical (overflow / divide) warnings.
warnings.filterwarnings('ignore')

import tensorflow as tf
#from torchvision.transforms import RandomHorizontalFlip, RandomVerticalFlip
from tensorboard_cb import TensorboardLogger
import torch
# Seed PyTorch's default generator and the generators of all CUDA devices.
torch.manual_seed(7)
torch.cuda.manual_seed_all(7)
import utils_pytorch

In [63]:
# Notebook-wide settings.
# `nw` is forwarded as `num_workers` to ImageData inside get_data().
nw = 20   #number of workers for data loader
# `arch` is handed to ConvLearner.pretrained() in the model-setup cell, which
# also re-assigns it to the same value.
arch = resnet34 #specify target architecture

In [64]:
# Unique sample ids found on disk: the first ID_LEN characters of every file
# name (presumably the UUID-style image id shared by a sample's files -
# TODO confirm against the data layout).
# The ids are sorted because iterating a set of strings follows per-process
# string hashing, so list(set) would give a different order on every kernel
# restart.
ID_LEN = 36
train_names = sorted({f[:ID_LEN] for f in os.listdir(utils_pytorch.TRAIN)})
test_names = sorted({f[:ID_LEN] for f in os.listdir(utils_pytorch.TEST)})


def _stratify_key(target):
    """Return the stratification bucket for one 'Target' label string.

    Any target containing the substring '27' goes into bucket '0'; every other
    target is bucketed by its first three characters.
    """
    return '0' if '27' in target else target[:3]


# 90/10 train/validation split over the label table, stratified on the bucket
# above and seeded so the split is repeatable.
data_info = pd.read_csv(utils_pytorch.LABELS)
tr_n, val_n = train_test_split(data_info, test_size=0.1,
                               stratify=data_info['Target'].map(_stratify_key),
                               random_state=42)
tr_n = tr_n['Id'].tolist()
val_n = val_n['Id'].tolist()

In [65]:
def get_data(sz,bs,stats=None):
    """Build the fastai ``ImageData`` object for the train/validation/test sets.

    Reads the notebook globals ``tr_n``, ``val_n``, ``test_names`` and ``nw``.

    Args:
        sz: image size forwarded to ``tfms_from_stats``.
        bs: batch size; the training id list is cut down to a whole number of
            batches.
        stats: optional normalisation statistics passed through ``A(...)``
            (presumably a (mean, std) pair with one value per channel, like
            the built-in default - TODO confirm). ``None`` selects the
            built-in default.

    Returns:
        The ``ImageData`` instance wrapping the three datasets.
    """
    #data augmentation
    aug_tfms = [RandomRotate(45, tfm_y=TfmType.NO),
                RandomFlip(),
                RandomDihedral(tfm_y=TfmType.NO),
                RandomLighting(0.05, 0.05, tfm_y=TfmType.NO)]
    #mean and std of each channel in the train set
    #stats = A([0.08069, 0.05258, 0.05487, 0.08282], [0.13704, 0.10145, 0.15313, 0.13814])
    if stats is None:
        stats = A([0.0804419, 0.05262986, 0.05474701, 0.08270896],
                  [0.13000701, 0.08796628, 0.1386317, 0.12718021]) # calculated by the notebook author
    else:
        stats = A(stats)
    tfms = tfms_from_stats(stats, sz, crop_type=CropType.NO, tfm_y=TfmType.NO,
                aug_tfms=aug_tfms)
    # Drop the trailing partial batch. The previous slice
    # tr_n[:-(len(tr_n) % bs)] collapsed to tr_n[:0] (an empty training set)
    # whenever len(tr_n) was an exact multiple of bs.
    n_full = len(tr_n) - len(tr_n) % bs
    ds = ImageData.get_ds(utils_pytorch.pdFilesDataset, (tr_n[:n_full],utils_pytorch.TRAIN),
                (val_n,utils_pytorch.TRAIN), tfms, test=(test_names,utils_pytorch.TEST))
    md = ImageData(utils_pytorch.PATH, ds, bs, num_workers=nw, classes=None)
    return md

In [66]:
# Build the data object and restore a previously trained model for inference.
sz = 256 #image size
bs = 64  #batch size

arch = resnet34 #specify target architecture
# dir_name = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S')
# print(dir_name)
# dir_path = os.path.join('test', dir_name)
#best_model_path = '20181201025910_size512_best_resnet'


md = get_data(sz,bs)
learner = utils_pytorch.ConvLearner.pretrained(arch, md, ps=0.5) #dropout 50%
# Name of the saved weights to restore; presumably the name encodes the run's
# timestamp, image size, batch size, learning rate and architecture - verify
# against the training notebook.
pretrained_model_name = '20181206004817_size256_B64_lr0.01_resnet34_best_resnet'
learner.load(pretrained_model_name)
#learner.load(best_model_path)
# Re-attach the data object after the weights have been loaded.
learner.set_data(md)

# learner.opt_fn = optim.Adam
# learner.clip = 1.0 #gradient clipping
# learner.crit = utils_pytorch.FocalLoss()
# learner.metrics = [utils_pytorch.acc, utils_pytorch.f1_torch]
# tb_logger = TensorboardLogger(learner.model, md, dir_path, metrics_names=['acc', 'f1'])
#save_best_model = SaveBestModel(model=learner.model)

In [67]:
def sigmoid_np(x):
    """Element-wise logistic function 1 / (1 + exp(-x)) computed with NumPy."""
    decay = np.exp(-x)
    return 1.0 / (1.0 + decay)

# Test-time augmentation with n_aug=16. Called without is_test, so this
# presumably scores the validation split - confirm against learner.TTA.
preds,y = learner.TTA(n_aug=16)
# Stack the per-augmentation outputs along a new trailing axis.
preds = np.stack(preds, axis=-1)
# Squash the raw outputs with the logistic function.
preds = sigmoid_np(preds)
# Keep, for every sample/class, the highest value across the stacked axis.
pred = preds.max(axis=-1)



In [68]:
def F1_soft(preds,targs,th=0.5,d=50.0):
    """Smooth per-column F1 score of ``preds`` against ``targs``.

    Hard thresholding is replaced by a steep sigmoid so the score varies
    continuously with the threshold and can be passed to an optimiser.

    Args:
        preds: array of scores; reduced over axis 0.
        targs: array of target indicators with the same layout as ``preds``
            (any numeric or boolean dtype; converted to float64).
        th: scalar threshold, or one threshold per column.
        d: sigmoid steepness; larger values approach a hard ``preds > th``.

    Returns:
        ``2 * sum(p * t) / (sum(p + t) + 1e-6)`` per column, where ``p`` is the
        soft-thresholded prediction. The 1e-6 avoids a zero denominator.
    """
    preds = sigmoid_np(d*(preds - th))
    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; np.float64 is the same dtype and works on every version.
    targs = np.asarray(targs).astype(np.float64)
    score = 2.0*(preds*targs).sum(axis=0)/((preds+targs).sum(axis=0) + 1e-6)
    return score

def fit_val(x,y):
    """Fit one threshold per class by least squares on the soft F1 score.

    The residual vector is the gap between ``F1_soft(x, y, p)`` and a perfect
    score of 1, followed by a small penalty pulling every threshold towards
    0.5. Optimisation starts from 0.5 for each of the
    ``len(utils_pytorch.name_label_dict)`` classes.

    Returns:
        The fitted threshold array returned by ``scipy.optimize.leastsq``.
    """
    n_classes = len(utils_pytorch.name_label_dict)
    start = 0.5*np.ones(n_classes)
    penalty_weight = 1e-5

    def residuals(thresholds):
        f1_gap = F1_soft(x,y,thresholds) - 1.0
        pull_to_half = penalty_weight*(thresholds - 0.5)
        return np.concatenate((f1_gap, pull_to_half), axis=None)

    fitted, _status = opt.leastsq(residuals, start)
    return fitted

In [69]:
# Fit per-class thresholds on `pred` / `y` and floor them at 0.1.
th_fitted = fit_val(pred,y)
th = np.where(th_fitted < 0.1, 0.1, th_fitted)

# Binarise once per threshold choice and reuse the masks for every metric.
hits_fitted = pred > th
hits_half = pred > 0.5
print('Thresholds: ',th)
print('F1 macro: ',f1_score(y, hits_fitted, average='macro'))
print('F1 macro (th = 0.5): ',f1_score(y, hits_half, average='macro'))
print('F1 micro: ',f1_score(y, hits_fitted, average='micro'))

Thresholds:  [0.53734 0.63686 0.62736 0.57562 0.62739 0.57219 0.58239 0.6307  0.56685 0.70104 0.70727 0.64738 0.55646
 0.52518 0.5653  0.55973 0.49559 0.46114 0.48565 0.54266 0.57667 0.53584 0.58586 0.61077 0.75063 0.51474
 0.60322 0.29303]
F1 macro:  0.6999953975805191
F1 macro (th = 0.5):  0.6408342696559723
F1 micro:  0.7649950835791542


In [70]:
# Share of samples predicted positive per class at the fitted thresholds.
print('Fractions: ',(pred > th).mean(axis=0))
# Share of samples whose true label is positive. The labels are 0/1
# indicators, so they are binarised at a fixed 0.5: comparing them with the
# fitted thresholds would report 0 for any class whose threshold reached 1.
print('Fractions (true): ',(y > 0.5).mean(axis=0))

Fractions:  [0.46654 0.03507 0.10425 0.04408 0.04987 0.06853 0.02381 0.07561 0.00129 0.00129 0.00097 0.03282 0.02027
 0.01609 0.03378 0.00032 0.01094 0.00611 0.03121 0.0473  0.00225 0.12677 0.01802 0.08848 0.0074  0.32014
 0.00837 0.00129]
Fractions (true):  [0.42535 0.03861 0.11776 0.04923 0.05856 0.07497 0.03185 0.08784 0.00129 0.00161 0.00097 0.03539 0.0222
 0.01544 0.03636 0.00064 0.01673 0.00804 0.03024 0.04698 0.00547 0.12323 0.02574 0.09299 0.0103  0.26062
 0.01062 0.00032]


In [71]:
# stats = np.array([[0.05908022, 0.04532852, 0.04065233, 0.05923426], [0.10371015, 0.07984633, 0.10664798, 0.09878183]])
# md = get_data(sz,bs, stats)
# learner.set_data(md)

In [72]:
# Same test-time-augmentation pipeline as for `pred`, but with is_test=True so
# the learner scores the test dataset.
preds_t,y_t = learner.TTA(n_aug=16,is_test=True)
# Stack the per-augmentation outputs along a new trailing axis, then squash.
preds_t = np.stack(preds_t, axis=-1)
preds_t = sigmoid_np(preds_t)
# Reduce across the stacked axis with max rather than mean.
pred_t = preds_t.max(axis=-1) #max works better for F1 macro score



In [73]:
def save_pred(pred, th=0.5, fname='protein_classification.csv', use_leak=False):
    """Threshold test predictions and write them as a submission CSV.

    Reads the notebook global ``learner`` for the test file names and
    ``utils_pytorch.SAMPLE`` for the required row order.

    Args:
        pred: iterable of per-sample score rows, in the same order as
            ``learner.data.test_ds.fnames``.
        th: scalar threshold, or an array with one threshold per class.
        fname: path of the CSV to write.
        use_leak: when true, the prediction of every test id found in the
            'Test' column of ``./data/test_matches.csv`` is replaced by the
            value in that file's sixth column (presumably the known target
            string - confirm against the CSV header).

    Raises:
        KeyError: if the sample submission lists an id that is missing from
            the test dataset.
    """
    leak_tests = None
    if use_leak:
        print('use leak')
        # Only touch the leak file when it is actually needed; it used to be
        # read, and queried once per test id, even with use_leak=False.
        leak_df = pd.read_csv('./data/test_matches.csv')
        leak_tests = leak_df['Test']

    pred_dic = {}
    for key, line in zip(learner.data.test_ds.fnames, pred):
        # Space-separated indices of the classes whose score exceeds th.
        labels = ' '.join(str(i) for i in np.nonzero(line>th)[0])
        if leak_tests is not None:
            # Literal substring match of the id; rows with a missing 'Test'
            # value never match.
            matches = leak_df[leak_tests.str.contains(key, regex=False, na=False)]
            if len(matches) > 0:
                labels = matches.iloc[0,5]
        pred_dic[key] = labels

    # Emit the rows in the order required by the sample submission.
    sample_df = pd.read_csv(utils_pytorch.SAMPLE)
    sample_list = list(sample_df.Id)
    pred_list_cor = [pred_dic[sample_id] for sample_id in sample_list]
    df = pd.DataFrame({'Id':sample_list,'Predicted':pred_list_cor})
    df.to_csv(fname, header=True, index=False)

In [74]:
# Hand-entered thresholds, one per class (28 values). Their origin is not
# recorded in this notebook - TODO document where they come from.
th_t = np.array([0.565,0.39,0.55,0.345,0.33,0.39,0.33,0.45,0.38,0.39,
               0.34,0.42,0.31,0.38,0.49,0.50,0.38,0.43,0.46,0.40,
               0.39,0.505,0.37,0.47,0.41,0.545,0.32,0.1])
# Share of test samples predicted positive per class at these thresholds.
print('Fractions: ',(pred_t > th_t).mean(axis=0))
# Writes the default 'protein_classification.csv', with the leak lookup enabled.
save_pred(pred_t,th_t, use_leak=True)

Fractions:  [0.44061 0.05691 0.11246 0.10007 0.10212 0.15955 0.08084 0.11596 0.00273 0.00214 0.00171 0.06597 0.05811
 0.02478 0.04991 0.00026 0.04247 0.02205 0.04546 0.09212 0.00957 0.18698 0.04751 0.13587 0.01786 0.34669
 0.05204 0.03435]
use leak
found leak data ! key:d48b2036-bacc-11e8-b2b8-ac1f6b6435d0, target:14
found leak data ! key:107d6830-bac6-11e8-b2b7-ac1f6b6435d0, target:19 0
found leak data ! key:7929949a-bad4-11e8-b2b8-ac1f6b6435d0, target:25 17
found leak data ! key:1bcde1d2-bac7-11e8-b2b7-ac1f6b6435d0, target:27
found leak data ! key:e3279d4c-bad4-11e8-b2b8-ac1f6b6435d0, target:13
found leak data ! key:8f257b9c-bacf-11e8-b2b8-ac1f6b6435d0, target:21
found leak data ! key:d79a1e12-bad1-11e8-b2b8-ac1f6b6435d0, target:16
found leak data ! key:af2c5f2e-bac9-11e8-b2b8-ac1f6b6435d0, target:25 14
found leak data ! key:04eef62c-bad6-11e8-b2b9-ac1f6b6435d0, target:12 25 0
found leak data ! key:1a15e75a-bad5-11e8-b2b8-ac1f6b6435d0, target:16 14 17 0
found leak data ! key:869a7f8c

In [75]:
# Score the current test thresholds `th_t` on the validation predictions,
# next to the flat 0.5 baseline.
val_hits_th_t = pred > th_t
val_hits_half = pred > 0.5
print('Thresholds: ',th_t)
print('F1 macro: ',f1_score(y, val_hits_th_t, average='macro'))
print('F1 macro (th = 0.5): ',f1_score(y, val_hits_half, average='macro'))
print('F1 micro: ',f1_score(y, val_hits_th_t, average='micro'))

Thresholds:  [0.565 0.39  0.55  0.345 0.33  0.39  0.33  0.45  0.38  0.39  0.34  0.42  0.31  0.38  0.49  0.5   0.38  0.43
 0.46  0.4   0.39  0.505 0.37  0.47  0.41  0.545 0.32  0.1  ]
F1 macro:  0.5550297956508498
F1 macro (th = 0.5):  0.6408342696559723
F1 micro:  0.668060133451242


In [76]:
# Target fraction of positive samples for each of the 28 classes; used as the
# fitting target of fit_test() for the test-set thresholds.
# NOTE(review): "lb" presumably stands for leaderboard, i.e. class frequencies
# estimated from leaderboard feedback - confirm the source of these numbers.
lb_prob = [
 0.362397820,0.043841336,0.075268817,0.059322034,0.075268817,
 0.075268817,0.043841336,0.075268817,0.010000000,0.010000000,
 0.010000000,0.043841336,0.043841336,0.014198783,0.043841336,
 0.010000000,0.028806584,0.014198783,0.028806584,0.059322034,
 0.010000000,0.126126126,0.028806584,0.075268817,0.010000000,
 0.222493880,0.028806584,0.010000000]
# Entries that were 0 have been replaced by 0.01, because an exact 0 may only
# be the result of a rounding error.

In [77]:
def Count_soft(preds,th=0.5,d=50.0):
    """Smooth per-column fraction of predictions above ``th``.

    Each score goes through a sigmoid of steepness ``d`` centred on ``th``
    (a soft version of ``preds > th``) and the result is averaged over axis 0.
    """
    soft_hits = sigmoid_np(d*(preds - th))
    return soft_hits.mean(axis=0)

def fit_test(x,y):
    """Fit one threshold per class so the predicted fractions match ``y``.

    The residual vector is the gap between ``Count_soft(x, p)`` and the target
    fractions ``y``, followed by a small penalty pulling every threshold
    towards 0.5. Optimisation starts from 0.5 for each of the
    ``len(utils_pytorch.name_label_dict)`` classes.

    Returns:
        The fitted threshold array returned by ``scipy.optimize.leastsq``.
    """
    n_classes = len(utils_pytorch.name_label_dict)
    start = 0.5*np.ones(n_classes)
    penalty_weight = 1e-5

    def residuals(thresholds):
        fraction_gap = Count_soft(x,thresholds) - y
        pull_to_half = penalty_weight*(thresholds - 0.5)
        return np.concatenate((fraction_gap, pull_to_half), axis=None)

    fitted, _status = opt.leastsq(residuals, start)
    return fitted

In [78]:
# Fit test thresholds so the predicted class fractions match `lb_prob`,
# then floor them at 0.1.
th_t_fitted = fit_test(pred_t,lb_prob)
th_t = np.where(th_t_fitted < 0.1, 0.1, th_t_fitted)

test_hits = pred_t > th_t
test_hits_half = pred_t > 0.5
print('Thresholds: ',th_t)
print('Fractions: ',test_hits.mean(axis=0))
print('Fractions (th = 0.5): ',test_hits_half.mean(axis=0))

Thresholds:  [0.6435  0.47741 0.66794 0.43134 0.38736 0.51992 0.42557 0.55425 0.21507 0.19063 0.15043 0.517   0.35394
 0.48779 0.53776 0.23495 0.43207 0.50993 0.51531 0.47299 0.38989 0.58318 0.46176 0.62953 0.58883 0.62448
 0.40747 0.16856]
Fractions:  [0.36293 0.04375 0.07537 0.05691 0.07332 0.07366 0.04187 0.07469 0.00863 0.00846 0.00761 0.04307 0.04145
 0.01376 0.04324 0.00778 0.02726 0.01401 0.02692 0.05794 0.00957 0.12391 0.02863 0.0746  0.01    0.22048
 0.0282  0.00658]
Fractions (th = 0.5):  [0.50658 0.04051 0.13314 0.04025 0.04478 0.08187 0.02743 0.09374 0.0012  0.00111 0.00043 0.04598 0.02059
 0.01299 0.04862 0.00026 0.01598 0.01444 0.03187 0.04794 0.00316 0.19133 0.02282 0.121   0.01325 0.41506
 0.01564 0.00009]


In [79]:
# Validation F1 obtained with the current test thresholds `th_t`, next to the
# flat 0.5 baseline.
mask_th_t = pred > th_t
mask_half = pred > 0.5
print('Thresholds: ',th_t)
print('F1 macro: ',f1_score(y, mask_th_t, average='macro'))
print('F1 macro (th = 0.5): ',f1_score(y, mask_half, average='macro'))
print('F1 micro: ',f1_score(y, mask_th_t, average='micro'))

Thresholds:  [0.6435  0.47741 0.66794 0.43134 0.38736 0.51992 0.42557 0.55425 0.21507 0.19063 0.15043 0.517   0.35394
 0.48779 0.53776 0.23495 0.43207 0.50993 0.51531 0.47299 0.38989 0.58318 0.46176 0.62953 0.58883 0.62448
 0.40747 0.16856]
F1 macro:  0.5403567463009497
F1 macro (th = 0.5):  0.6408342696559723
F1 micro:  0.7031615925058549


In [80]:
# Write a submission with the current `th_t` (in notebook order: the
# thresholds fitted to lb_prob); the leak lookup stays at its default (off).
save_pred(pred_t,th_t,'protein_classification_f.csv')

In [81]:
# Two more submissions from the same test predictions:
# one with the validation-fitted thresholds `th` ...
save_pred(pred_t,th,'protein_classification_v.csv')
# ... and one with a flat 0.5 threshold for every class.
save_pred(pred_t,0.5,'protein_classification_05.csv')

In [82]:
# For the listed classes, replace the test threshold with the validation-fitted
# one, then write the combined submission. `th_t` is modified in place.
class_list = [8,9,10,15,20,24,27]
th_t[class_list] = th[class_list]
save_pred(pred_t,th_t,'protein_classification_c.csv')

In [83]:
# Count how often each class occurs in the training labels and turn the counts
# into per-class fractions of the number of labelled samples.
labels = pd.read_csv(utils_pytorch.LABELS).set_index('Id')
n_classes = len(utils_pytorch.name_label_dict)
# Row i of the identity matrix is the indicator vector of class i. Built once
# here instead of once per label row.
one_hot = np.eye(n_classes)
label_count = np.zeros(n_classes)
for label in labels['Target']:
    # 'Target' holds the sample's class indices separated by whitespace.
    l = [int(i) for i in label.split()]
    label_count += one_hot[l].sum(axis=0)
# np.float was removed in NumPy 1.24; np.float64 is the dtype it aliased.
label_fraction = label_count.astype(np.float64)/len(labels)
label_count, label_fraction

(array([12885.,  1254.,  3621.,  1561.,  1858.,  2513.,  1008.,  2822.,    53.,    45.,    28.,  1093.,
          688.,   537.,  1066.,    21.,   530.,   210.,   902.,  1482.,   172.,  3777.,   802.,  2965.,
          322.,  8228.,   328.,    11.]),
 array([0.41468, 0.04036, 0.11654, 0.05024, 0.0598 , 0.08088, 0.03244, 0.09082, 0.00171, 0.00145, 0.0009 ,
        0.03518, 0.02214, 0.01728, 0.03431, 0.00068, 0.01706, 0.00676, 0.02903, 0.0477 , 0.00554, 0.12156,
        0.02581, 0.09542, 0.01036, 0.2648 , 0.01056, 0.00035]))

In [84]:
# Fit test thresholds so the predicted class fractions match the training-set
# label fractions, floor them at 0.05, then write the submission.
th_t_fitted = fit_test(pred_t,label_fraction)
th_t = np.where(th_t_fitted < 0.05, 0.05, th_t_fitted)

test_hits = pred_t > th_t
print('Thresholds: ',th_t)
print('Fractions: ',test_hits.mean(axis=0))
save_pred(pred_t,th_t,'protein_classification_t.csv')

Thresholds:  [0.59119 0.50619 0.54256 0.46059 0.43176 0.50696 0.4774  0.51016 0.43689 0.46348 0.43238 0.57592 0.48358
 0.44803 0.64201 0.42251 0.50035 0.62928 0.51449 0.50632 0.44993 0.58955 0.48298 0.5663  0.57696 0.596
 0.55448 0.30047]
Fractions:  [0.41437 0.03991 0.11588 0.04871 0.05768 0.07879 0.03205 0.08973 0.00145 0.00145 0.00094 0.03435 0.02171
 0.01675 0.0341  0.0006  0.01589 0.00675 0.02709 0.04674 0.00513 0.11964 0.02538 0.09392 0.01025 0.26354
 0.01008 0.00017]


In [85]:
# Validation F1 obtained with the current test thresholds `th_t`, compared
# with the flat 0.5 baseline.
picked = pred > th_t
baseline = pred > 0.5
print('Thresholds: ',th_t)
print('F1 macro: ',f1_score(y, picked, average='macro'))
print('F1 macro (th = 0.5): ',f1_score(y, baseline, average='macro'))
print('F1 micro: ',f1_score(y, picked, average='micro'))

Thresholds:  [0.59119 0.50619 0.54256 0.46059 0.43176 0.50696 0.4774  0.51016 0.43689 0.46348 0.43238 0.57592 0.48358
 0.44803 0.64201 0.42251 0.50035 0.62928 0.51449 0.50632 0.44993 0.58955 0.48298 0.5663  0.57696 0.596
 0.55448 0.30047]
F1 macro:  0.639705943112005
F1 macro (th = 0.5):  0.6408342696559723
F1 micro:  0.7362335880854399
