In [1]:
# Environment setup: print the interpreter search path, apply seaborn's default
# plot styling, then add a local fastai checkout to the search path.
import sys
# Printed before the append below, so the added entry does not appear in the output.
print(sys.path)
import seaborn as sns
sns.set()
# NOTE(review): hard-coded local path; presumably required by the `fastai.*`
# imports in the next cell — confirm and make configurable.
sys.path.append("/tmp/fastai/old")

['', '/opt/conda/lib/python36.zip', '/opt/conda/lib/python3.6', '/opt/conda/lib/python3.6/lib-dynload', '/opt/conda/lib/python3.6/site-packages', '/opt/conda/lib/python3.6/site-packages/torchvision-0.2.1-py3.6.egg', '/opt/conda/lib/python3.6/site-packages/IPython/extensions', '/home/yukimiki/.ipython']


In [2]:
from fastai.conv_learner import *
from fastai.dataset import *

In [3]:
# Third-party / project imports plus global RNG seeding for the run.
from datetime import datetime
import pandas as pd
import numpy as np
# Seed NumPy's global random state.
np.random.seed(seed=32)
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import scipy.optimize as opt
import warnings
# Suppresses every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# NOTE(review): `tf` and `TensorboardLogger` are only referenced from
# commented-out code in the visible cells — confirm they are still needed.
import tensorflow as tf
#from torchvision.transforms import RandomHorizontalFlip, RandomVerticalFlip
from tensorboard_cb import TensorboardLogger
import torch
# Seed the PyTorch CPU generator and the generators of all CUDA devices.
torch.manual_seed(7)
torch.cuda.manual_seed_all(7)
import utils_pytorch

In [4]:
# Run-wide configuration read by get_data (nw) and by the learner construction (arch).
nw = 20   # number of workers, passed as num_workers to ImageData in get_data
arch = resnet34 # backbone architecture handed to ConvLearner.pretrained

In [5]:
# An image id is the first 36 characters of a filename; building a set
# collapses all filenames that share the same prefix.
train_names = list(set(fname[:36] for fname in os.listdir(utils_pytorch.TRAIN)))
test_names = list(set(fname[:36] for fname in os.listdir(utils_pytorch.TEST)))

data_info = pd.read_csv(utils_pytorch.LABELS)


def _stratify_key(target):
    """Bucket used for stratification: '0' if the target string contains '27',
    otherwise the first three characters of the target string."""
    return '0' if '27' in target else target[:3]


# 90/10 split of the label table, stratified on the bucket above.
train_df, val_df = train_test_split(
    data_info,
    test_size=0.1,
    stratify=data_info['Target'].map(_stratify_key),
    random_state=42,
)
tr_n = train_df['Id'].tolist()
val_n = val_df['Id'].tolist()

In [6]:
def trim_to_full_batches(names, bs):
    """Return the longest prefix of `names` whose length is a multiple of `bs`.

    Written with an explicit prefix length rather than a negative slice:
    `names[:-(len(names) % bs)]` turns into `names[:-0]` (an empty list) when
    the length is already an exact multiple of `bs`.
    """
    usable = len(names) - len(names) % bs
    return names[:usable]


def get_data(sz,bs,stats=None):
    """Build the fastai ImageData object for train / validation / test.

    Args:
        sz: image size forwarded to `tfms_from_stats`.
        bs: batch size; the training id list is trimmed to a whole number of
            batches.
        stats: optional (mean, std) per-channel normalisation statistics. When
            omitted, the hard-coded values below are used.

    Returns:
        The ImageData instance built from `tr_n`, `val_n` and `test_names`.
    """
    #data augmentation
    aug_tfms = [RandomRotate(45, tfm_y=TfmType.NO),
                RandomFlip(),
                RandomDihedral(tfm_y=TfmType.NO),
                RandomLighting(0.05, 0.05, tfm_y=TfmType.NO)]
    #mean and std of each channel in the train set
    #stats = A([0.08069, 0.05258, 0.05487, 0.08282], [0.13704, 0.10145, 0.15313, 0.13814])
    if stats is None:
        # values recomputed by the notebook author
        stats = A([0.0804419, 0.05262986, 0.05474701, 0.08270896], [0.13000701, 0.08796628, 0.1386317, 0.12718021])
    else:
        stats = A(stats)
    tfms = tfms_from_stats(stats, sz, crop_type=CropType.NO, tfm_y=TfmType.NO,
                aug_tfms=aug_tfms)
    # Keep only whole batches of training ids (safe when len(tr_n) % bs == 0).
    train_ids = trim_to_full_batches(tr_n, bs)
    ds = ImageData.get_ds(utils_pytorch.pdFilesDataset, (train_ids,utils_pytorch.TRAIN),
                (val_n,utils_pytorch.TRAIN), tfms, test=(test_names,utils_pytorch.TEST))
    md = ImageData(utils_pytorch.PATH, ds, bs, num_workers=nw, classes=None)
    return md

In [7]:
# Build the data object and learner, then restore previously trained weights.
sz = 512 #image size
bs = 64  #batch size

# Disabled: timestamped output directory (only referenced by the disabled TensorboardLogger line below).
# dir_name = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S')
# print(dir_name)
# dir_path = os.path.join('test', dir_name)
# Name of the saved weights passed to learner.load below.
best_model_path = '20181201025910_size512_best_resnet'


md = get_data(sz,bs)
learner = utils_pytorch.ConvLearner.pretrained(arch, md, ps=0.5) #dropout 50%
# Earlier checkpoints, kept for reference:
#pretrained_model_name = '20181124135324best_resnet' # 299
#pretrained_model_name = '20181125052614best_resnet' # 512
#learner.load(pretrained_model_name)
learner.load(best_model_path)
# NOTE(review): the learner was already constructed with `md`; presumably this
# call is redundant — confirm before removing.
learner.set_data(md)

# Disabled training configuration (this notebook only runs inference):
# learner.opt_fn = optim.Adam
# learner.clip = 1.0 #gradient clipping
# learner.crit = utils_pytorch.FocalLoss()
# learner.metrics = [utils_pytorch.acc, utils_pytorch.f1_torch]
# tb_logger = TensorboardLogger(learner.model, md, dir_path, metrics_names=['acc', 'f1'])
#save_best_model = SaveBestModel(model=learner.model)

In [8]:
def sigmoid_np(x):
    """Element-wise logistic function 1 / (1 + exp(-x)) computed with NumPy."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

# Test-time augmentation with n_aug=16 and no is_test flag (cf. the later
# is_test=True call): stack the per-pass outputs on a new trailing axis, squash
# them with the sigmoid, and keep the maximum over that axis.
tta_outputs, y = learner.TTA(n_aug=16)
preds = sigmoid_np(np.stack(tta_outputs, axis=-1))
pred = np.max(preds, axis=-1)



In [9]:
def F1_soft(preds,targs,th=0.5,d=50.0):
    """Smooth per-column F1 score used as the objective for threshold fitting.

    The hard decision `preds > th` is replaced by `sigmoid(d * (preds - th))`,
    so the score varies smoothly with `th`.

    Args:
        preds: array of predicted probabilities; columns are scored
            independently (sums run over axis 0).
        targs: array of ground-truth labels with the same shape as `preds`.
        th: scalar or per-column threshold(s).
        d: steepness of the sigmoid that approximates the step function.

    Returns:
        Array with one soft-F1 value per column.
    """
    preds = sigmoid_np(d*(np.asarray(preds) - th))
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    targs = np.asarray(targs, dtype=float)
    # 2*TP / (predicted positives + actual positives); 1e-6 guards against 0/0.
    score = 2.0*(preds*targs).sum(axis=0)/((preds+targs).sum(axis=0) + 1e-6)
    return score

def fit_val(x,y):
    """Fit one decision threshold per class by maximising the soft F1 score.

    Args:
        x: predicted probabilities, expected to be 2-D (samples, classes).
        y: ground-truth labels with the same shape as `x`.

    Returns:
        1-D array with one fitted threshold per class. The least-squares fit is
        unconstrained, so values may fall outside [0, 1] (e.g. for classes with
        no positives in `y`); callers should clip as needed.
    """
    # Class count comes from the data itself rather than from a project-level
    # label dictionary, so the function works for any number of classes.
    n_classes = np.asarray(x).shape[-1]
    params = 0.5*np.ones(n_classes)
    wd = 1e-5  # weak pull towards 0.5 that keeps the problem well-conditioned

    def residuals(p):
        # First half: gap between soft F1 and its ideal value of 1.
        # Second half: regulariser pulling each threshold towards 0.5.
        return np.concatenate((F1_soft(x,y,p) - 1.0,
                               wd*(p - 0.5)), axis=None)

    p, _ier = opt.leastsq(residuals, params)
    return p

In [10]:
# Fit per-class thresholds on the validation predictions and floor them at 0.1.
th = np.maximum(fit_val(pred, y), 0.1)
val_hard = pred > th
print('Thresholds: ',th)
print('F1 macro: ',f1_score(y, val_hard, average='macro'))
print('F1 macro (th = 0.5): ',f1_score(y, pred>0.5, average='macro'))
print('F1 micro: ',f1_score(y, val_hard, average='micro'))

Thresholds:  [0.55378 0.57474 0.57471 0.59127 0.58363 0.63822 0.55454 0.64155 0.63851 0.6012  0.63056 0.60766 0.61747
 0.64713 0.58832 2.25547 0.5129  0.6432  0.53469 0.55045 0.54323 0.54961 0.56721 0.66442 0.63134 0.52532
 0.65931 0.33687]
F1 macro:  0.736923043442027
F1 macro (th = 0.5):  0.686862048625337
F1 micro:  0.7962816455696202


In [11]:
# Fraction of validation samples predicted positive per class at the fitted thresholds.
print('Fractions: ',(pred > th).mean(axis=0))
# Ground-truth positive rate per class. The labels are binarised at 0.5, not at
# the fitted thresholds: `th` can exceed 1 (the fit is unconstrained), which
# would wrongly report a true fraction of 0 for that class.
print('Fractions (true): ',(y > 0.5).mean(axis=0))

Fractions:  [0.45045 0.03861 0.1139  0.0444  0.05405 0.06017 0.02671 0.07851 0.00129 0.00129 0.00129 0.03378 0.01834
 0.01062 0.03571 0.      0.01319 0.00579 0.02606 0.04537 0.00225 0.12838 0.02027 0.08301 0.01094 0.31145
 0.00804 0.00032]
Fractions (true):  [0.42535 0.03861 0.11776 0.04923 0.05856 0.07497 0.03185 0.08784 0.00129 0.00161 0.00097 0.03539 0.0222
 0.01544 0.03636 0.      0.01673 0.00804 0.03024 0.04698 0.00547 0.12323 0.02574 0.09299 0.0103  0.26062
 0.01062 0.00032]


In [12]:
# Rebuild the data object with a different set of normalisation statistics and
# attach it to the learner. Row 0 is passed as the first (mean) argument of
# A(...) in get_data, row 1 as the second (std).
channel_mean = [0.05908022, 0.04532852, 0.04065233, 0.05923426]
channel_std = [0.10371015, 0.07984633, 0.10664798, 0.09878183]
stats = np.array([channel_mean, channel_std])
md = get_data(sz, bs, stats)
learner.set_data(md)

In [13]:
# Same TTA pipeline as for validation, run on the test set: stack the 16
# augmented passes, apply the sigmoid, then reduce with max
# (max works better for F1 macro score).
tta_outputs_t, y_t = learner.TTA(n_aug=16, is_test=True)
preds_t = sigmoid_np(np.stack(tta_outputs_t, axis=-1))
pred_t = np.max(preds_t, axis=-1)



In [14]:
def save_pred(pred, th=0.5, fname='protein_classification.csv'):
    """Write a submission CSV with the class indices predicted for each test image.

    Args:
        pred: iterable of per-sample score vectors, in the same order as
            `learner.data.test_ds.fnames`.
        th: scalar or per-class threshold; a class is emitted when its score
            is strictly greater than the threshold.
        fname: path of the CSV file to write.

    The rows follow the `Id` order of the sample-submission file at
    `utils_pytorch.SAMPLE`; each `Predicted` cell is a space-separated list of
    class indices.
    """
    label_strings = [
        ' '.join(str(class_idx) for class_idx in np.nonzero(scores > th)[0])
        for scores in pred
    ]

    sample_ids = list(pd.read_csv(utils_pytorch.SAMPLE).Id)
    # Map test file name -> label string so rows can be emitted in sample order.
    by_name = dict(zip(learner.data.test_ds.fnames, label_strings))
    ordered = [by_name[sample_id] for sample_id in sample_ids]

    submission = pd.DataFrame({'Id': sample_ids, 'Predicted': ordered})
    submission.to_csv(fname, header=True, index=False)

In [15]:
# Hand-specified per-class thresholds (28 values, one per class index).
th_t = np.array([
    0.565, 0.39, 0.55, 0.345, 0.33, 0.39, 0.33,
    0.45, 0.38, 0.39, 0.34, 0.42, 0.31, 0.38,
    0.49, 0.50, 0.38, 0.43, 0.46, 0.40, 0.39,
    0.505, 0.37, 0.47, 0.41, 0.545, 0.32, 0.1,
])
test_hard = pred_t > th_t
print('Fractions: ',test_hard.mean(axis=0))
save_pred(pred_t, th_t)

Fractions:  [0.43634 0.05008 0.10614 0.08016 0.08281 0.14878 0.07178 0.11246 0.00231 0.00171 0.00145 0.05862 0.05162
 0.02581 0.04717 0.00009 0.0582  0.02829 0.04222 0.09041 0.01119 0.18817 0.05486 0.12408 0.01777 0.34644
 0.04256 0.01846]


In [16]:
# Score the current test thresholds (`th_t`) on the validation predictions.
val_hard_t = pred > th_t
print('Thresholds: ',th_t)
print('F1 macro: ',f1_score(y, val_hard_t, average='macro'))
print('F1 macro (th = 0.5): ',f1_score(y, pred>0.5, average='macro'))
print('F1 micro: ',f1_score(y, val_hard_t, average='micro'))

Thresholds:  [0.565 0.39  0.55  0.345 0.33  0.39  0.33  0.45  0.38  0.39  0.34  0.42  0.31  0.38  0.49  0.5   0.38  0.43
 0.46  0.4   0.39  0.505 0.37  0.47  0.41  0.545 0.32  0.1  ]
F1 macro:  0.6031190285205824
F1 macro (th = 0.5):  0.686862048625337
F1 micro:  0.7147127200265692


In [17]:
# Target per-class positive fractions for the test set, one entry per class
# index; passed to fit_test as the fractions the thresholded predictions
# should reproduce.
# NOTE(review): presumably estimated from leaderboard feedback ("lb") — confirm the source.
lb_prob = [
 0.362397820,0.043841336,0.075268817,0.059322034,0.075268817,
 0.075268817,0.043841336,0.075268817,0.010000000,0.010000000,
 0.010000000,0.043841336,0.043841336,0.014198783,0.043841336,
 0.010000000,0.028806584,0.014198783,0.028806584,0.059322034,
 0.010000000,0.126126126,0.028806584,0.075268817,0.010000000,
 0.222493880,0.028806584,0.010000000]
# Zeros were replaced by 0.01 because an estimate of exactly 0 may only be a rounding artefact.

In [18]:
def Count_soft(preds,th=0.5,d=50.0):
    """Smooth per-column fraction of entries above `th`.

    The hard test `preds > th` is replaced by `sigmoid(d * (preds - th))`
    and averaged over axis 0.
    """
    soft_hits = sigmoid_np(d*(preds - th))
    return np.mean(soft_hits, axis=0)

def fit_test(x,y):
    """Fit per-class thresholds so the predicted positive rate matches `y`.

    Args:
        x: predicted probabilities, expected to be 2-D (samples, classes).
        y: target positive fraction per class (list or array, one per class).

    Returns:
        1-D array with one fitted threshold per class. The least-squares fit is
        unconstrained, so values may fall outside [0, 1]; callers should clip
        as needed.
    """
    # Class count comes from the data itself rather than from a project-level
    # label dictionary, so the function works for any number of classes.
    n_classes = np.asarray(x).shape[-1]
    target = np.asarray(y, dtype=float)  # accept plain Python lists
    params = 0.5*np.ones(n_classes)
    wd = 1e-5  # weak pull towards 0.5 that keeps the problem well-conditioned

    def residuals(p):
        # First half: gap between soft positive rate and the target fraction.
        # Second half: regulariser pulling each threshold towards 0.5.
        return np.concatenate((Count_soft(x,p) - target,
                               wd*(p - 0.5)), axis=None)

    p, _ier = opt.leastsq(residuals, params)
    return p

In [19]:
# Fit test thresholds against the `lb_prob` target fractions, floored at 0.1.
th_t = np.maximum(fit_test(pred_t, lb_prob), 0.1)
print('Thresholds: ',th_t)
print('Fractions: ',np.mean(pred_t > th_t, axis=0))
print('Fractions (th = 0.5): ',np.mean(pred_t > 0.5, axis=0))

Thresholds:  [0.64425 0.46003 0.66883 0.41708 0.35597 0.54567 0.4122  0.56877 0.20519 0.17657 0.13772 0.50965 0.3439
 0.52351 0.53443 0.1951  0.50234 0.57111 0.52003 0.49794 0.40414 0.60768 0.50337 0.63289 0.58591 0.63755
 0.38292 0.14437]
Fractions:  [0.36344 0.0435  0.07512 0.0582  0.07272 0.07443 0.04213 0.07409 0.00872 0.00837 0.00684 0.04367 0.0429
 0.01419 0.04384 0.0088  0.0276  0.01393 0.02777 0.05837 0.00983 0.12647 0.02837 0.07469 0.01008 0.22116
 0.02769 0.00521]
Fractions (th = 0.5):  [0.49539 0.04102 0.12485 0.04298 0.04649 0.0899  0.02957 0.09503 0.00128 0.00085 0.00051 0.04504 0.02213
 0.01478 0.04657 0.00009 0.02803 0.01957 0.03196 0.0576  0.0047  0.19159 0.02871 0.11348 0.01325 0.40446
 0.01513 0.     ]


In [20]:
# Validation F1 obtained with the thresholds fitted to the `lb_prob` fractions.
val_hard_lb = pred > th_t
macro_lb = f1_score(y, val_hard_lb, average='macro')
macro_half = f1_score(y, pred>0.5, average='macro')
micro_lb = f1_score(y, val_hard_lb, average='micro')
print('Thresholds: ',th_t)
print('F1 macro: ',macro_lb)
print('F1 macro (th = 0.5): ',macro_half)
print('F1 micro: ',micro_lb)

Thresholds:  [0.64425 0.46003 0.66883 0.41708 0.35597 0.54567 0.4122  0.56877 0.20519 0.17657 0.13772 0.50965 0.3439
 0.52351 0.53443 0.1951  0.50234 0.57111 0.52003 0.49794 0.40414 0.60768 0.50337 0.63289 0.58591 0.63755
 0.38292 0.14437]
F1 macro:  0.5904756986410729
F1 macro (th = 0.5):  0.686862048625337
F1 micro:  0.7422274695001969


In [21]:
# Submission using the thresholds currently held in `th_t`.
save_pred(pred_t, th=th_t, fname='protein_classification_f.csv')

In [22]:
# Two more submissions: validation-fitted thresholds (`th`) and a flat 0.5.
for thresholds, out_name in ((th, 'protein_classification_v.csv'),
                             (0.5, 'protein_classification_05.csv')):
    save_pred(pred_t, thresholds, out_name)

In [23]:
# Classes whose threshold is taken from the validation fit (`th`) instead of
# the current test thresholds (`th_t`).
class_list = [8,9,10,15,20,24,27]
# Build the combined thresholds on a copy: overwriting `th_t` in place made
# this cell non-idempotent and silently changed what the reporting cells
# showed when they were re-run afterwards.
th_c = th_t.copy()
th_c[class_list] = th[class_list]
save_pred(pred_t,th_c,'protein_classification_c.csv')

In [24]:
# Count how often each class index occurs in the label file and convert the
# counts to fractions of all labelled samples.
n_classes = len(utils_pytorch.name_label_dict)
labels = pd.read_csv(utils_pytorch.LABELS).set_index('Id')
label_count = np.zeros(n_classes)
for target in labels['Target']:
    class_ids = [int(tok) for tok in target.split()]
    # Unbuffered add: repeated indices are each counted, and no identity
    # matrix has to be rebuilt for every row.
    np.add.at(label_count, class_ids, 1.0)
# `label_count` is already float64; the removed `np.float` alias is not needed.
label_fraction = label_count/len(labels)
label_count, label_fraction

(array([12885.,  1254.,  3621.,  1561.,  1858.,  2513.,  1008.,  2822.,    53.,    45.,    28.,  1093.,
          688.,   537.,  1066.,    21.,   530.,   210.,   902.,  1482.,   172.,  3777.,   802.,  2965.,
          322.,  8228.,   328.,    11.]),
 array([0.41468, 0.04036, 0.11654, 0.05024, 0.0598 , 0.08088, 0.03244, 0.09082, 0.00171, 0.00145, 0.0009 ,
        0.03518, 0.02214, 0.01728, 0.03431, 0.00068, 0.01706, 0.00676, 0.02903, 0.0477 , 0.00554, 0.12156,
        0.02581, 0.09542, 0.01036, 0.2648 , 0.01056, 0.00035]))

In [25]:
# Fit test thresholds so the predicted class fractions match `label_fraction`,
# floor them at 0.05, then report and save.
th_t = np.maximum(fit_test(pred_t, label_fraction), 0.05)
test_hard_t = pred_t > th_t
print('Thresholds: ',th_t)
print('Fractions: ',test_hard_t.mean(axis=0))
save_pred(pred_t, th_t, fname='protein_classification_t.csv')

Thresholds:  [0.58835 0.51234 0.52186 0.46062 0.41536 0.52789 0.4769  0.51395 0.43506 0.43711 0.41656 0.57684 0.50098
 0.47007 0.66362 0.35646 0.59474 0.7181  0.51889 0.54696 0.48079 0.61569 0.52773 0.55849 0.57402 0.60503
 0.56819 0.29643]
Fractions:  [0.41531 0.04068 0.11554 0.05042 0.05879 0.0799  0.0323  0.09067 0.00162 0.00154 0.00094 0.03478 0.02196
 0.01675 0.0341  0.0006  0.01658 0.00649 0.02803 0.04734 0.0053  0.12143 0.02521 0.09494 0.01025 0.26474
 0.01077 0.00034]


In [26]:
# Validation F1 obtained with the thresholds fitted to `label_fraction`.
val_hard_frac = pred > th_t
print('Thresholds: ',th_t)
for caption, decisions, mode in (
        ('F1 macro: ', val_hard_frac, 'macro'),
        ('F1 macro (th = 0.5): ', pred>0.5, 'macro'),
        ('F1 micro: ', val_hard_frac, 'micro')):
    print(caption,f1_score(y, decisions, average=mode))

Thresholds:  [0.58835 0.51234 0.52186 0.46062 0.41536 0.52789 0.4769  0.51395 0.43506 0.43711 0.41656 0.57684 0.50098
 0.47007 0.66362 0.35646 0.59474 0.7181  0.51889 0.54696 0.48079 0.61569 0.52773 0.55849 0.57402 0.60503
 0.56819 0.29643]
F1 macro:  0.7082964931957392
F1 macro (th = 0.5):  0.686862048625337
F1 micro:  0.7760869565217391
