In [None]:
import warnings
warnings.filterwarnings(action='once')
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import datetime
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pickle
%load_ext autoreload
%autoreload 2
%matplotlib inline
from exp.misc import *
from exp.ProcessData import *
from exp.PytorchModels import *
from exp.LearnerClass import *
import torch
import torch.nn as nn
import torch.utils.data as D
import torch.nn.functional as F
import copy
from torchvision import transforms
import PIL.Image
from sklearn.metrics import roc_auc_score
import torchvision.transforms.functional as TF
from types import MethodType


In [None]:
# --- Project configuration -------------------------------------------------
params = json_to_parameters('config.json')

# --- Cross-validation layout and seeding -----------------------------------
SEEDS = [220, 432, 8153]
num_folds = 3
# Offsets added to SEED+fold before seeding the RNGs of each stage.
add_seed = 2000        # 2020 + 2019 training stage
add_seed_orig = 3000   # 2020-only fine-tuning stage
add_seed_cv = 200      # cross-validation scoring stage
add_seed_test = 300    # test / inference stage (going by its name)

# --- Model and artefact naming ---------------------------------------------
# The '{}' placeholder in each template is filled with the seed.
model_type = 'tf_efficientnet_b3_ns'
base_name_tamplate = 'image_mlps_cut_128_seed_fullv{}'
name_tamplate = 'image_mlps_cut_128_seed_fullv{}_transformer_sample'
name_tamplate_orig = 'image_mlps_cut_128_seed_fullv{}_transformer_sample_origonly_rand23'
output_name = f'{model_type}_image_mlps_cut_128_seed_fullv_transformer_sample_origonly_rand23_predict.pth'

# --- Compute device --------------------------------------------------------
device = device_by_name('Tesla')
torch.cuda.set_device(device)

In [None]:
# Read the main training table and the extra table, then stack them into one
# frame with a fresh 0..N-1 index.
train_df, extra_df = (pd.read_csv(params.path.data + csv_name)
                      for csv_name in ('train.csv', 'isim2019.csv'))
train_df = pd.concat([train_df, extra_df], ignore_index=True, sort=False)
# prepare_df is a project helper; it is called on the combined frame and its
# return value is not assigned.
prepare_df(train_df)

In [None]:
def my_metric(y_pred,y_true):
    """Return negated ROC-AUC scores for the class-1 output.

    Args:
        y_pred: array of raw model outputs; it is flattened to
            ``(-1, y_pred.shape[-1])`` and column 1 is scored.
        y_true: array of labels, flattened to 1-D. Entries below 0 (the
            ``-1`` ignore label also used by the loss) are excluded.

    Returns:
        dict with three negated AUC values:
            'metric'  - softmax probability of column 1,
            'metric1' - sigmoid of the raw column-1 output,
            'metric2' - equal-weight blend of both, each divided by its std.
    """
    y_true=y_true.reshape(-1)
    # Build the validity mask from the RAW labels. The mask used to be computed
    # after `y_true` had been converted to booleans, so `>= 0` was always true
    # and rows labelled -1 were scored as negatives instead of being dropped.
    valid=y_true>=0
    y_pred=y_pred.reshape(-1,y_pred.shape[-1])[valid]
    y_true=y_true[valid]==1

    preds=torch.sigmoid(torch.tensor(y_pred[:,1],dtype=torch.float32)).numpy()
    preds1=F.softmax(torch.tensor(y_pred,dtype=torch.float32),-1)[:,1].numpy()
    # Put both scores on a comparable scale before averaging them.
    preds2=0.5*preds1/preds1.std()+0.5*preds/preds.std()
    return {'metric':-roc_auc_score(y_true, preds1),'metric1':-roc_auc_score(y_true, preds),
                                                                'metric2':-roc_auc_score(y_true, preds2)}

In [None]:
class MyLoss():
    """Cross-entropy over flattened predictions with label -1 ignored.

    The mean cross-entropy of the non-ignored targets is multiplied by the
    fraction of targets that are not -1 and divided by ``nom_size``.
    """
    def __init__(self,nom_size=0.5):
        self.nom_size=nom_size

    def __call__(self,y_pred,y_true):
        targets=y_true.reshape(-1)
        logits=y_pred.reshape(-1,y_pred.shape[-1])
        # Share of targets that actually contribute to the loss.
        valid_fraction=(targets!=-1).to(torch.float32).mean()
        ce=F.cross_entropy(logits,targets,ignore_index=-1)
        return ce*valid_fraction/self.nom_size


In [None]:
class MyFocalLoss():
    """Focal cross-entropy on targets >= 0, rescaled like ``MyLoss``.

    The wrapped ``FocalCrossEntropy(gamma)`` is applied only to rows whose
    target is >= 0; the result is multiplied by the fraction of targets that
    are not -1 divided by ``nom_size``.
    """
    def __init__(self,gamma=2.,nom_size=0.5):
        self.nom_size=nom_size
        self.loss=FocalCrossEntropy(gamma)

    def __call__(self,y_pred,y_true):
        targets=y_true.reshape(-1)
        logits=y_pred.reshape(-1,y_pred.shape[-1])
        # The scale is computed on the full target vector, before any filtering.
        scale=(targets!=-1).to(torch.float32).mean()/self.nom_size
        keep=targets>=0
        return self.loss(logits[keep],targets[keep])*scale


In [None]:
def evaluate(self,ds,num_workers=8,tta=1,dl_args={'shuffle':False}):
    """Score ``ds`` with ``tta`` resampled prediction passes.

    Intended to be bound to a learner with ``MethodType``. For each pass the
    dataset is reset and predicted; predictions and targets are re-ordered by
    sample index, averaged over passes (NaN-aware), and fed to the learner's
    loss and metric.

    Args:
        ds: dataset exposing ``reset()``; passed to ``self.predict``.
        num_workers: forwarded to ``self.predict``.
        tta: number of reset-and-predict passes to average.
        dl_args: forwarded to ``self.predict`` unchanged (never mutated here).

    Returns:
        ``(loss, metrics)`` where ``loss`` is the value returned by
        ``self.loss_func`` and ``metrics`` is the dict returned by
        ``self.metric`` (empty dict when no metric is set).

    Note:
        ``batch_size`` is read from the notebook's global namespace, as in the
        original cell.
    """
    predss=[]
    truess=[]
    inds=[]
    tk = notebook.tqdm(range(tta))
    for _ in tk:
        ds.reset()
        # Use the learner this method is bound to. The body used to call the
        # notebook-global `learner`, which silently depended on hidden state.
        y_pred=self.predict(ds,batch_size=batch_size,num_workers=num_workers,
                            return_inds=True,return_true=True,dl_args=dl_args,verbose=False)
        predss.append(y_pred[0])
        inds.append(y_pred[1])
        truess.append(y_pred[2])
    prd=[]
    y_true=[]
    for pred,true,ind in zip(predss,truess,inds):
        p=pred.reshape(-1,8)
        y=true.reshape(-1)
        ind=ind.reshape(-1)
        # Drop entries carrying the largest index seen in this pass
        # (presumably the padding slot - TODO confirm against the dataset).
        keep=ind<ind.max()
        # Sort the kept entries by sample index so passes line up row by row.
        order=np.argsort(ind[keep])
        prd.append(p[keep][order])
        y_true.append(y[keep][order])
    y_pred=np.nanmean(np.stack(prd,0),0)
    y_true=np.nanmean(np.stack(y_true,0),0)
    l=self.loss_func(torch.tensor(y_pred,dtype=torch.float32),torch.tensor(y_true,dtype=torch.long))
    m=dict() if self.metric is None else self.metric(y_pred,y_true)
    # Re-enable the (finished) progress bar just long enough to show the scores.
    tk.disable=False
    tk.set_postfix(loss = l.item(), **m)
    tk.disable=True
    return l,m


## Train 2020 + ISIC 2019 (`isim2019.csv`)

In [None]:
# Stage 1: train one transformer per (SEED, fold) on the combined training
# frame, using pre-computed image features loaded from pickle files.
# v is passed as the third argument to params.model_format when building names.
v = 1
# NOTE: this first num_epochs value is overwritten a few lines below.
num_epochs=24 #epoch_in_rep*reps
batch_size=64
# pos_mul and epoch_mul are not referenced anywhere else in this cell.
pos_mul=1
epoch_mul=1
# Training runs `reps` one-cycle schedules of `epoch_in_rep` epochs each.
epoch_in_rep=6
reps =3
num_epochs=epoch_in_rep*reps
# max_lr of the OneCycleLR schedule for each repetition.
reps_lr=[1e-4,0.3e-4,1e-5]


for SEED in SEEDS:
    # patients_val returned here is recomputed from dft inside the fold loop.
    val_folds, train_folds, patients_val = create_folds_extra(train_df,num_folds,SEED)
    for fold in range(num_folds): 
        dft=train_df.copy()
        # base_name identifies the feature files; name identifies the model saved by this stage.
        base_name=params.model_format.format(model_type,base_name_tamplate.format(SEED),v,fold)
        fname=params.path.features+(base_name.split('.')[0]+'.pkl')
        fname2019=params.path.features+(base_name.split('.')[0]+'_isim2019.pkl')
        name=params.model_format.format(model_type,name_tamplate.format(SEED),v,fold)

        # Rows whose patient_id is '0' get their own row index (as a string) as id
        # (presumably '0' marks rows without a real patient id - TODO confirm).
        dft.patient_id=np.where(dft.patient_id=='0',dft.index.values.astype('str'),dft.patient_id)

        # Patient-id sets per validation fold; the comprehension variable `v`
        # is local to the comprehension and does not overwrite the outer v.
        patients_val=[set(np.unique(dft.patient_id.values[v])) for v in val_folds]
        all_patients=set(np.unique(dft.patient_id.values))

        # Training patients of a fold = all patients minus that fold's validation patients.
        patients_train=[all_patients.difference(p) for p in patients_val]

        # NOTE: pickle.load executes arbitrary code; only load trusted feature files.
        with open(fname,'rb') as f:
            features0=pickle.load(f)
        with open(fname2019,'rb') as f:
            features1=pickle.load(f)
        # Join both feature arrays along axis 1 and append one all-zero slice
        # at the end of that axis (presumably the padding entry - TODO confirm).
        features=torch.tensor(np.concatenate([features0,features1,np.zeros_like(features0[:,[0]])],1),dtype=torch.float32)


        ds = PatientFeaturesDataset(features,dft,patients_train[fold],24,min_len=23)
        ds_val = PatientFeaturesDataset(features,dft,patients_val[fold],24,min_len=23)
        

        print(name)

        # Seed torch and numpy per (SEED, fold) and make cuDNN deterministic.
        torch.manual_seed(SEED+fold+add_seed)
        np.random.seed(SEED+fold+add_seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        validate_ds=ds_val
        train_ds=ds

        model = TransformerModel(256,1024,num_outputs=8,n_heads=4,n_encoders=4).to(device)

        # nom_size of the loss = dataset length divided by the length of ds.sex
        # (presumably items per image - TODO confirm).
        my_loss=MyLoss(len(ds)/ds.sex.shape[0])
        print(len(ds)/ds.sex.shape[0])

        learner = Learner(model,None,loss_func=my_loss,name=name,scheduler=None,device=device)
        learner.metric=my_metric

        learner.optimizer = torch.optim.Adam(learner.model.parameters(), lr=0.3e-4)
        # Sampling weight of each dataset item = number of its idx_array entries
        # below the global maximum index; one epoch draws sum(weights) items.
        mx=ds.idx_array.max()
        num_ids=np.array([(s<mx).sum() for s in ds.idx_array])
        learner.sampler=D.WeightedRandomSampler(num_ids,int(num_ids.sum()), replacement=True)
        # Slightly inflated epoch size used for the scheduler's steps_per_epoch
        # (presumably head-room because the sampler size is re-drawn each epoch - TODO confirm).
        epoch_size=int(1.001*num_ids.sum())
        # Batch layout assumed by the overrides below: (*inputs, inds, y, <last item>).
        def new_get_y(self,batch):
            return batch[-2]
        def new_get_inds(self,batch):
            return batch[-3]
        def new_get_x(self,batch):
            return batch[:-3] 
        # Before every epoch: reset the training set and rebuild the sampler from it.
        def on_epoch_begin(self,*args,**kargs):
            train_ds.reset()
            mx=train_ds.idx_array.max()
            num_ids=np.array([(s<mx).sum() for s in train_ds.idx_array])
            self.sampler=D.WeightedRandomSampler(num_ids, int(num_ids.sum()),replacement=True)
        # Bind the overrides to this learner instance only.
        learner.get_y=MethodType(new_get_y, learner)
        learner.get_x=MethodType(new_get_x, learner)
        learner.get_inds=MethodType(new_get_inds, learner)
        learner.evaluate=MethodType(evaluate, learner)
        learner.on_epoch_begin=MethodType(on_epoch_begin, learner)
        for t in range(reps):
            # NOTE(review): train_dl_args captures the sampler object that exists now,
            # while on_epoch_begin replaces learner.sampler each epoch. Whether fit uses
            # the refreshed one depends on Learner.fit - confirm.
            train_dl_args={'shuffle': False,'sampler':learner.sampler }
            # A fresh one-cycle schedule per repetition, each with its own max_lr.
            learner.scheduler = torch.optim.lr_scheduler.OneCycleLR(learner.optimizer, pct_start=0.01,final_div_factor= 10,
                                                                    max_lr=reps_lr[t], steps_per_epoch=epoch_size//batch_size+1, 
                                                                    epochs=epoch_in_rep)
            learner.fit(epoch_in_rep,train_ds,validate_ds,batch_size=batch_size,eval_batch=2*batch_size,
                    path=params.path.models,num_workers=12,send_log=False,train_dl_args=train_dl_args,tta=4)
        learner.save_model(params.path.models)

## Train 2020 only

In [None]:
# Stage 2: continue training each (SEED, fold) model on the rows whose
# patient_id is not '0', starting from the model saved by the previous stage.
v=1
# NOTE: this first num_epochs value is overwritten a few lines below and
# num_epochs itself is not referenced elsewhere in this cell.
num_epochs=24 #epoch_in_rep*reps
batch_size=64
# pos_mul and epoch_mul are not referenced anywhere else in this cell.
pos_mul=1
epoch_mul=1
epoch_in_rep=8
# NOTE: reps is 3 but the training loop below iterates over reps_lr (2 entries).
reps =3
num_epochs=epoch_in_rep*reps
# Learning rate and number of evaluation TTA passes for each fit() call.
reps_lr=[1e-4,3e-5]
ttas=[8,8]

# This assignment is repeated at the top of every fold iteration below.
dft=train_df[train_df.patient_id!='0'].copy()

for SEED in SEEDS:
    # Folds are computed on the full train_df, not on the filtered dft.
    val_folds, train_folds, patients_val = create_folds_extra(train_df,num_folds,SEED)
    for fold in range(num_folds): 
        dft=train_df[train_df.patient_id!='0'].copy()
        # base_name identifies the feature file; name identifies the stage-1 model to load.
        base_name=params.model_format.format(model_type,base_name_tamplate.format(SEED),v,fold)
        fname=params.path.features+(base_name.split('.')[0]+'.pkl')
        name=params.model_format.format(model_type,name_tamplate.format(SEED),v,fold)

        # NOTE(review): val_folds index positions come from the unfiltered train_df but are
        # applied to the filtered dft - assumes both share positions; TODO confirm.
        patients_val=[set(np.unique(dft.patient_id.values[v])) for v in val_folds]
        all_patients=set(np.unique(dft.patient_id.values))

        # Training patients of a fold = all patients minus that fold's validation patients.
        patients_train=[all_patients.difference(p) for p in patients_val]

        # NOTE: pickle.load executes arbitrary code; only load trusted feature files.
        with open(fname,'rb') as f:
            features0=pickle.load(f)

        # Append one all-zero slice at the end of axis 1
        # (presumably the padding entry - TODO confirm).
        features=torch.tensor(np.concatenate([features0,np.zeros_like(features0[:,[0]])],1),dtype=torch.float32)


        ds = PatientFeaturesDataset(features,dft,patients_train[fold],24,min_len=23)
        ds_val = PatientFeaturesDataset(features,dft,patients_val[fold],24,min_len=23)


        print(name)

        # Seed torch and numpy per (SEED, fold) and make cuDNN deterministic.
        torch.manual_seed(SEED+fold+add_seed_orig)
        np.random.seed(SEED+fold+add_seed_orig)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        validate_ds=ds_val
        train_ds=ds

        # epoch_size is not referenced again in this cell.
        epoch_size=len(train_ds)
        model = TransformerModel(256,1024,num_outputs=8,n_heads=4,n_encoders=4).to(device)

        my_loss=MyLoss(0.2) #MyFocalLoss(gamma=0.9,nom_size=0.2)

        learner = Learner(model,None,loss_func=my_loss,name=name,scheduler=None,device=device)
        learner.metric=my_metric
        # Load weights while the learner still carries the stage-1 name, then rename it
        # so this stage is saved under the *_origonly_rand23 name
        # (presumably load_model/save_model derive the file from learner.name - TODO confirm).
        learner.load_model(params.path.models)
        learner.name= params.model_format.format(model_type,name_tamplate_orig.format(SEED),v,fold)
        print(learner.name)

        learner.optimizer = torch.optim.Adam(learner.model.parameters(), lr=3e-5)

        # Batch layout assumed by the overrides below: (*inputs, inds, y, <last item>).
        def new_get_y(self,batch):
            return batch[-2]
        def new_get_inds(self,batch):
            return batch[-3]
        def new_get_x(self,batch):
            return batch[:-3] 
        # Bind the overrides to this learner instance only.
        learner.get_y=MethodType(new_get_y, learner)
        learner.get_x=MethodType(new_get_x, learner)
        learner.get_inds=MethodType(new_get_inds, learner)
        learner.evaluate=MethodType(evaluate, learner)

        # Before every epoch: reset the training set (no custom sampler in this stage).
        def on_epoch_begin(self,*args,**kargs):
            train_ds.reset()

        learner.on_epoch_begin=MethodType(on_epoch_begin, learner)

        # One fit() of epoch_in_rep epochs per learning rate, no LR scheduler.
        for i,lr in enumerate(reps_lr):
            learner.set_lr(lr)
            learner.fit(epoch_in_rep,train_ds,validate_ds,batch_size=batch_size,eval_batch=2*batch_size,
                         path=params.path.models,num_workers=12,send_log=False,tta=ttas[i])
        learner.save_model(params.path.models)

## Calculate CV

In [None]:

# Cross-validation scoring: reload every (SEED, fold) model saved by the
# 2020-only stage and evaluate it on its validation patients with 16 passes.
v =1
batch_size=64
# One metrics dict per (SEED, fold), with the loss added under 'loss'.
metrics=[]
# This assignment is repeated at the top of every fold iteration below.
dft=train_df[train_df.patient_id!='0'].copy()
for SEED in SEEDS:
    # Folds are computed on the full train_df, not on the filtered dft.
    val_folds, train_folds, patients_val = create_folds_extra(train_df,num_folds,SEED)
    for fold in range(num_folds): 
        dft=train_df[train_df.patient_id!='0'].copy()
        base_name=params.model_format.format(model_type,base_name_tamplate.format(SEED),v,fold)
        fname=params.path.features+(base_name.split('.')[0]+'.pkl')
        name= params.model_format.format(model_type,name_tamplate_orig.format(SEED),v,fold)

        # NOTE(review): val_folds index positions come from the unfiltered train_df but are
        # applied to the filtered dft - assumes both share positions; TODO confirm.
        patients_val=[set(np.unique(dft.patient_id.values[v])) for v in val_folds]
        all_patients=set(np.unique(dft.patient_id.values))

        # patients_train is not used further in this cell.
        patients_train=[all_patients.difference(p) for p in patients_val]

        # NOTE: pickle.load executes arbitrary code; only load trusted feature files.
        with open(fname,'rb') as f:
            features0=pickle.load(f)

        # Append one all-zero slice at the end of axis 1
        # (presumably the padding entry - TODO confirm).
        features=torch.tensor(np.concatenate([features0,np.zeros_like(features0[:,[0]])],1),dtype=torch.float32)
        # Built with 33 / min_len=32 here, whereas the training cells use 24 / min_len=23.
        ds_val = PatientFeaturesDataset(features,dft,patients_val[fold],33,min_len=32)

        print(name)
        # Seed torch and numpy per (SEED, fold); `_=` keeps the returned generator from being displayed.
        _=torch.manual_seed(SEED+fold+add_seed_cv)
        np.random.seed(SEED+fold+add_seed_cv)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        validate_ds=ds_val


        model = TransformerModel(256,1024,num_outputs=8,n_heads=4,n_encoders=4).to(device)
        my_loss=MyLoss(0.2) #MyFocalLoss(gamma=2,nom_size=0.2)
                #my_loss=my_one_loss
        learner = Learner(model,None,loss_func=my_loss,name=name,scheduler=None,device=device)
        learner.metric=my_metric
        # Load the saved state dict on CPU first, then move the model to the device.
        m=torch.load(f'{params.path.models}{name}',map_location='cpu')
        learner.model.load_state_dict(m)
        _=learner.model.to(device)

        # NOTE(review): train_ds is not defined in this cell; this hook only works if a
        # training cell ran earlier in the same kernel - confirm it is never called here.
        def on_epoch_begin(self,*args,**kargs):
            train_ds.reset()
            validate_ds.reset()
        # Batch layout assumed by the overrides below: (*inputs, inds, y, <last item>).
        def new_get_y(self,batch):
            return batch[-2]
        def new_get_inds(self,batch):
            return batch[-3]
        def new_get_x(self,batch):
            return batch[:-3] 

        # Bind the overrides to this learner instance only.
        learner.get_y=MethodType(new_get_y, learner)
        learner.get_x=MethodType(new_get_x, learner)
        learner.get_inds=MethodType(new_get_inds, learner)
        learner.evaluate=MethodType(evaluate, learner)
        learner.on_epoch_begin=MethodType(on_epoch_begin, learner)
        # `m` is reused here: from this point it holds the metrics dict, not the state dict.
        l,m=learner.evaluate(validate_ds,num_workers=8,tta=16,dl_args={'shuffle':False,'batch_size':batch_size})
        metrics.append(m)
        metrics[-1]['loss']=l.item()

# Show the last model name and the mean / std of 'metric' over all (SEED, fold) runs.
# my_metric returns negated AUC values, so these numbers are negative AUCs.
name
np.array([m['metric'] for m in metrics ]).mean()
np.array([m['metric'] for m in metrics ]).std()

## Inference

In [None]:
# Inference: for every (SEED, fold) model saved by the 2020-only stage, run
# 32 reset-and-predict passes over the test set and collect the raw
# predictions together with their sample indices.
test_df=pd.read_csv(params.path.data+'test.csv')
prepare_df(test_df)
test_df=test_df.reset_index(drop=True)
predss=[]
inds=[]
v=1
for SEED in SEEDS:
    # The returned folds are not used in this cell; the call is kept as in the
    # original in case create_folds_extra has side effects.
    val_folds, train_folds, patients_val = create_folds_extra(train_df,num_folds,SEED)
    for fold in range(num_folds):
        base_name=params.model_format.format(model_type,base_name_tamplate.format(SEED),v,fold)
        fname=params.path.features+(base_name.split('.')[0]+'_test.pkl')
        # NOTE: pickle.load executes arbitrary code; only load trusted feature files.
        with open(fname,'rb') as f:
            features=pickle.load(f)
        # Append one all-zero slice at the end of axis 1, as the training cells do.
        features=torch.tensor(np.concatenate([features,np.zeros_like(features[:,[0]])],1),dtype=torch.float32)
        name= params.model_format.format(model_type,name_tamplate_orig.format(SEED),v,fold)
        model = TransformerModel(256,1024,num_outputs=8,n_heads=4,n_encoders=4).to(device)
        test_ds= PatientFeaturesDataset(features,test_df,set(test_df.patient_id.unique()),24,min_len=23)
        print (name)
        # Seed per (SEED, fold) like every other stage does. add_seed_test was
        # defined in the config but never applied, so the resampling passes
        # below were not reproducible between runs.
        _=torch.manual_seed(SEED+fold+add_seed_test)
        np.random.seed(SEED+fold+add_seed_test)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        # The loss is required by the Learner constructor; no targets are used here.
        my_loss=MyLoss(0.01)
        learner = Learner(model,None,loss_func=my_loss,name=name,scheduler=None,device=device)
        learner.metric=my_metric
        learner.load_model(params.path.models)
        learner.init_amp()

        # Test batches: everything except the last item is model input.
        def new_get_x(self,batch):
            return batch[:-1]
        learner.get_x=MethodType(new_get_x, learner)
        # 32 passes per model; the dataset is reset before each one.
        for i in range(32):
            test_ds.reset()
            y_pred=learner.predict(test_ds,batch_size=64,num_workers=12,return_inds=True)
            predss.append(y_pred[0])
            inds.append(y_pred[1])


In [None]:
# Bring every prediction pass into test_df row order and persist the list.
n_test=test_df.shape[0]
prd=[]
for pred,ind in zip(predss,inds):
    flat_ind=ind.reshape(-1)
    # Keep only entries whose index points at a real test row.
    keep=flat_ind<n_test
    row_order=np.argsort(flat_ind[keep])
    prd.append(pred.reshape(-1,8)[keep][row_order])

with open(params.path.output+output_name,'wb') as f:
    pickle.dump(prd,f,protocol=4)


## Prepare Submission file

In [None]:
# Reload the per-pass predictions written by the inference step.
with open(params.path.output+output_name,'rb') as f:
    prd=pickle.load(f)

# NaN-aware mean over all passes, then softmax probability of output column 1.
y_pred=np.nanmean(np.stack(prd,axis=0),axis=0)
preds2=torch.softmax(torch.tensor(y_pred,dtype=torch.float32),dim=-1)[:,1].numpy()

# Quick sanity check of the score distribution.
preds2.min()
preds2.max()
preds2.std()
_=plt.hist(preds2,bins=30)

# Fill the sample submission with the test image names and the scores.
sub=pd.read_csv(params.path.data+'sample_submission.csv')
sub=sub.assign(image_name=test_df['image_name'],target=preds2)
sub.head(10)
sub.to_csv(params.path.output+'/submission116.csv',index=False)