In [None]:
import warnings
warnings.filterwarnings(action='once')
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import datetime
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pickle
%load_ext autoreload
%autoreload 2
%matplotlib inline
from exp.misc import *
from exp.ProcessData import *
from exp.PytorchModels import *
from exp.LearnerClass import *
import torch
import torch.nn as nn
import torch.utils.data as D
import torch.nn.functional as F
import copy
from torchvision import transforms
import PIL.Image
from sklearn.metrics import roc_auc_score
import torchvision.transforms.functional as TF
from types import MethodType
import sandesh

In [None]:
# Project configuration object loaded from config.json; later cells read
# params.path.* and params.model_format from it.
params = json_to_parameters('config.json')

# Cross-validation layout: each seed in SEEDS is split into `num_folds` folds.
num_folds = 3
SEEDS = [220, 432, 8153]
# Constant offset added to SEED + fold when torch / numpy are seeded.
add_seed = 214456

# Backbone identifier plus the naming patterns for checkpoints and outputs.
model_type = 'tf_efficientnet_b5_ns'
# NOTE: the identifier spelling ("tamplate") is kept as-is because the training
# and inference cells reference this exact name.
name_tamplate = 'image_mlps_cut_128_seed_fullv{}'
output_name = f'{model_type}_image_mlps_cut_128_seed_fullv_predict.pth'

# NOTE(review): presumably selects the CUDA device whose name contains
# 'Tesla' - confirm against exp.misc.device_by_name.
device = device_by_name('Tesla')

torch.cuda.set_device(device)

In [None]:
# Read the main training table and the additional 'isim2019.csv' table, then
# stack them row-wise under a fresh 0..n-1 index (columns are not re-sorted).
train_df = pd.read_csv(params.path.data + 'train.csv')
extra_df = pd.read_csv(params.path.data + 'isim2019.csv')
train_df = pd.concat(
    [train_df, extra_df],
    ignore_index=True,
    sort=False,
)
# The return value is not assigned, so prepare_df is presumably relied on to
# modify train_df in place - TODO confirm in exp.ProcessData.
prepare_df(train_df)

In [None]:
def my_metric(y_pred,y_true):
    """Score predictions with the negated ROC AUC.

    ``y_pred`` is converted to a float32 tensor and soft-maxed over its last
    axis; column 1 of the result is scored against the last column of
    ``y_true``.

    Returns:
        dict: single key ``'metric'`` holding ``-roc_auc_score(...)``.
    """
    logits=torch.tensor(y_pred,dtype=torch.float32)
    class_probs=F.softmax(logits,dim=-1)
    positive_scores=class_probs[:,1].numpy()
    auc=roc_auc_score(y_true[:,-1],positive_scores)
    return {'metric':-auc}

In [None]:
class MyLoss():
    """Blend of a binary BCE-with-logits term and a class-weighted cross-entropy term.

    Args:
        weight: Mixing coefficient. The BCE term is scaled by ``weight`` and the
            cross-entropy term by ``1 - weight``.
        mweight: Class weight given to class index 1 in the cross-entropy term;
            every other class keeps weight 1.
    """
    def __init__(self,weight,mweight=1.):
        self.weight=weight
        self.mweight=mweight

    def __call__(self,y_pred,y_true):
        """Return the combined scalar loss.

        Args:
            y_pred: Logits indexed as ``[sample, class]`` (at least two classes).
                Column 1 is also used as the logit of the binary term.
            y_true: Integer targets indexed as ``[sample, 2]``: column 0 is the
                class index for the cross-entropy term, column 1 the binary target.
        """
        # Per-class weights for cross-entropy: all ones except class 1.
        # (The weights are deliberately not renormalised to mean 1.)
        class_w=torch.ones_like(y_pred[0])
        class_w[1]=self.mweight
        # Neutral BCE weight. It is created on the logits' own device instead of
        # the notebook-global `device`, so the loss works wherever y_pred lives.
        unit_w=torch.tensor(1.,device=y_pred.device)
        bce=F.binary_cross_entropy_with_logits(y_pred[:,1],y_true[:,1].to(torch.float32),weight=unit_w)
        ce=F.cross_entropy(y_pred,y_true[:,0],weight=class_w)
        return self.weight*bce+(1-self.weight)*ce


In [None]:
# Training-time augmentation pipeline. The steps listed before ToTensor() run
# on the loaded image object, the two cutout steps run on the resulting tensor.
# The inference cell also passes this pipeline to the test dataset.
transform = transforms.Compose([
    HairTransform(1.0),
    transforms.RandomRotation(45),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.3, hue=0.1),
    transforms.RandomResizedCrop((400, 600), scale=(0.7, 1.1)),
    transforms.ToTensor(),
    CutoutTransform(0.5, 24, fill_const=False),
    CutoutTransform(0.5, 16, fill_const=False),
])

# Validation pipeline: tensor conversion only, no random augmentation.
transform_val = transforms.Compose([
    transforms.ToTensor(),
])


## Train

In [None]:
# Training schedule: `reps` consecutive one-cycle LR schedules of `epoch_in_rep`
# epochs each; cycle t peaks at reps_lr[t].
epoch_in_rep=7
reps =3
num_epochs=epoch_in_rep*reps
batch_size=24
accumulation_steps=1
# Peak learning rates per cycle, scaled linearly with batch_size relative to 24.
reps_lr=[3e-4*batch_size/24,1e-4*batch_size/24,0.3e-4*batch_size/24]
# pos_mul: sampling weight of rows with target==1 (other rows have weight 1);
# it also divides the number of samples drawn per epoch. With pos_mul=1 the
# sampling is uniform.
pos_mul=1
# epoch_mul: multiplier on the number of samples drawn per epoch.
epoch_mul=1
# One model is trained per (SEED, fold) pair.
for SEED in SEEDS:
    # Fold index lists are rebuilt for every seed; patients_val is not used in this cell.
    val_folds, train_folds, patients_val = create_folds_extra(train_df,num_folds,SEED)
    for fold in range(num_folds): 
        # Seed torch and numpy before datasets, sampler and model are created.
        torch.manual_seed(SEED+fold+add_seed)
        np.random.seed(SEED+fold+add_seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        # Validation subset uses the tensor-only pipeline, training subset the
        # augmenting pipeline (plus meta_aug=0.1, whose meaning is defined by
        # ImageDataset - not visible here).
        validate_ds=D.Subset(ImageDataset(params.path.train_jpg_small,
                                          train_df,transform=transform_val,return_years=False),val_folds[fold])
        train_ds=D.Subset(ImageDataset(params.path.train_jpg_small,
                                       train_df,transform=transform,meta_aug=0.1,return_years=False),train_folds[fold])#model_type='efficientnet_b0'
        # Per-sample sampling weights: 1 + target*(pos_mul-1).
        sample_weights=np.ones(len(train_ds))
        sample_weights=sample_weights+train_df.target.values[train_folds[fold]]*(pos_mul-1)
        # Samples drawn per epoch: epoch_mul*len(train_ds)/pos_mul, rounded down
        # to a whole number of effective batches (batch_size*accumulation_steps).
        epoch_size=(int(epoch_mul*len(train_ds)//pos_mul)//(batch_size*accumulation_steps))*(batch_size*accumulation_steps)
        sampler=D.WeightedRandomSampler(sample_weights, epoch_size, replacement=True)
        model = get_model(model_type,8,extra=[8,8,[7,32]],mid_extra=128,mlps=[1024,256],dropout=0.5,bn=True).to(device)
        # Checkpoint name is built from the model type, the seed-specific template, 1 and the fold.
        name=params.model_format.format(model_type,name_tamplate.format(SEED),1,fold)
        print(name)
        # Loss mix: 0.01 * BCE term + 0.99 * cross-entropy term (see MyLoss).
        my_loss=MyLoss(0.01)
        #my_loss=my_one_loss
        learner = Learner(model,None,loss_func=my_loss,name=name,scheduler=None,device=device)
        # my_metric returns the negated ROC AUC.
        # NOTE(review): presumably Learner treats lower metric values as better - confirm.
        learner.metric=my_metric
        # The schedulers created below drive the learning rate during training.
        learner.optimizer = torch.optim.Adam(learner.model.parameters(), lr=1e-4)
        # NOTE(review): presumably enables mixed-precision training - confirm in exp.LearnerClass.
        learner.init_amp()
        # Batch layout override: the last two batch items are the targets
        # (stacked along dim 1), everything before them is the model input.
        def new_get_y(self,batch):
            return torch.stack(batch[-2:],1)
        def new_get_x(self,batch):
            return batch[:-2] 
        learner.get_y=MethodType(new_get_y, learner)
        learner.get_x=MethodType(new_get_x, learner)

        # shuffle must be False because a sampler is supplied to the DataLoader.
        train_dl_args={'shuffle': False,'sampler':sampler }
        for t in range(reps):
            # A fresh one-cycle schedule per cycle on the same optimizer.
            # steps_per_epoch carries one extra step beyond epoch_size/effective batch.
            # NOTE(review): presumably slack so the scheduler is never stepped
            # past its total step count - confirm against Learner.fit.
            learner.scheduler = torch.optim.lr_scheduler.OneCycleLR(learner.optimizer, pct_start=0.01,final_div_factor= 10,
                                                                    max_lr=reps_lr[t], 
                                                                    steps_per_epoch=epoch_size//(batch_size*accumulation_steps)+1, 
                                                                    epochs=num_epochs//reps)

            learner.fit(num_epochs//reps,train_ds,
                        validate_ds,
                        batch_size=batch_size,
                        accumulation_steps=accumulation_steps,
                        eval_batch=2*batch_size,
                        path=params.path.models,
                        train_dl_args=train_dl_args,
                        num_workers=12)
        # Report the fold result through sandesh and stdout, then save the
        # model as it stands after the last cycle.
        sandesh.send({'name':learner.name,'best_metric':learner.best_metric})
        print(learner.name,' best metric:',learner.best_metric)
        learner.save_model(params.path.models)

## Inference

In [None]:
def new_get_x(self,batch):
    """Batch-to-input override for prediction: the whole batch is the model input."""
    return batch


# Test table, prepared with the same helper as the training table and re-indexed 0..n-1.
test_df = pd.read_csv(params.path.data + 'test.csv')
prepare_df(test_df)
test_df = test_df.reset_index(drop=True)

# The random training pipeline is passed here, so each of the 16 prediction
# passes below sees differently augmented images (test-time augmentation).
test_ds = ImageDataset(params.path.test_jpg_small, test_df, transform=transform)

# predss collects element 0 of every prediction pass across all seeds/folds.
predss = []
for SEED in SEEDS:
    for fold in range(num_folds):
        fold_seed = SEED + fold + add_seed
        torch.manual_seed(fold_seed)
        np.random.seed(fold_seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        # Same architecture arguments as in training except dropout=0.4 and
        # return_features=True.
        model = get_model(
            model_type, 8,
            extra=[8, 8, [7, 32]],
            mid_extra=128,
            mlps=[1024, 256],
            dropout=0.4,
            bn=True,
            return_features=True,
        ).to(device)
        name = params.model_format.format(model_type, name_tamplate.format(SEED), 1, fold)
        print(name)

        my_loss = MyLoss(0.01)
        learner = Learner(model, None, loss_func=my_loss, name=name, scheduler=None, device=device)
        learner.metric = my_metric
        learner.load_model(params.path.models)
        learner.init_amp()
        learner.get_x = MethodType(new_get_x, learner)

        # featuress collects element 1 of every pass for this model only.
        featuress = []
        for tta_round in range(16):
            tta_out = learner.predict(test_ds, batch_size=64, num_workers=16)
            predss.append(tta_out[0])
            featuress.append(tta_out[1])

        # Per-model feature dump: one stacked array over the 16 passes.
        features_file = params.path.features + (name.split('.')[0] + '_test.pkl')
        with open(features_file, 'wb') as f:
            pickle.dump(np.stack(featuress, 0), f, protocol=4)

# All collected predictions go into a single pickle.
with open(params.path.output + output_name, 'wb') as f:
    pickle.dump(predss, f, protocol=4)


## Prepare Submission file

In [None]:
# Average the raw outputs over every stored prediction pass (NaNs ignored),
# then soft-max over the last axis and keep column 1 as the submitted score.
y_pred = np.nanmean(np.stack(predss, axis=0), axis=0)
preds2 = (
    F.softmax(torch.tensor(y_pred, dtype=torch.float32), dim=-1)[:, 1]
    .numpy()
)

# Quick sanity checks on the score distribution.
preds2.min()
preds2.max()
preds2.std()
_ = plt.hist(preds2, bins=30)

# Use the sample submission as the frame, overwriting both columns.
sub = pd.read_csv(params.path.data + 'sample_submission.csv')
sub['image_name'] = test_df['image_name']
sub['target'] = preds2
sub.head(10)
# NOTE(review): a '/' is inserted after params.path.output here, while the
# prediction pickle is written to params.path.output+output_name without one -
# confirm which form matches the configured path.
sub.to_csv(params.path.output + '/submission102.csv', index=False)