Mainly following https://github.com/ngessert/isic2019/blob/master/models.py as a reference.

# start

In [1]:
# ---- Run configuration -------------------------------------------------
PRFX = 'Mel0626A4'            # prefix identifying this run
PRFX_PREP = 'MelPrp0626A2'    # preprocessing run whose ../output/ folder is read
ARCH = 'efficientnet-b0'      # backbone name given to EfficientNet.from_pretrained
SZ = 128                      # crop size passed to the dataset transform
BS = 8                        # training batch size (validation uses twice this)
SEED = 101                    # seed handed to set_seed()
FLD2USE = 0                   # which fold of the K-fold split to train on
FP16 = True                   # take the apex amp branches in the training loop
DEBUG = True

# DEBUG swaps in a tiny configuration: one epoch, two folds, 100 rows.
EPOCHS, K, N_SAMPL = (1, 2, 100) if DEBUG else (10, 5, int(1e3))

DEVICE = 'cpu'
# Pinned host memory is only requested when running on CUDA.
PIN_MEM = DEVICE == 'cuda'

# setup

In [23]:
from pathlib import Path
import pickle
import pandas as pd
import numpy as np
import random
import os, sys, gc
import datetime
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import GroupKFold

# https://github.com/eriklindernoren/PyTorch-YOLOv3/issues/162#issuecomment-491115265
from PIL import ImageFile; ImageFile.LOAD_TRUNCATED_IMAGES = True

def dtnow():
    """Return the current time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return f'{datetime.datetime.now():%Y-%m-%d %H:%M:%S}'

import torch
device=torch.device(DEVICE)
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

from efficientnet_pytorch import EfficientNet

from apex import amp

def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``, elementwise.

    The value is computed as ``exp(-log(1 + exp(-x)))`` through
    ``np.logaddexp`` so that large-magnitude negative inputs do not overflow
    ``exp`` (the naive formula emits an overflow RuntimeWarning there).

    Args:
        x: scalar or array-like of real numbers (typically raw logits).

    Returns:
        NumPy scalar or array of the same shape with values in [0, 1].
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)),
    # evaluated without ever forming exp(-x) for huge -x.
    return np.exp(-np.logaddexp(0, np.negative(x)))

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and ask cuDNN to be deterministic.

    Args:
        seed: integer seed applied to every generator.

    Side effects:
        Sets ``PYTHONHASHSEED`` in ``os.environ`` and flips the global
        ``torch.backends.cudnn`` flags.
    """
    random.seed(seed)
    # NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
    # presumably only influences processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible CUDA device, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different convolution algorithms from run to run.
    torch.backends.cudnn.benchmark = False
set_seed(SEED)



# p_out=f'../output/{PRFX}'; Path(p_out).mkdir(exist_ok=True,parents=True)
# p_cmp = '../input/siim-isic-melanoma-classification'



# prep

In [3]:
# train,test,sample_submission = (pd.read_csv(o, nrows=100 if DEBUG else None) 
#                                 for o in [f'{p_cmp}/{o}.csv' 
#                                           for o in ('train', 'test', 'sample_submission')])
# print([o.shape for o in (train,test,sample_submission)])
# display(test.head(2))
# p_19 = '../input/andrewmvd--isic-2019'

# Read the training table written by the preprocessing run PRFX_PREP.
p_prp = f'../output/{PRFX_PREP}'
dftrn = pd.read_csv(f'{p_prp}/train_all.csv')
print(dftrn.shape)

# Re-seed right before shuffling: sample() is given no random_state, so it
# presumably draws from the global NumPy RNG that set_seed() seeds.
set_seed(SEED)
dftrn = dftrn.sample(frac=1.)

# Optionally keep only the first N_SAMPL rows of the shuffled table.
if N_SAMPL is not None:
    dftrn = dftrn.head(N_SAMPL)

# Sanity checks: a peek at the rows, the size, and the per-source row counts
# and mean target.
display(dftrn.head(2))
display(dftrn.shape)
source_counts = dftrn.source.value_counts()
display(source_counts)
target_rate_by_source = dftrn.groupby('source').target.mean()
display(target_rate_by_source)

(58457, 4)


Unnamed: 0,source,im_pth,patient_id,target
24437,20,../input/siim-isic-melanoma-classification/jpe...,IP_4021847,0.0
57432,19,../input/andrewmvd--isic-2019/ISIC_2019_Traini...,BCN_0004730,1.0


(100, 4)

20    64
19    36
Name: source, dtype: int64

source
19    0.166667
20    0.000000
Name: target, dtype: float64

## folds

In [4]:
# Positional indices of rows that have no patient id.
idx_nopid = np.where(dftrn.patient_id.isna())[0]
print(len(idx_nopid))

# Give each such row its own synthetic id so it forms a singleton group in the
# grouped K-fold split. The write goes through one .iloc call on the frame
# itself: the chained form `dftrn['patient_id'].iloc[...] = ...` can end up
# writing to a temporary copy and leave dftrn unchanged.
if len(idx_nopid):
    dftrn.iloc[idx_nopid, dftrn.columns.get_loc('patient_id')] = [
        f'Nan_{i}' for i in range(len(idx_nopid))
    ]
assert not dftrn.patient_id.isna().any()

4


In [5]:
set_seed(SEED)
# Group-aware K-fold keyed on patient_id; maps fold number -> (train positions,
# validation positions) into dftrn.
kf = GroupKFold(n_splits=K)
fld2trvl = dict(enumerate(kf.split(dftrn, groups=dftrn.patient_id)))

# Fold sizes, plus a check that no patient is on both sides of a split.
for fld, (tr, vl) in fld2trvl.items():
    print(fld, len(tr), len(vl))
    dftr, dfvl = dftrn.iloc[tr], dftrn.iloc[vl]
    assert set(dftr.patient_id).isdisjoint(dfvl.patient_id)

# Share of each target value in every training split.
for tr, vl in fld2trvl.values():
    dftr = dftrn.iloc[tr]
    display(dftr.target.value_counts() / len(tr))

# Row count per source in every training split.
for tr, vl in fld2trvl.values():
    dftr = dftrn.iloc[tr]
    display(dftr.source.value_counts())


0 50 50
1 50 50


0.0    0.92
1.0    0.08
Name: target, dtype: float64

0.0    0.96
1.0    0.04
Name: target, dtype: float64

20    30
19    20
Name: source, dtype: int64

20    34
19    16
Name: source, dtype: int64

## load images

In [6]:
%%time
# Open one randomly sampled training image; the surrounding %%time cell magic
# reports how long this single Image.open call takes.
im_pil = Image.open(dftrn.im_pth.sample().values[0])

CPU times: user 11.1 ms, sys: 0 ns, total: 11.1 ms
Wall time: 10.9 ms


In [7]:
# One PIL image handle per row, in the same order as dftrn.
# NOTE(review): Image.open is documented as lazy (pixel data is read on first
# use and the file stays open until then) — confirm this really preloads, and
# watch the open-file limit when N_SAMPL is large.
ims_pil = [Image.open(pth) for pth in tqdm(dftrn.im_pth)]
### faster to preload images
# i = np.random.choice(range(len(dftrn)))
# %%timeit 
# im_pil = Image.open(dftrn.im_pth.values[i])
# %%timeit 
# im_pil = ims_pil[i]

100%|██████████| 100/100 [00:00<00:00, 5842.38it/s]


# dataset

In [8]:
class MelDataset(Dataset):
    """Dataset over in-memory PIL images with optional targets.

    Args:
        imgs: indexable collection of PIL images.
        targets: optional indexable collection aligned with ``imgs``. When
            ``None`` each item is a 1-tuple holding only the image tensor.
        train: when ``True`` (default, the original behaviour) apply the
            random augmentation pipeline; when ``False`` apply a deterministic
            resize + centre crop so repeated evaluation gives the same inputs.

    Raises:
        ValueError: if ``targets`` is given and its length differs from
            ``imgs``.
    """

    # Channel statistics given to Normalize (ImageNet mean / std).
    _MEAN = [0.485, 0.456, 0.406]
    _STD = [0.229, 0.224, 0.225]

    def __init__(self, imgs, targets=None, train=True):
        if targets is not None and len(targets) != len(imgs):
            raise ValueError(
                f'imgs and targets differ in length: {len(imgs)} vs {len(targets)}'
            )
        self.imgs = imgs
        self.targets = targets
        self.train = train
        if train:
            spatial_and_color = [
                transforms.RandomResizedCrop(SZ),
                transforms.RandomHorizontalFlip(),
                transforms.RandomVerticalFlip(),
                transforms.ColorJitter(brightness=32. / 255., saturation=0.5),
            ]
        else:
            spatial_and_color = [
                transforms.Resize(SZ),
                transforms.CenterCrop(SZ),
            ]
        self.composed = transforms.Compose(spatial_and_color + [
            transforms.ToTensor(),
            transforms.Normalize(self._MEAN, self._STD),
        ])

    def __getitem__(self, i):
        """Return ``(image_tensor, target)``, or ``(image_tensor,)`` without targets."""
        img = self.imgs[i]
        # Normalize is configured for three channels; grayscale / RGBA / CMYK
        # inputs would otherwise fail there.
        if img.mode != 'RGB':
            img = img.convert('RGB')
        x = self.composed(img)
        if self.targets is None:
            return (x,)
        return x, self.targets[i]

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.imgs)

In [9]:
# ds_trn = MelDataset(ims_pil, dftrn.target.values)
# ds_trn[0][0].shape, ds_trn[0][1]

# plt.imshow(ds_trn[0][0].numpy().transpose(1,2,0));
# plt.show()

# dl_trn = DataLoader(ds_trn, batch_size=BS, shuffle=True, num_workers=4, pin_memory=PIN_MEM)
# for dat in dl_trn:
#     x,y=dat; break

# print(x.shape, y)

# del ds_trn; gc.collect()

# Model

In [10]:
# Pretrained EfficientNet backbone (variant chosen by ARCH) whose classifier
# head is built with a single output unit.
mdl = EfficientNet.from_pretrained(ARCH, num_classes=1)

Loaded pretrained weights for efficientnet-b0


In [11]:
# mdl(x).shape

# train

In [17]:
# Positional row indices of the fold selected by FLD2USE.
tr, vl = fld2trvl[FLD2USE]
dftr, dfvl = dftrn.iloc[tr], dftrn.iloc[vl]
print(len(dftr), len(dfvl))

# ims_pil was built in dftrn row order, so the same positions pick the images.
dstr = MelDataset([ims_pil[i] for i in tr], dftr.target.values)
dsvl = MelDataset([ims_pil[i] for i in vl], dfvl.target.values)
print(len(dstr), len(dsvl))

# Shuffle only the training loader; validation runs in order with a batch
# twice as large.
dltr = DataLoader(dstr, batch_size=BS, shuffle=True,
                  num_workers=4, pin_memory=PIN_MEM)
dlvl = DataLoader(dsvl, batch_size=BS * 2, shuffle=False,
                  num_workers=4, pin_memory=PIN_MEM)
print(len(dltr), len(dlvl))

50 50


In [None]:
# Build the model again from the pretrained weights so that training does not
# continue from a `mdl` already modified by an earlier run of the cells below.
mdl = EfficientNet.from_pretrained(ARCH, num_classes=1)

In [None]:
# Put the model on the target device and create the optimizer first: the name
# `optimizer` is used here and in the training loop but was never defined.
mdl = mdl.to(device)
# NOTE(review): optimizer type and learning rate are placeholders — tune.
optimizer = torch.optim.Adam(mdl.parameters(), lr=1e-3)

# Mixed precision is set up only when FP16 is on, mirroring the `if FP16:`
# branches of the training loop.
opt_level = 'O1'
if FP16:
    # NOTE(review): apex amp presumably needs a CUDA device; with
    # DEVICE = 'cpu' FP16 likely has to be False — confirm.
    mdl, optimizer = amp.initialize(mdl, optimizer, opt_level=opt_level)


In [30]:
# Training loop: one pass over dltr per epoch, with gradient-norm clipping at 1.
# (A stray `break` after the inner loop used to stop after the first epoch
# regardless of EPOCHS; DEBUG already sets EPOCHS = 1 for quick runs.)
for epc in range(EPOCHS):
    mdl.train()
    for step, dat in enumerate(dltr):
        xb, yb = (o.to(device) for o in dat)
        prdb = mdl(xb)  # one logit per sample, shape (batch, 1)
        # Single-logit binary head: use BCE-with-logits. It needs float targets
        # shaped like the logits; the targets arrive as a 1-D batch.
        loss = F.binary_cross_entropy_with_logits(
            prdb.squeeze(1).float(), yb.float()
        )
        if FP16:
            # apex scales the loss to limit fp16 gradient underflow.
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            clip_params = amp.master_params(optimizer)
        else:
            loss.backward()
            clip_params = mdl.parameters()
        torch.nn.utils.clip_grad_norm_(clip_params, 1)
        optimizer.step()
        optimizer.zero_grad()

KeyboardInterrupt: 

In [None]:
loss

In [29]:
prdb

tensor([[-0.0260],
        [ 0.0682],
        [-0.2579],
        [-0.2346],
        [ 0.2248],
        [ 0.1600],
        [-0.3170],
        [ 0.2984]], grad_fn=<AddmmBackward>)