In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3"

from glob import glob
from sklearn.model_selection import GroupKFold
import cv2
from skimage import io
import torch
from torch import nn
import os
from datetime import datetime
import time
import random
import cv2
import pandas as pd
import numpy as np
import albumentations as A
import matplotlib.pyplot as plt
from albumentations.pytorch.transforms import ToTensorV2, ToTensor
from torchvision import transforms as T

from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
import sklearn
from efficientnet_pytorch import EfficientNet


from tqdm.notebook import tqdm
import torch.nn.functional as F
from sklearn import metrics



import warnings
warnings.filterwarnings("ignore")


In [2]:
train_df = pd.read_csv('./input/train.csv')
val_df = pd.read_csv('./input/val.csv')

train_df['kind'] = train_df['ImageFileName'].apply(lambda x: x.split('/')[-2])
train_df['image_name'] = train_df['ImageFileName'].apply(lambda x: x.split('/')[-1])
train_df['label4'] = train_df['kind'].map({'Cover':0, 'JMiPOD':1, 'JUNIWARD':2, 'UERD':3})


val_df['kind'] = val_df['ImageFileName'].apply(lambda x: x.split('/')[-2])
val_df['image_name'] = val_df['ImageFileName'].apply(lambda x: x.split('/')[-1])
val_df['label4'] = val_df['kind'].map({'Cover':0, 'JMiPOD':1, 'JUNIWARD':2, 'UERD':3})

In [3]:
train_df = train_df.sample(frac=1).reset_index(drop=True)
val_df = val_df.sample(frac=1).reset_index(drop=True)


In [5]:
def get_train_transforms():
    return A.Compose([
            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, p=1.0),
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            ToTensor(),
        ], p=1.0)

def get_valid_transforms():
    return A.Compose([
            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, p=1.0),
            ToTensor(),
        ], p=1.0)

In [6]:
DATA_ROOT_PATH = '../../data/alaska/'
class DatasetRetriever(Dataset):
    def __init__(self, kinds, image_names, labels, transforms=None):
        super().__init__()
        self.kinds = kinds
        self.image_names = image_names
        self.labels = labels
        self.transforms = transforms
    def __getitem__(self, index: int):
        kind, image_name, label = self.kinds[index], self.image_names[index], self.labels[index]
        image = cv2.imread(f'{DATA_ROOT_PATH}/{kind}/{image_name}')[:, :, ::-1]
        if self.transforms:
            sample = {'image': image}
            sample = self.transforms(**sample)
            image = sample['image']
        return image, label

    def __len__(self) -> int:
        return self.image_names.shape[0]

    def get_labels(self):
        return list(self.labels)

In [9]:
train_dataset = DatasetRetriever(
    kinds=train_df.kind.values,
    image_names=train_df.image_name.values,
    labels=train_df.label4.values,
    transforms=get_train_transforms(),
)
validation_dataset = DatasetRetriever(
    kinds=val_df.kind.values,
    image_names=val_df.image_name.values,
    labels=val_df.label4.values,
    transforms=get_valid_transforms(),
)

In [12]:
def alaska_weighted_auc(y_true, y_valid):
    """
    https://www.kaggle.com/anokas/weighted-auc-metric-updated
    """
    tpr_thresholds = [0.0, 0.4, 1.0]
    weights = [2, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_valid, pos_label=1)
    areas = np.array(tpr_thresholds[1:]) - np.array(tpr_thresholds[:-1])
    normalization = np.dot(areas, weights)
    competition_metric = 0
    for idx, weight in enumerate(weights):
        y_min = tpr_thresholds[idx]
        y_max = tpr_thresholds[idx + 1]
        mask = (y_min < tpr) & (tpr < y_max)
        x_padding = np.linspace(fpr[mask][-1], 1, 100)
        x = np.concatenate([fpr[mask], x_padding])
        y = np.concatenate([tpr[mask], [y_max] * len(x_padding)])
        y = y - y_min  # normalize such that curve starts at y=0
        score = metrics.auc(x, y)
        submetric = score * weight
        best_subscore = (y_max - y_min) * weight
        competition_metric += submetric
    return competition_metric / normalization
        

In [13]:
class get_net(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.model = EfficientNet.from_pretrained('efficientnet-b4')
        self.dense_output = nn.Linear(1792, num_classes)

    def forward(self, x):
        feat = self.model.extract_features(x)
        feat = F.avg_pool2d(feat, feat.size()[2:]).reshape(-1, 1792)
        return self.dense_output(feat)

In [14]:
model = get_net()

Loaded pretrained weights for efficientnet-b0


In [20]:
device = torch.device('cuda')
batch_size = 64
num_workers = 6 
n_epochs = 200

In [21]:
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    pin_memory=False,
    drop_last=True,
    num_workers=num_workers,
)
    
val_loader = torch.utils.data.DataLoader(
    validation_dataset, 
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
    pin_memory=False,
)

In [23]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
] 

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=200, eta_min=1e-7)

criterion = torch.nn.CrossEntropyLoss()

In [24]:
model.to(device);

In [26]:
def get_score(preds, y):
    preds = np.array(preds)
    labels = preds.argmax(1)
    acc = (labels == y).mean()*100
    new_preds = np.zeros((len(preds),))
    temp = preds[labels != 0, 1:]
    new_preds[labels != 0] = temp.sum(1)
    new_preds[labels == 0] = preds[labels == 0, 0]
    y = np.array(y)
    y[y != 0] = 1
    auc_score = alaska_weighted_auc(y, new_preds)
    return auc_score, acc

def model_train():
    model.train();
def model_eval():
    model.eval()

In [28]:
def save_model(path, loss,epoch):
    model_eval()
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'model_loss': loss,
        'epoch': epoch,
    }, path)


In [29]:
foledr = './dump/ef_b0_small_div255_improve_2/'

In [None]:
best_loss = 999999

train_loss, val_loss = [], []
for e in range(n_epochs):
    model_train()
    running_loss = 0
    tk0 = tqdm(train_loader, total=int(len(train_loader)))
    for im, labels in tk0:
        inputs = im.to(device, dtype=torch.float)
        labels = labels.to(device, dtype=torch.long)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        tk0.set_postfix(loss=(loss.item()))
    epoch_loss = running_loss / (len(train_loader))
    train_loss.append(epoch_loss)
    print('Training Loss: {:.8f}'.format(epoch_loss))
    with open(foledr+"/train_loss.txt", "a") as text_file:
        _ = text_file.write(str(epoch_loss)+'\n')
    
    tk1 = tqdm(val_loader, total=int(len(val_loader)))
    model_eval()
    running_loss = 0
    y, preds = [], []
    with torch.no_grad():
        for (im, labels) in tk1:
            inputs = im.to(device, dtype=torch.float)
            labels = labels.to(device, dtype=torch.long)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            y.extend(labels.cpu().numpy().astype(int))
            preds.extend(F.softmax(outputs, 1).cpu().numpy())
            running_loss += loss.item()
            tk1.set_postfix(loss=(loss.item()))
        epoch_loss = running_loss / (len(val_loader))
        val_loss.append(epoch_loss)
        auc_score, acc = get_score(preds, y)

        print(f'Val Loss: {epoch_loss:.3}, Weighted AUC:{auc_score:.3}, Acc: {acc:.3}')
        with open(foledr+"/val_loss.txt", "a") as text_file:
            _ = text_file.write(str(epoch_loss)+'\n')
        with open(foledr+"/auc.txt", "a") as text_file:
            _ = text_file.write(str(auc_score)+'\n')
        with open(foledr+"/acc.txt", "a") as text_file:
            _ = text_file.write(str(acc)+'\n')
    
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        save_model(foledr+'/best_model/best_cp.bin', epoch_loss,e)
    save_model(foledr+'/last_model/last_cp.bin', epoch_loss,e)
    scheduler.step(metrics=epoch_loss)
        
                

In [None]:
model_paths = ['./dump/model_0.925_17.635.h5',
    './dump/model_0.925_17.679.h5',
    './dump//model_0.924_17.725.h5']

In [None]:
data_dir = '/home/data/alaska/'
from glob import glob
class Alaska2TestDataset(Dataset):
    def __init__(self, df, augmentations=None):
        self.data = df
        self.augment = augmentations

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        fn = self.data.loc[idx][0]
        im = cv2.imread(fn)[:, :, ::-1]

        return im

test_filenames = sorted(glob(f"{data_dir}/Test/*.jpg"))
test_df = pd.DataFrame({'ImageFileName': list(
    test_filenames)}, columns=['ImageFileName'])

test_dataset = Alaska2TestDataset(test_df, augmentations=AUGMENTATIONS_TEST)
testloader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=8,
                                          num_workers=8,
                                          shuffle=False,
                                          drop_last=False)

In [None]:
def to_sabmition(all_preds):
    all_preds = np.array(all_preds)
    s = np.sum([2**i for i in range(len(all_preds))])
    all_preds = np.sum([all_preds[i]*2**(len(all_preds) -i - 1) for i in range(len(all_preds))], axis=0) / s
    return all_preds

def foo(preds):
    preds = np.array(preds)
    labels = preds.argmax(1)
    result = np.zeros((len(preds),))
    result[labels != 0] = preds[labels != 0, 1:].sum(1)
    result[labels == 0] = 1 - preds[labels == 0, 0]
    return result
    
p1, p2, p3 = [], [], []
for i, model_path in enumerate(model_paths):
    net = get_net(num_classes)
    checkpoint = torch.load(model_path)
    net.load_state_dict(torch.load_state_dict(checkpoint['model_state_dict']))
    net = net.to(device)
    net = net.eval() 
    
    p1,p2,p3 = predict_test_by_model(net)
    t1,t2,t3 = [], [], []
    net = net.eval()
    with torch.no_grad():
        for inputs in tqdm(testloader):
            inputs = inputs.to(device, dtype=torch.float)
            t1.extend(F.softmax(net(inputs), 1).cpu().detach().numpy())
            t2.extend(F.softmax(net(inputs.flip(2)), 1).cpu().detach().numpy())
            t3.extend(F.softmax(net(inputs.flip(3)), 1).cpu().detach().numpy())
    p1.append(foo(np.array(t1)))
    p2.append(foo(np.array(t2)))
    p3.append(foo(np.array(t3)))


In [None]:
submission = pd.read_csv('/home/data/alaska/sample_submission.csv')

submission.Label = 0.4*to_sabmition(p1) + 0.3*to_sabmition(p2) + 0.3*to_sabmition(p3)
submission.head()
submission.to_csv('./dump/s/submition.csv', index=False)