In [1]:
import os
from datetime import datetime
from typing import Dict, Tuple, Any
from tqdm import tqdm

import math
import numpy as np
import pandas as pd

import cv2
import albumentations
from torch.utils.data import Dataset

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.autograd import Variable
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import OneCycleLR, ReduceLROnPlateau

import timm

In [6]:
a = torch.randn(3, 4)
a

tensor([[-0.5054, -0.4654,  0.9447,  1.6845],
        [ 0.3274,  0.8050, -0.5107,  0.1046],
        [-0.2346,  0.6053,  1.3478,  0.9806]])

In [7]:
a.mean(1)

tensor([0.4146, 0.1816, 0.6748])

In [2]:
timm.list_models()

['adv_inception_v3',
 'botnet26t_256',
 'botnet50ts_256',
 'cait_m36_384',
 'cait_m48_448',
 'cait_s24_224',
 'cait_s24_384',
 'cait_s36_384',
 'cait_xs24_384',
 'cait_xxs24_224',
 'cait_xxs24_384',
 'cait_xxs36_224',
 'cait_xxs36_384',
 'coat_lite_mini',
 'coat_lite_small',
 'coat_lite_tiny',
 'coat_mini',
 'coat_tiny',
 'cspdarknet53',
 'cspdarknet53_iabn',
 'cspresnet50',
 'cspresnet50d',
 'cspresnet50w',
 'cspresnext50',
 'cspresnext50_iabn',
 'darknet53',
 'densenet121',
 'densenet121d',
 'densenet161',
 'densenet169',
 'densenet201',
 'densenet264',
 'densenet264d_iabn',
 'densenetblur121d',
 'dla34',
 'dla46_c',
 'dla46x_c',
 'dla60',
 'dla60_res2net',
 'dla60_res2next',
 'dla60x',
 'dla60x_c',
 'dla102',
 'dla102x',
 'dla102x2',
 'dla169',
 'dm_nfnet_f0',
 'dm_nfnet_f1',
 'dm_nfnet_f2',
 'dm_nfnet_f3',
 'dm_nfnet_f4',
 'dm_nfnet_f5',
 'dm_nfnet_f6',
 'dpn68',
 'dpn68b',
 'dpn92',
 'dpn98',
 'dpn107',
 'dpn131',
 'eca_nfnet_l0',
 'eca_nfnet_l1',
 'eca_vovnet39b',
 'ecaresnet26t'

In [3]:
model = timm.create_model('resnest101e', pretrained=True)

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21k-6337ad01.pth" to /home/yuan/.cache/torch/hub/checkpoints/tf_efficientnetv2_s_21k-6337ad01.pth


In [8]:
model = EffnetV2m_Landmark(81318)

In [4]:
class Swish(torch.autograd.Function):

    @staticmethod
    def forward(ctx, i):
        result = i * torch.sigmoid(i)
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        i = ctx.saved_variables[0]
        sigmoid_i = torch.sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))


class Swish_module(nn.Module):
    @autocast()
    def forward(self, x):
        return Swish.apply(x)


class DenseCrossEntropy(nn.Module):
    @autocast()
    def forward(self, x, target):
        x = x.float()
        target = target.float()
        logprobs = torch.nn.functional.log_softmax(x, dim=-1)

        loss = -logprobs * target
        loss = loss.sum(-1)
        return loss.mean()


class ArcMarginProduct_subcenter(nn.Module):
    def __init__(self, in_features, out_features, k=3):
        super().__init__()
        self.weight = nn.Parameter(torch.FloatTensor(out_features*k, in_features))
        self.reset_parameters()
        self.k = k
        self.out_features = out_features
        
    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
    
    @autocast()
    def forward(self, features):
        cosine_all = F.linear(F.normalize(features), F.normalize(self.weight))
        cosine_all = cosine_all.view(-1, self.out_features, self.k)
        cosine, _ = torch.max(cosine_all, dim=2)
        return cosine   


class ArcFaceLossAdaptiveMargin(nn.modules.Module):
    def __init__(self, margins, s=30.0):
        super().__init__()
        self.crit = DenseCrossEntropy()
        self.s = s
        self.margins = margins
            
    @autocast()
    def forward(self, logits, labels, out_dim):
        ms = []
        ms = self.margins[labels.cpu().numpy()]
        cos_m = torch.from_numpy(np.cos(ms)).float().cuda()
        sin_m = torch.from_numpy(np.sin(ms)).float().cuda()
        th = torch.from_numpy(np.cos(math.pi - ms)).float().cuda()
        mm = torch.from_numpy(np.sin(math.pi - ms) * ms).float().cuda()
        labels = F.one_hot(labels, out_dim).float()
        logits = logits.float()
        cosine = logits
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * cos_m.view(-1, 1) - sine * sin_m.view(-1, 1)
        phi = torch.where(cosine > th.view(-1, 1), phi, cosine - mm.view(-1, 1))
        output = (labels * phi) + ((1.0 - labels) * cosine)
        output *= self.s
        loss = self.crit(output, labels)
        return loss


def gem(x, p=3, eps=1e-6):
    return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)


class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6, p_trainable=True):
        super(GeM,self).__init__()
        if p_trainable:
            self.p = Parameter(torch.ones(1)*p)
        else:
            self.p = p
        self.eps = eps

    def forward(self, x):
        return gem(x, p=self.p, eps=self.eps)
    
    def __repr__(self):
        return self.__class__.__name__ + '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + ', ' + 'eps=' + str(self.eps) + ')'

class EffnetV2m_Landmark(nn.Module):

    def __init__(self, out_dim, load_pretrained=True):
        super().__init__()

        self.backbone = timm.create_model('resnest101e', pretrained=True)
        self.feat = nn.Sequential(
            nn.Linear(self.backbone.num_features, 512, bias=True),
            nn.BatchNorm1d(512),
            Swish_module()
        )
        self.backbone.global_pool = GeM()
        self.backbone.classifier = nn.Identity()
        
        # self.swish = Swish_module()
        self.metric_classify = ArcMarginProduct_subcenter(512, out_dim)


    def extract(self, x):
        return self.backbone(x)[:, :, 0, 0]

    @autocast()
    def forward(self, x):
        x = self.extract(x)
        logits_m = self.metric_classify(self.feat(x))
        return logits_m

In [5]:
model = EffnetV2m_Landmark(81313)

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest101-22405ba7.pth" to /home/yuan/.cache/torch/hub/checkpoints/resnest101-22405ba7.pth


In [None]:
model.backbone

In [5]:
with torch.no_grad():
    a = model.forward_features(torch.randn(8, 3, 448, 448).float()).shape
    print(a)

torch.Size([8, 1280, 14, 14])


In [12]:
model.num_features

768

In [11]:
dir(model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_call_impl',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_backward_hooks',
 '_get_name',
 '_init_weights',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_set',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replicate_for_data_parallel',
 '_save_to_state_dict',
 '_slow_forward',
 '_state_dict_hooks',
 '_version',
 'add_m

In [2]:
# ---------------------------------------
# parameters

MODEL_DIR = './model_checkpoints/'
DATA_DIR = '../input/'
LOG_DIR = './logs/'
DEVICE = 'cuda:0'
MODEL_NAME = 'rexnet_200'

TRAIN_STEP = 0
FOLD = 0

IMAGE_SIZE = 256
BATCH_SIZE = 64
NUM_EPOCHS = 10
NUM_WORKERS = 4
LR = 1e-4
USE_AMP = True


In [5]:
# ---------------------------------------
# utils

def set_seed(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


In [3]:
# ---------------------------------------
# data pipeline definitions

class LandmarkDataset(Dataset):
    def __init__(self, csv, mode, transform=None):

        self.csv = csv.reset_index()
        self.mode = mode
        self.transform = transform

    def __len__(self):
        return self.csv.shape[0]

    def __getitem__(self, index):
        row = self.csv.iloc[index]

        image = cv2.imread(row.filepath)[:,:,::-1]

        if self.transform is not None:
            res = self.transform(image=image)
            image = res['image'].astype(np.float32)
        else:
            image = image.astype(np.float32)

        image = image.transpose(2, 0, 1)
        if self.mode == 'test':
            return torch.tensor(image)
        else:
            return torch.tensor(image), torch.tensor(row.landmark_id)


def get_transforms():

    transforms_train = albumentations.Compose([
        albumentations.Resize(IMAGE_SIZE, IMAGE_SIZE),
        albumentations.HorizontalFlip(p=0.5),
        albumentations.ImageCompression(quality_lower=99, quality_upper=100),
        albumentations.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=10, border_mode=0, p=0.7),
        albumentations.Cutout(max_h_size=int(IMAGE_SIZE * 0.4), max_w_size=int(IMAGE_SIZE * 0.4), num_holes=1, p=0.5),
        albumentations.Normalize()
    ])

    transforms_val = albumentations.Compose([
        albumentations.Resize(IMAGE_SIZE, IMAGE_SIZE),
        albumentations.Normalize()
    ])

    return transforms_train, transforms_val


def get_df():

    df = pd.read_csv(os.path.join(DATA_DIR, 'train_0.csv'))

    if TRAIN_STEP == 0:
        # df_train = pd.read_csv(os.path.join(DATA_DIR, 'train_url.csv')).drop(columns=['url'])
        df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    else:
        cls_81313 = df.landmark_id.unique()
        # df_train = pd.read_csv(os.path.join(DATA_DIR, 'train_url.csv')).drop(columns=['url']).set_index('landmark_id').loc[cls_81313].reset_index()
        df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv')).set_index('landmark_id').loc[cls_81313].reset_index()
        
    df_train['filepath'] = df_train['id'].apply(lambda x: os.path.join(DATA_DIR, 'train', x[0], x[1], x[2], f'{x}.jpg'))
    df = df_train.merge(df, on=['id','landmark_id'], how='left')

    landmark_id2idx = {landmark_id: idx for idx, landmark_id in enumerate(sorted(df['landmark_id'].unique()))}
    idx2landmark_id = {idx: landmark_id for idx, landmark_id in enumerate(sorted(df['landmark_id'].unique()))}
    df['landmark_id'] = df['landmark_id'].map(landmark_id2idx)

    out_dim = df.landmark_id.nunique()

    return df, out_dim

In [34]:
# ---------------------------------------
# model definitions

class Swish(torch.autograd.Function):

    @staticmethod
    def forward(ctx, i):
        result = i * torch.sigmoid(i)
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        i = ctx.saved_variables[0]
        sigmoid_i = torch.sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))


class Swish_module(nn.Module):
    @autocast()
    def forward(self, x):
        return Swish.apply(x)


class DenseCrossEntropy(nn.Module):
    @autocast()
    def forward(self, x, target):
        x = x.float()
        target = target.float()
        logprobs = torch.nn.functional.log_softmax(x, dim=-1)

        loss = -logprobs * target
        loss = loss.sum(-1)
        return loss.mean()


class ArcMarginProduct_subcenter(nn.Module):
    def __init__(self, in_features, out_features, k=3):
        super().__init__()
        self.weight = nn.Parameter(torch.FloatTensor(out_features*k, in_features))
        self.reset_parameters()
        self.k = k
        self.out_features = out_features
        
    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
    
    @autocast()
    def forward(self, features):
        cosine_all = F.linear(F.normalize(features), F.normalize(self.weight))
        cosine_all = cosine_all.view(-1, self.out_features, self.k)
        cosine, _ = torch.max(cosine_all, dim=2)
        return cosine   


class ArcFaceLossAdaptiveMargin(nn.modules.Module):
    def __init__(self, margins, s=30.0):
        super().__init__()
        self.crit = DenseCrossEntropy()
        self.s = s
        self.margins = margins
            
    @autocast()
    def forward(self, logits, labels, out_dim):
        ms = []
        ms = self.margins[labels.cpu().numpy()]
        cos_m = torch.from_numpy(np.cos(ms)).float().cuda()
        sin_m = torch.from_numpy(np.sin(ms)).float().cuda()
        th = torch.from_numpy(np.cos(math.pi - ms)).float().cuda()
        mm = torch.from_numpy(np.sin(math.pi - ms) * ms).float().cuda()
        labels = F.one_hot(labels, out_dim).float()
        logits = logits.float()
        cosine = logits
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * cos_m.view(-1, 1) - sine * sin_m.view(-1, 1)
        phi = torch.where(cosine > th.view(-1, 1), phi, cosine - mm.view(-1, 1))
        output = (labels * phi) + ((1.0 - labels) * cosine)
        output *= self.s
        loss = self.crit(output, labels)
        return loss


def gem(x, p=3, eps=1e-6):
    return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)


class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6, p_trainable=True):
        super(GeM,self).__init__()
        if p_trainable:
            self.p = Parameter(torch.ones(1)*p)
        else:
            self.p = p
        self.eps = eps

    def forward(self, x):
        return gem(x, p=self.p, eps=self.eps)
    
    def __repr__(self):
        return self.__class__.__name__ + '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + ', ' + 'eps=' + str(self.eps) + ')'


class RexNet20_Landmark(nn.Module):

    def __init__(self, out_dim, load_pretrained=True):
        super(RexNet20_Landmark, self).__init__()

        self.backbone = timm.create_model('rexnet_200', pretrained=load_pretrained)
        self.feat = nn.Sequential(
            nn.Linear(self.backbone.features[-1].out_channels, 512, bias=True),
            nn.BatchNorm1d(512),
            Swish_module()
        )
        self.backbone.head.global_pool = GeM()
        self.backbone.head.fc = nn.Identity()
        
        # self.swish = Swish_module()
        self.metric_classify = ArcMarginProduct_subcenter(512, out_dim)


    def extract(self, x):
        return self.backbone(x)[:, :, 0, 0]

    @autocast()
    def forward(self, x):
        x = self.extract(x)
        # logits_m = self.metric_classify(self.swish(self.feat(x)))
        logits_m = self.metric_classify(self.feat(x))
        return logits_m

In [35]:
# a = timm.create_model('rexnet_200', pretrained=True)
a = RexNet20_Landmark(80000)
# a = Net()

In [36]:
b = torch.randn(8, 3, 512, 512).float()

In [37]:
a(b).shape

torch.Size([8, 80000])

In [20]:
a.global_pool(a.backbone(b)).shape

torch.Size([8, 1536, 16, 16])

In [21]:
a.global_pool(a.backbone(b)).shape

torch.Size([8, 1536, 1, 1])

In [6]:
# ---------------------------------------
# training utils

def global_average_precision_score(
        y_true: Dict[Any, Any],
        y_pred: Dict[Any, Tuple[Any, float]]
) -> float:
    """
    Compute Global Average Precision score (GAP)
    Parameters
    ----------
    y_true : Dict[Any, Any]
        Dictionary with query ids and true ids for query samples
    y_pred : Dict[Any, Tuple[Any, float]]
        Dictionary with query ids and predictions (predicted id, confidence
        level)
    Returns
    -------
    float
        GAP score
    """
    indexes = list(y_pred.keys())
    indexes.sort(
        key=lambda x: -y_pred[x][1],
    )
    queries_with_target = len([i for i in y_true.values() if i is not None])
    correct_predictions = 0
    total_score = 0.
    for i, k in enumerate(indexes, 1):
        relevance_of_prediction_i = 0
        if y_true[k] == y_pred[k][0]:
            correct_predictions += 1
            relevance_of_prediction_i = 1
        precision_at_rank_i = correct_predictions / i
        total_score += precision_at_rank_i * relevance_of_prediction_i

    return 1 / queries_with_target * total_score
    

def train_epoch(model, loader, optimizer, criterion, scaler):

    model.train()
    train_loss = []
    bar = tqdm(loader)
    for (data, target) in bar:

        data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()

        if not USE_AMP:
            logits_m = model(data)
            loss = criterion(logits_m, target)
            loss.backward()
            optimizer.step()
        else:
            with autocast():
                logits_m = model(data)
                loss = criterion(logits_m, target)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

        torch.cuda.synchronize()
            
        loss_np = loss.detach().cpu().numpy()
        train_loss.append(loss_np)
        smooth_loss = sum(train_loss[-100:]) / min(len(train_loss), 100)
        bar.set_description('loss: %.5f, smth: %.5f' % (loss_np, smooth_loss))

    return train_loss


def val_epoch(model, valid_loader, criterion, get_output=False):

    model.eval()
    val_loss = []
    PRODS_M = []
    PREDS_M = []
    TARGETS = []

    with torch.no_grad():
        for (data, target) in tqdm(valid_loader):
            data, target = data.cuda(), target.cuda()

            logits_m = model(data)

            lmax_m = logits_m.max(1)
            probs_m = lmax_m.values
            preds_m = lmax_m.indices

            PRODS_M.append(probs_m.detach().cpu())
            PREDS_M.append(preds_m.detach().cpu())
            TARGETS.append(target.detach().cpu())

            loss = criterion(logits_m, target)
            val_loss.append(loss.detach().cpu().numpy())

        val_loss = np.mean(val_loss)
        PRODS_M = torch.cat(PRODS_M).numpy()
        PREDS_M = torch.cat(PREDS_M).numpy()
        TARGETS = torch.cat(TARGETS)

    if get_output:
        return LOGITS_M
    else:
        acc_m = (PREDS_M == TARGETS.numpy()).mean() * 100.
        y_true = {idx: target if target >=0 else None for idx, target in enumerate(TARGETS)}
        y_pred_m = {idx: (pred_cls, conf) for idx, (pred_cls, conf) in enumerate(zip(PREDS_M, PRODS_M))}
        gap_m = global_average_precision_score(y_true, y_pred_m)
        return val_loss, acc_m, gap_m


In [7]:
# get dataframe
df, out_dim = get_df()
print(f"out_dim = {out_dim}")

# get adaptive margin
tmp = np.sqrt(1 / np.sqrt(df['landmark_id'].value_counts().sort_index().values))
margins = (tmp - tmp.min()) / (tmp.max() - tmp.min()) * 0.45 + 0.05

out_dim = 81313


In [8]:
# get augmentations
transforms_train, transforms_val = get_transforms()

# get train and valid dataset
df_train = df[df['fold'] != FOLD]
df_valid = df[df['fold'] == FOLD].reset_index(drop=True).query("index % 15==0")

dataset_train = LandmarkDataset(df_train, 'train', transform=transforms_train)
dataset_valid = LandmarkDataset(df_valid, 'val', transform=transforms_val)
train_loader = DataLoader(
    dataset_train, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
    shuffle=True
)
valid_loader = DataLoader(dataset_valid, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
 



In [9]:
# model
model = nn.DataParallel(RexNet20_Landmark(out_dim=out_dim)).to(DEVICE)

# loss func
def criterion(logits_m, target):
    arc = ArcFaceLossAdaptiveMargin(margins=margins, s=80)
    loss_m = arc(logits_m, target, out_dim)
    return loss_m

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scaler = GradScaler(enabled=True)

# scheduler
scheduler = OneCycleLR(optimizer, max_lr=LR, steps_per_epoch=len(train_loader), pct_start=.05, epochs=NUM_EPOCHS)

In [10]:
# train & valid loop
gap_m_max = 0.
model_file = os.path.join(MODEL_DIR, f'{MODEL_NAME}_fold{FOLD}.pth')

for epoch in range(NUM_EPOCHS):
    
    curr_time = datetime.strftime(datetime.now(), '%Y%b%d_%HH%MM%SS')
    print(curr_time, 'Epoch:', epoch)
    
    train_loss = train_epoch(model, train_loader, optimizer, criterion, scaler)
    val_loss, acc_m, gap_m = val_epoch(model, valid_loader, criterion)

    content = curr_time + ' ' + f'Fold {FOLD}, Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(train_loss):.5f}, valid loss: {(val_loss):.5f}, acc_m: {(acc_m):.6f}, gap_m: {(gap_m):.6f}.'
    print(content)
    
    with open(os.path.join(MODEL_DIR, f'{MODEL_NAME}.txt'), 'a') as appender:
        appender.write(content + '\n')

    print('gap_m_max ({:.6f} --> {:.6f}). Saving model ...'.format(gap_m_max, gap_m))
    
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        },
        model_file
    )
    gap_m_max = gap_m

print(datetime.strftime(datetime.now(), '%Y%b%d_%HH%MM%SS'), 'Training Finished!')

torch.save(
    {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, 
    os.path.join(MODEL_DIR, f'{MODEL_NAME}_fold{FOLD}_final.pth')
)

2021Aug21_22H43M16S Epoch: 0


  i = ctx.saved_variables[0]
loss: 30.36711, smth: 33.90587:   1%|▏         | 288/19756 [01:47<2:01:34,  2.67it/s]


KeyboardInterrupt: 