In [1]:
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor

import torch
import torchvision 
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from torch.utils.data import Dataset
import PIL
import os
import cv2
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.model_selection import KFold
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch import optim
import time
import random
from sklearn.model_selection import StratifiedKFold
import math
from torchvision import models
import datetime
from torch.utils.data.dataset import Subset
from torchvision.models import resnet50

In [2]:
!pip install einops

Collecting einops
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Installing collected packages: einops
Successfully installed einops-0.4.1


In [3]:
!pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [4]:
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary

In [5]:
data_dir = '../input/petfinder-pawpularity-score/'
working_dir = './'
global_batch_size = 64
workers = 2
seed = 42
model_name = 'resnet50_pretrained'
model_path = model_name
print(os.listdir(data_dir))

['sample_submission.csv', 'train.csv', 'test.csv', 'test', 'train']


In [6]:
# Defining a function to seed everything.
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONSEED"] = str(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

# Running the function:
seed_everything(seed)

In [7]:
train_data = pd.read_csv(f'{data_dir}train.csv')
train_data.head(5)

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72


In [8]:
class create_dataset(Dataset):
    """Dataset connecting animal images to the score and annotations"""

    def __init__(self, csv_file, img_dir, transform=transforms.ToTensor()):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            img_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """

        self.annotations_csv = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.annotations_csv)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.img_dir,
                                self.annotations_csv.iloc[idx, 0])

        # load each image in PIL format for compatibility with transforms
        image = PIL.Image.open(img_name + '.jpg')
        
        # Columns 1 to 12 contain the annotations
        annotations = np.array(self.annotations_csv.iloc[idx, 1:13])
        annotations = annotations.astype('float')
        # Column 13 has the scores
        score = np.array(self.annotations_csv.iloc[idx, 13])
        score = torch.tensor(score.astype('float')).view(1).to(torch.float32)

        # Apply the transforms
        image = self.transform(image)

        sample = [image, annotations, score]
        return sample

In [9]:
## Define transforms with image augmentation on the training set
img_transforms = transforms.Compose([transforms.Resize(255),
                                     transforms.CenterCrop(224),
                                     transforms.RandomHorizontalFlip(),
                                     transforms.RandomRotation(20),
                                     transforms.ToTensor(),
                                     transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                          std=[0.229, 0.224, 0.225])])

img_transforms_valid = transforms.Compose([transforms.Resize(255),
                                           transforms.CenterCrop(224),
                                           transforms.ToTensor(),
                                           transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                                std=[0.229, 0.224, 0.225])])

In [10]:
num_bins = int(1+np.log2(len(train_data)))
train_data['bins'] = pd.cut(train_data['Pawpularity'],bins=num_bins,labels=False)
KF = StratifiedKFold(n_splits=3,random_state=seed,shuffle=True)

In [11]:
# Load the dataset
train_dataset = create_dataset(f'{data_dir}train.csv', f'{data_dir}train', transform=img_transforms)
valid_dataset = create_dataset(f'{data_dir}train.csv', f'{data_dir}train', transform=img_transforms_valid)

In [12]:
img = train_dataset[0][0]
img = img.unsqueeze(0)

In [13]:
img.shape

torch.Size([1, 3, 224, 224])

In [14]:
class PatchEmbedding(nn.Module):
    def __init__(self, in_channels: int = 3, patch_size: int = 16, emb_size: int = 768, img_size: int = 224):
        self.patch_size = patch_size
        super().__init__()
        self.projection = nn.Sequential(
            # using a conv layer instead of a linear one -> performance gains
            nn.Conv2d(in_channels, emb_size, kernel_size=patch_size, stride=patch_size),
            Rearrange('b e (h) (w) -> b (h w) e'),
        )
        self.cls_token = nn.Parameter(torch.randn(1,1, emb_size))
        self.positions = nn.Parameter(torch.randn((img_size // patch_size) **2 + 1, emb_size))

        
    def forward(self, x: Tensor) -> Tensor:
        b, _, _, _ = x.shape
        x = self.projection(x)
        cls_tokens = repeat(self.cls_token, '() n e -> b n e', b=b)
        # prepend the cls token to the input
        x = torch.cat([cls_tokens, x], dim=1)
        # add position embedding
        x += self.positions
        return x

In [15]:
PatchEmbedding()(img).shape

torch.Size([1, 197, 768])

In [16]:
class MultiHeadAttention(nn.Module):
    def __init__(self, emb_size: int = 768, num_heads: int = 8, dropout: float = 0):
        super().__init__()
        self.emb_size = emb_size
        self.num_heads = num_heads
        # fuse the queries, keys and values in one matrix
        self.qkv = nn.Linear(emb_size, emb_size * 3)
        self.att_drop = nn.Dropout(dropout)
        self.projection = nn.Linear(emb_size, emb_size)
        
    def forward(self, x : Tensor, mask: Tensor = None) -> Tensor:
        # split keys, queries and values in num_heads
        qkv = rearrange(self.qkv(x), "b n (h d qkv) -> (qkv) b h n d", h=self.num_heads, qkv=3)
        queries, keys, values = qkv[0], qkv[1], qkv[2]
        # sum up over the last axis
        energy = torch.einsum('bhqd, bhkd -> bhqk', queries, keys) # batch, num_heads, query_len, key_len
        if mask is not None:
            fill_value = torch.finfo(torch.float32).min
            energy.mask_fill(~mask, fill_value)
            
        scaling = self.emb_size ** (1/2)
        att = F.softmax(energy, dim=-1) / scaling
        att = self.att_drop(att)
        # sum up over the third axis
        out = torch.einsum('bhal, bhlv -> bhav ', att, values)
        out = rearrange(out, "b h n d -> b n (h d)")
        out = self.projection(out)
        return out

In [17]:
class ResidualAdd(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn
        
    def forward(self, x, **kwargs):
        res = x
        x = self.fn(x, **kwargs)
        x += res
        return x

In [18]:
class FeedForwardBlock(nn.Sequential):
    def __init__(self, emb_size: int, expansion: int = 4, drop_p: float = 0.):
        super().__init__(
            nn.Linear(emb_size, expansion * emb_size),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(expansion * emb_size, emb_size),
        )

In [19]:
class TransformerEncoderBlock(nn.Sequential):
    def __init__(self,
                 emb_size: int = 768,
                 drop_p: float = 0.,
                 forward_expansion: int = 4,
                 forward_drop_p: float = 0.,
                 ** kwargs):
        super().__init__(
            ResidualAdd(nn.Sequential(
                nn.LayerNorm(emb_size),
                MultiHeadAttention(emb_size, **kwargs),
                nn.Dropout(drop_p)
            )),
            ResidualAdd(nn.Sequential(
                nn.LayerNorm(emb_size),
                FeedForwardBlock(
                    emb_size, expansion=forward_expansion, drop_p=forward_drop_p),
                nn.Dropout(drop_p)
            )
            ))

In [20]:
patches_embedded = PatchEmbedding()(img)
MultiHeadAttention()(patches_embedded).shape

torch.Size([1, 197, 768])

In [21]:
class TransformerEncoder(nn.Sequential):
    def __init__(self, depth: int = 12, **kwargs):
        super().__init__(*[TransformerEncoderBlock(**kwargs) for _ in range(depth)])

In [22]:
class ClassificationHead(nn.Sequential):
    def __init__(self, emb_size: int = 768, n_classes: int = 1000):
        super().__init__(
            Reduce('b n e -> b e', reduction='mean'),
            nn.LayerNorm(emb_size), 
            nn.Linear(emb_size, n_classes))

In [23]:
class ViT(nn.Sequential):
    def __init__(self,     
                in_channels: int = 3,
                patch_size: int = 16,
                emb_size: int = 768,
                img_size: int = 224,
                depth: int = 12,
                n_classes: int = 1,
                **kwargs):
        super().__init__(
            PatchEmbedding(in_channels, patch_size, emb_size, img_size),
            TransformerEncoder(depth, emb_size=emb_size, **kwargs),
            ClassificationHead(emb_size, n_classes)
        )

In [24]:
# resouces: https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
class EarlyStopping:
    
    def __init__(self,patience,verbose,path,trace_func):
        self.patience = patience
        self.verbose = verbose
        self.path = path
        self.counter = -1
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.trace_func = trace_func
        
    def __call__(self,val_loss,model):
        
        if self.best_score is None:
            self.best_score = val_loss
            self.checkpoint(val_loss,model)
        
        if val_loss < self.best_score:
            self.best_score = val_loss
            self.checkpoint(val_loss,model)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
                
        return self.path
    
    def checkpoint(self,val_loss,model):
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(),self.path)
        self.val_loss_min = val_loss

In [25]:
# check if CUDA is available and set the training device

train_on_gpu = torch.cuda.is_available()
device = torch.cuda.get_device_name()

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print(f'CUDA is available!  Training on GPU {device}...')

CUDA is available!  Training on GPU Tesla P100-PCIE-16GB...


In [26]:
def training_loop(n_epochs,optimizer,model,loss_fn,train_loader,val_loader,model_path,fold):

    earlystopping = EarlyStopping(patience=7,verbose=False,path=model_path,trace_func=print)
    train_losses, valid_losses = [], []
    
    for epoch in range(1,n_epochs+1):
        start = time.time()
        current_lr = scheduler.get_last_lr()[0]
        
        # keep track of training and validation loss
        train_loss = 0.0
        valid_loss = 0.0
        
        ###################
        # train the model #
        ###################
        # put in training mode (enable dropout)
        model.train()
        for images, annotations, scores in train_loader:
            # move tensors to GPU if CUDA is available
            if train_on_gpu:
                images, annotations, scores = images.cuda(), annotations.cuda(), scores.cuda()
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(images)*100 # multiply by 100 the sigmoid output to 0-100 pawpularity scale
            # print(output.dtype)
            # print(scores.dtype)
            # calculate the batch loss
            loss = loss_fn(output, scores)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update training loss
            train_loss += loss.item()
        
        ######################    
        # validate the model #
        ######################
        # eval mode (no dropout)
        model.eval()
        with torch.no_grad():
            for images, annotations, scores in valid_loader:
                # move tensors to GPU if CUDA is available
                if train_on_gpu:
                    images, annotations, scores = images.cuda(), annotations.cuda(), scores.cuda()
                # forward pass: compute predicted outputs by passing inputs to the model
                output = model(images)*100 # multiply by 100 the sigmoid output to 0-100 pawpularity scale
                # calculate the batch loss
                loss = loss_fn(output, scores)
                # update average validation loss 
                valid_loss += loss.item()
        
        # calculate RMSE
        train_loss = math.sqrt(train_loss/len(train_loader.sampler))
        valid_loss = math.sqrt(valid_loss/len(valid_loader.sampler))
    
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)  
       
        # increment learning rate decay
        scheduler.step()
        
         # print training/validation statistics 
#         print('Epoch: {}, time: {:.1f}s, lr: {:.7f} \tTraining Loss: {:.3f} \tValidation Loss: {:.3f}'.format(
#             epoch, float(time.time() - start), current_lr, train_loss, valid_loss))
        
        print('Fold: {}, epoch: {}, time: {:.1f}s,lr: {:.7f} \tTraining Loss: {:.3f} \tValidation Loss: {:.3f}'.format(
        fold,epoch,float(time.time() - start), current_lr,
        train_loss, valid_loss))
        
        earlystopping(valid_loss,model)
        if earlystopping.early_stop:
            print('{} Fold,{} Epoch ,Early Stopping'.format(fold,epoch-earlystopping.patience))
            break

In [27]:
    model_list= []
    for fold,(train_index,val_index) in enumerate(KF.split(train_data,train_data['bins'])):
        train_subdata = Subset(train_dataset,train_index)
        train_loader = torch.utils.data.DataLoader(train_subdata, batch_size=global_batch_size,
                                                   shuffle=True, num_workers=workers,
                                                   pin_memory=True) 

        valid_subdata = Subset(valid_dataset,val_index)
        valid_loader = torch.utils.data.DataLoader(valid_subdata, batch_size=global_batch_size,
                                                   shuffle=True, num_workers=workers,
                                                   pin_memory=True) 
    
        model_path = model_name+str(fold)+'.pt'
        model = ViT()
        # the size of resnet50 final layer is 1x1x2048 so 
        # we must reinitialize model.fc to be a Linear layer with 2048 input features and 1 output features
        # re: https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html
        
        optimizer = torch.optim.Adam(params=model.parameters(),lr=1e-4)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones = [1, 2, 6], gamma=0.5)
        criterion = nn.MSELoss()   
        if train_on_gpu:
            model.cuda()
        training_loop(
                n_epochs=20,
                optimizer=optimizer,
                model=model,
                loss_fn=criterion,
                train_loader=train_loader,
                val_loader=valid_loader,
                model_path=model_path,
                fold=fold
            )

Fold: 0, epoch: 1, time: 208.9s,lr: 0.0001000 	Training Loss: 17.072 	Validation Loss: 2.609
Fold: 0, epoch: 2, time: 195.3s,lr: 0.0000500 	Training Loss: 2.613 	Validation Loss: 2.639
Fold: 0, epoch: 3, time: 193.0s,lr: 0.0000250 	Training Loss: 2.611 	Validation Loss: 2.599
Fold: 0, epoch: 4, time: 193.9s,lr: 0.0000250 	Training Loss: 2.599 	Validation Loss: 2.598
Fold: 0, epoch: 5, time: 194.1s,lr: 0.0000250 	Training Loss: 2.605 	Validation Loss: 2.613
Fold: 0, epoch: 6, time: 195.0s,lr: 0.0000250 	Training Loss: 2.596 	Validation Loss: 2.584
Fold: 0, epoch: 7, time: 193.4s,lr: 0.0000125 	Training Loss: 2.593 	Validation Loss: 2.588
Fold: 0, epoch: 8, time: 194.9s,lr: 0.0000125 	Training Loss: 2.601 	Validation Loss: 2.593
Fold: 0, epoch: 9, time: 193.7s,lr: 0.0000125 	Training Loss: 2.594 	Validation Loss: 2.599
Fold: 0, epoch: 10, time: 193.2s,lr: 0.0000125 	Training Loss: 2.586 	Validation Loss: 2.618
Fold: 0, epoch: 11, time: 192.8s,lr: 0.0000125 	Training Loss: 2.590 	Validati