In [1]:
import sys
import os
import random
import json
import gc
import cv2
import pandas as pd
import numpy as np

from tqdm import tqdm
from PIL import Image
from sklearn.metrics import accuracy_score
from functools import partial
from albumentations import (Compose, OneOf, Normalize, Resize, RandomResizedCrop, RandomCrop, CenterCrop, 
                            HorizontalFlip, VerticalFlip, Rotate, ShiftScaleRotate, Transpose)
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform
import tensorflow as tf
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset

import torchvision.transforms as transforms
from transformers import ViTForImageClassification,ViTConfig,AutoModel
from transformers import ViTModel

import math

In [11]:
# Competition input folder; image_path keeps its trailing slash because file
# names are appended to it by plain string concatenation further down.
path = "/kaggle/input/cassava-leaf-disease-classification/"
image_path = path+"test_images/"

IMAGE_SIZE = (512,512)
# Column labels are passed as a list: a set has no defined order (so the column
# order could differ between runs) and pandas >= 2.0 rejects a set outright.
submission_df = pd.DataFrame(columns=["image_id", "label"])
submission_df["image_id"] = os.listdir(image_path)
submission_df["label"] = 0

In [12]:
# Checkpoints taking part in the ensemble, keyed by model family. Each model
# cell below is guarded by `if "<key>" in used_models_pytorch`, so removing an
# entry here drops that model from the run (e.g. keep only "vit" to run ViT alone).
used_models_pytorch = {
    # one checkpoint path per listed fold
    "resnext": [
        f'../input/models/resnext50_32x4d_fold{fold}_best.pth'
        for fold in [1]
    ],
    "vit": '../input/model-vit/original_save_pretrained',
    "mobilenet": '../input/model-mobilenet/mn3_bt20_ep5_lr1.pth',
}

# ResNext50_32x4d

In [13]:
class CustomResNext(nn.Module):
    """``timm`` backbone whose ``fc`` layer is replaced by a new linear head.

    Args:
        model_name: Architecture name handed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
        num_classes: Output size of the replacement ``fc`` layer. Defaults to
            5, the value that used to be hard-coded.
    """

    def __init__(self, model_name='resnext50_32x4d', pretrained=False, num_classes=5):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        # Keep the input width of the stock head, change only its output size.
        n_features = self.model.fc.in_features
        self.model.fc = nn.Linear(n_features, num_classes)

    def forward(self, x):
        """Return the wrapped model's output for the batch ``x``."""
        return self.model(x)

class TestDataset(Dataset):
    """Dataset that reads images with OpenCV from the paths listed in a dataframe.

    Args:
        df: Frame with an ``image_path_id`` column holding one file path per row.
        transform: Optional callable invoked as ``transform(image=...)``; its
            ``'image'`` entry becomes the returned sample.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.file_names = df['image_path_id'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the RGB image at position ``idx``, transformed if requested.

        Raises:
            FileNotFoundError: If OpenCV cannot read the file.
        """
        file_name = self.file_names[idx]
        image = cv2.imread(file_name)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the path rather than later inside cvtColor.
            raise FileNotFoundError(f"cv2.imread could not read image: {file_name}")
        # OpenCV decodes to BGR; convert to RGB channel order.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
        return image

# The whole ResNeXt cell is skipped unless a "resnext" entry is configured.
if "resnext" in used_models_pytorch:
    # Use the GPU when CUDA is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def get_transforms():
        """Build the ResNeXt test-time pipeline: resize, normalise, convert to tensor."""
        channel_mean = [0.485, 0.456, 0.406]
        channel_std = [0.229, 0.224, 0.225]
        steps = [
            Resize(512, 512),
            Normalize(mean=channel_mean, std=channel_std),
            ToTensorV2(),
        ]
        return Compose(steps)

    def inference(model, states, test_loader, device):
        """Return class probabilities for every image, averaged over checkpoints.

        Args:
            model: Network whose weights are overwritten from each entry of ``states``.
            states: Sequence of checkpoints; each is indexed with ``'model'`` to
                obtain a state dict.
            test_loader: Iterable yielding batches of image tensors.
            device: Device the model and every batch are moved to.

        Returns:
            Numpy array with one row per image holding the softmax output
            averaged over ``states``. An empty loader yields an array with
            zero rows instead of an error.
        """
        model.to(device)
        # load_state_dict does not alter train/eval mode, so set it once up front.
        model.eval()

        loaded = None  # index of the checkpoint currently held by the model
        probabilities = []
        for images in test_loader:
            images = images.to(device)
            per_state = []
            for idx, state in enumerate(states):
                # Reload only when the weights actually change: with a single
                # checkpoint this loads once instead of once per batch.
                if idx != loaded:
                    model.load_state_dict(state['model'])
                    loaded = idx
                with torch.no_grad():
                    y_preds = model(images)
                per_state.append(y_preds.softmax(1).to('cpu').numpy())
            probabilities.append(np.mean(per_state, axis=0))

        if not probabilities:
            # np.concatenate raises on an empty list; report "no rows" instead.
            return np.empty((0, 0), dtype=np.float32)
        return np.concatenate(probabilities)
    

    # One row per test image. Column labels are given as a list because a set
    # has no defined order and pandas >= 2.0 refuses a set for ``columns``.
    predictions_resnext = pd.DataFrame(columns=["image_id"])
    predictions_resnext["image_id"] = submission_df["image_id"].values
    predictions_resnext['image_path_id'] = image_path + predictions_resnext['image_id'].astype(str)

    model = CustomResNext('resnext50_32x4d', pretrained=False)
    # map_location='cpu' keeps checkpoints saved from a GPU loadable on a
    # CPU-only machine; load_state_dict copies the weights to the model's device.
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    states = [torch.load(f, map_location='cpu') for f in used_models_pytorch["resnext"]]

    test_dataset = TestDataset(predictions_resnext, transform=get_transforms())
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)
    predictions = inference(model, states, test_loader, device)
    # Print a small sample only; the full array has one row per test image.
    print(predictions[:5])

    # Store one probability vector per row, then drop the helper path column.
    predictions_resnext['resnext'] = [np.squeeze(p) for p in predictions]
    predictions_resnext = predictions_resnext.drop(["image_path_id"], axis=1)

    # Release the references first: empty_cache() can only return memory that
    # is no longer referenced. Both names are bound above, so a plain `del`
    # replaces the former bare `except: pass`.
    del model, states
    gc.collect()
    torch.cuda.empty_cache()

  cpuset_checked))


[[0.01967716 0.02100126 0.1185554  0.04254021 0.798226  ]]


# ViT

In [14]:
# The whole ViT cell is skipped unless a "vit" entry is configured.
if "vit" in used_models_pytorch:
    
    # Side length of the square that transforms.Resize produces in this cell.
    IMG_SIZE = 224
    # Batch size read by predict() when it builds its DataLoader.
    BATCH_SIZE = 16
    # Passed as num_labels when the ViT checkpoint is loaded.
    num_classes = 5

    # Per-channel statistics handed to transforms.Normalize in this cell.
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
        
    class LeafDataset(torch.utils.data.Dataset):
        """Dataset yielding RGB PIL images whose file names come from a dataframe.

        Args:
            df: Frame whose first column holds the image file names.
            data_path: Root folder containing ``train_images`` / ``test_images``.
            mode: ``'train'`` reads from ``train_images``; any other value
                reads from ``test_images``.
            transforms: Optional callable applied to each loaded image.
        """

        def __init__(self, df, data_path, mode='train', transforms=None):
            super().__init__()
            self.df_data = df.values
            self.data_path = data_path
            self.transforms = transforms
            self.mode = mode
            self.data_dir = 'train_images' if mode == 'train' else 'test_images'

        def __len__(self):
            return len(self.df_data)

        def __getitem__(self, index):
            """Return the (optionally transformed) image at row ``index``."""
            img_name = self.df_data[index][0]
            img_path = os.path.join(self.data_path, self.data_dir, img_name)
            # Image.open leaves the file handle open; convert() returns an
            # independent copy, so the handle can be closed right away.
            with Image.open(img_path) as source:
                img = source.convert('RGB')

            if self.transforms is not None:
                img = self.transforms(img)

            return img

    # Test-time preprocessing for ViT: resize to IMG_SIZE x IMG_SIZE, convert
    # to a tensor, then normalise with the mean/std defined at the top of this cell.
    transforms_val = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    def predict(model, test_dataset):
        """Return one softmax probability vector per item of ``test_dataset``.

        Reads the cell-level names ``BATCH_SIZE`` and ``device`` when called.

        Args:
            model: Module whose output exposes a ``.logits`` tensor.
            test_dataset: Dataset of image tensors.

        Returns:
            List of 1-D numpy arrays, one per dataset item.
        """
        preds = []
        test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

        # Evaluation mode only has to be set once, not for every batch.
        model.eval()
        with torch.no_grad():
            for test_images in tqdm(test_dataloader):
                output = model(test_images.to(device))
                # Under no_grad no graph is recorded, so the legacy `.data`
                # accessor is not needed before converting to numpy.
                preds.extend(output.logits.softmax(1).cpu().numpy())

        return preds

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # One row per test image. Column labels are given as a list because a set
    # has no defined order and pandas >= 2.0 refuses a set for ``columns``.
    predictions_vit = pd.DataFrame(columns=["image_id"])
    predictions_vit["image_id"] = submission_df["image_id"].values

    model = ViTForImageClassification.from_pretrained(used_models_pytorch["vit"], num_labels = num_classes)
    model.to(device)

    # The frame holds only image_id at this point; LeafDataset reads file
    # names from the first column.
    test_dataset = LeafDataset(df=predictions_vit, data_path=path, mode='test', transforms=transforms_val)

    predictions_raw_vit = predict(model, test_dataset)
    # Print a small sample only; the full list has one entry per test image.
    print(predictions_raw_vit[:5])

    predictions_vit['vit'] = [np.squeeze(p) for p in predictions_raw_vit]

    # Release the reference first: empty_cache() can only return memory that
    # is no longer referenced. `model` is bound above, so a plain `del`
    # replaces the former bare `except: pass`.
    del model
    gc.collect()
    torch.cuda.empty_cache()

100%|██████████| 1/1 [00:00<00:00, 23.74it/s]


[array([1.00169986e-04, 6.11324678e-04, 9.78616893e-01, 2.71324930e-03,
       1.79583579e-02], dtype=float32)]


# Mobilenet V3 (CropNet)

In [15]:
# The whole MobileNet cell is skipped unless a "mobilenet" entry is configured.
if "mobilenet" in used_models_pytorch:   
    # Names of the two model constructors defined further down in this cell.
    __all__ = ['mobilenetv3_large', 'mobilenetv3_small']

    def _make_divisible(v, divisor, min_value=None):
        """Round ``v`` to the nearest multiple of ``divisor``, with a lower bound.

        Taken from the original TensorFlow repository, where it keeps every
        layer's channel count divisible by 8:
        https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py

        :param v: value to round
        :param divisor: the result is a multiple of this number
        :param min_value: smallest allowed result; defaults to ``divisor``
        :return: the rounded value, never more than 10% below ``v``
        """
        lower_bound = divisor if min_value is None else min_value
        nearest_multiple = (int(v + divisor / 2) // divisor) * divisor
        result = max(lower_bound, nearest_multiple)
        # Rounding down must not remove more than 10% of the requested value.
        if result < 0.9 * v:
            result += divisor
        return result

    class h_sigmoid(nn.Module):
        """Hard sigmoid activation: ``ReLU6(x + 3) / 6``."""

        def __init__(self, inplace=True):
            super().__init__()
            self.relu = nn.ReLU6(inplace=inplace)

        def forward(self, x):
            # `x + 3` is a fresh tensor, so the ReLU6 never writes into `x` itself.
            shifted = x + 3
            return self.relu(shifted) / 6

    class h_swish(nn.Module):
        """Hard swish activation: the input multiplied by its hard sigmoid."""

        def __init__(self, inplace=True):
            super().__init__()
            self.sigmoid = h_sigmoid(inplace=inplace)

        def forward(self, x):
            gate = self.sigmoid(x)
            return x * gate

    class SELayer(nn.Module):
        """Squeeze-and-excitation: scale each channel by a gate computed from its global average."""

        def __init__(self, channel, reduction=4):
            super().__init__()
            # Width of the bottleneck between the two linear layers.
            squeezed = _make_divisible(channel // reduction, 8)
            self.avg_pool = nn.AdaptiveAvgPool2d(1)
            gate_layers = [
                nn.Linear(channel, squeezed),
                nn.ReLU(inplace=True),
                nn.Linear(squeezed, channel),
                h_sigmoid(),
            ]
            self.fc = nn.Sequential(*gate_layers)

        def forward(self, x):
            batch, channels, _height, _width = x.size()
            # Squeeze: one average per channel.
            pooled = self.avg_pool(x).view(batch, channels)
            # Excite: per-channel gate, reshaped so it broadcasts over H and W.
            gate = self.fc(pooled).view(batch, channels, 1, 1)
            return x * gate

    def conv_3x3_bn(inp, oup, stride):
        """Return a 3x3 convolution (padding 1, no bias) followed by batch norm and h-swish."""
        conv = nn.Conv2d(inp, oup, 3, stride, 1, bias=False)
        norm = nn.BatchNorm2d(oup)
        activation = h_swish()
        return nn.Sequential(conv, norm, activation)

    def conv_1x1_bn(inp, oup):
        """Return a 1x1 convolution (stride 1, no bias) followed by batch norm and h-swish."""
        conv = nn.Conv2d(inp, oup, 1, 1, 0, bias=False)
        norm = nn.BatchNorm2d(oup)
        activation = h_swish()
        return nn.Sequential(conv, norm, activation)

    class InvertedResidual(nn.Module):
        """Bottleneck block: optional 1x1 expansion, depthwise conv, optional SE, 1x1 projection.

        Args:
            inp: Input channels.
            hidden_dim: Channels of the depthwise stage; when equal to ``inp``
                the 1x1 expansion is left out.
            oup: Output channels.
            kernel_size: Depthwise kernel size.
            stride: Depthwise stride, 1 or 2.
            use_se: Truthy to insert an ``SELayer``, otherwise ``nn.Identity``.
            use_hs: Truthy for ``h_swish`` activations, otherwise in-place ReLU.
        """

        def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se, use_hs):
            super().__init__()
            assert stride in [1, 2]

            # The skip connection needs input and output of identical shape.
            self.identity = stride == 1 and inp == oup

            def activation():
                return h_swish() if use_hs else nn.ReLU(inplace=True)

            def attention():
                return SELayer(hidden_dim) if use_se else nn.Identity()

            def depthwise():
                return nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride,
                                 (kernel_size - 1) // 2, groups=hidden_dim, bias=False)

            if inp == hidden_dim:
                # No expansion: depthwise -> BN -> activation -> SE.
                stages = [
                    depthwise(),
                    nn.BatchNorm2d(hidden_dim),
                    activation(),
                    attention(),
                ]
            else:
                # Pointwise expansion first; here SE sits before the second activation.
                stages = [
                    nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                    nn.BatchNorm2d(hidden_dim),
                    activation(),
                    depthwise(),
                    nn.BatchNorm2d(hidden_dim),
                    attention(),
                    activation(),
                ]
            # Both variants end with the linear pointwise projection and its BN.
            stages.append(nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False))
            stages.append(nn.BatchNorm2d(oup))
            self.conv = nn.Sequential(*stages)

        def forward(self, x):
            out = self.conv(x)
            return x + out if self.identity else out

    class MobileNetV3(nn.Module):
        """MobileNetV3 classifier assembled from a table of bottleneck settings.

        Args:
            cfgs: Rows of ``[k, t, c, SE, HS, s]``: depthwise kernel size,
                expansion factor, output channels, use-SE flag, use-h-swish
                flag and stride of one ``InvertedResidual`` block.
            mode: ``'large'`` or ``'small'``; selects the width of the hidden
                classifier layer (1280 or 1024).
            num_classes: Size of the final linear layer.
            width_mult: Multiplier applied to the channel counts.
        """

        def __init__(self, cfgs, mode, num_classes=1000, width_mult=1.):
            super().__init__()
            self.cfgs = cfgs
            assert mode in ['large', 'small']

            # Stem: strided 3x3 convolution.
            in_ch = _make_divisible(16 * width_mult, 8)
            stages = [conv_3x3_bn(3, in_ch, 2)]
            # One bottleneck block per configuration row.
            for kernel, expand, channels, use_se, use_hs, stride in self.cfgs:
                out_ch = _make_divisible(channels * width_mult, 8)
                exp_size = _make_divisible(in_ch * expand, 8)
                stages.append(InvertedResidual(in_ch, exp_size, out_ch, kernel, stride, use_se, use_hs))
                in_ch = out_ch
            self.features = nn.Sequential(*stages)

            # Head: 1x1 conv up to the last expansion width, global pool, MLP.
            self.conv = conv_1x1_bn(in_ch, exp_size)
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            head_width = {'large': 1280, 'small': 1024}[mode]
            if width_mult > 1.0:
                head_width = _make_divisible(head_width * width_mult, 8)
            self.classifier = nn.Sequential(
                nn.Linear(exp_size, head_width),
                h_swish(),
                nn.Dropout(0.2),
                nn.Linear(head_width, num_classes),
            )

            self._initialize_weights()

        def forward(self, x):
            pooled = self.avgpool(self.conv(self.features(x)))
            flat = pooled.view(pooled.size(0), -1)
            return self.classifier(flat)

        def _initialize_weights(self):
            """Re-initialise every conv, batch-norm and linear layer in place."""
            with torch.no_grad():
                for module in self.modules():
                    if isinstance(module, nn.Conv2d):
                        # Normal init with variance 2 / (kernel area * output channels).
                        fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                        module.weight.normal_(0, math.sqrt(2. / fan_out))
                        if module.bias is not None:
                            module.bias.zero_()
                    elif isinstance(module, nn.BatchNorm2d):
                        module.weight.fill_(1)
                        module.bias.zero_()
                    elif isinstance(module, nn.Linear):
                        module.weight.normal_(0, 0.01)
                        module.bias.zero_()

    def mobilenetv3_large(**kwargs):
        """Build a MobileNetV3-Large model; ``kwargs`` are forwarded to ``MobileNetV3``.

        Each row is ``[k, t, c, SE, HS, s]``: kernel size, expansion factor,
        output channels, use-SE flag, use-h-swish flag, stride.
        """
        large_rows = [
            # k,   t,   c, SE, HS, s
            [3,   1,  16, 0, 0, 1],
            [3,   4,  24, 0, 0, 2],
            [3,   3,  24, 0, 0, 1],
            [5,   3,  40, 1, 0, 2],
            [5,   3,  40, 1, 0, 1],
            [5,   3,  40, 1, 0, 1],
            [3,   6,  80, 0, 1, 2],
            [3, 2.5,  80, 0, 1, 1],
            [3, 2.3,  80, 0, 1, 1],
            [3, 2.3,  80, 0, 1, 1],
            [3,   6, 112, 1, 1, 1],
            [3,   6, 112, 1, 1, 1],
            [5,   6, 160, 1, 1, 2],
            [5,   6, 160, 1, 1, 1],
            [5,   6, 160, 1, 1, 1],
        ]
        network = MobileNetV3(large_rows, mode='large', **kwargs)
        return network

    def mobilenetv3_small(**kwargs):
        """Build a MobileNetV3-Small model; ``kwargs`` are forwarded to ``MobileNetV3``.

        Each row is ``[k, t, c, SE, HS, s]``: kernel size, expansion factor,
        output channels, use-SE flag, use-h-swish flag, stride.
        """
        small_rows = [
            # k,    t,   c, SE, HS, s
            [3,    1,  16, 1, 0, 2],
            [3,  4.5,  24, 0, 0, 2],
            [3, 3.67,  24, 0, 0, 1],
            [5,    4,  40, 1, 1, 2],
            [5,    6,  40, 1, 1, 1],
            [5,    6,  40, 1, 1, 1],
            [5,    3,  48, 1, 1, 1],
            [5,    3,  48, 1, 1, 1],
            [5,    6,  96, 1, 1, 2],
            [5,    6,  96, 1, 1, 1],
            [5,    6,  96, 1, 1, 1],
        ]
        network = MobileNetV3(small_rows, mode='small', **kwargs)
        return network

    class LeafDataset(torch.utils.data.Dataset):
        """Dataset yielding RGB PIL images whose file names come from a dataframe.

        Args:
            df: Frame whose first column holds the image file names.
            data_path: Root folder containing ``train_images`` / ``test_images``.
            mode: ``'train'`` reads from ``train_images``; any other value
                reads from ``test_images``.
            transforms: Optional callable applied to each loaded image.
        """

        def __init__(self, df, data_path, mode='train', transforms=None):
            super().__init__()
            self.df_data = df.values
            self.data_path = data_path
            self.transforms = transforms
            self.mode = mode
            self.data_dir = 'train_images' if mode == 'train' else 'test_images'

        def __len__(self):
            return len(self.df_data)

        def __getitem__(self, index):
            """Return the (optionally transformed) image at row ``index``."""
            img_name = self.df_data[index][0]
            img_path = os.path.join(self.data_path, self.data_dir, img_name)
            # Image.open leaves the file handle open; convert() returns an
            # independent copy, so the handle can be closed right away.
            with Image.open(img_path) as source:
                img = source.convert('RGB')

            if self.transforms is not None:
                img = self.transforms(img)

            return img

    def predict(model, test_dataset):
        """Return one 5-way softmax probability vector per item of ``test_dataset``.

        Reads the cell-level name ``device`` when called and uses batches of 20.

        Args:
            model: Module returning a 2-D tensor of scores per batch.
            test_dataset: Dataset of image tensors.

        Returns:
            List of 1-D numpy arrays of length 5, one per dataset item.
        """
        preds = []
        test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=20)

        # Evaluation mode only has to be set once, not for every batch.
        model.eval()
        with torch.no_grad():
            for test_images in tqdm(test_dataloader):
                output = model(test_images.to(device))
                # Only the first five output columns are turned into
                # probabilities; any further columns are ignored. Under no_grad
                # the legacy `.data` accessor is not needed.
                preds.extend(output[:, :5].softmax(1).cpu().numpy())

        return preds

    # Test-time preprocessing for MobileNet: tensor conversion and
    # normalisation with mean 0.5 / std 0.5 per channel (no resize step).
    transforms_val = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # One row per test image. Column labels are given as a list because a set
    # has no defined order and pandas >= 2.0 refuses a set for ``columns``.
    predictions_mobilenet = pd.DataFrame(columns=["image_id"])
    predictions_mobilenet["image_id"] = submission_df["image_id"].values

    model = mobilenetv3_large()
    model.to(device)
    # map_location keeps a checkpoint saved from a GPU loadable on a CPU-only
    # machine. NOTE(review): torch.load unpickles the file - only load trusted
    # checkpoints.
    model.load_state_dict(torch.load(used_models_pytorch["mobilenet"], map_location=device))

    # The frame holds only image_id at this point; LeafDataset reads file
    # names from the first column.
    test_dataset=LeafDataset(df=predictions_mobilenet, data_path=path, mode='test', transforms=transforms_val)

    predictions_raw_mobilenet = predict(model, test_dataset)
    # Print a small sample only; the full list has one entry per test image.
    print(predictions_raw_mobilenet[:5])

    predictions_mobilenet['mobilenet'] = [np.squeeze(p) for p in predictions_raw_mobilenet]

    # Release the reference first: empty_cache() can only return memory that
    # is no longer referenced. `model` is bound above, so a plain `del`
    # replaces the former bare `except: pass`.
    del model
    gc.collect()
    torch.cuda.empty_cache()

100%|██████████| 1/1 [00:00<00:00, 34.25it/s]


[array([0.03621518, 0.02839454, 0.22439097, 0.13725644, 0.57374287],
      dtype=float32)]


# Final Ensembling

In [16]:
# Start again from the key column with a zeroed label so that re-running this
# cell does not merge the prediction columns a second time (which would leave
# duplicated `*_x` / `*_y` columns behind).
submission_df = submission_df[["image_id"]].assign(label=0)

# Attach one probability column per configured model, matched on image_id.
if "resnext" in used_models_pytorch:
    submission_df = submission_df.merge(predictions_resnext, on="image_id")

if "mobilenet" in used_models_pytorch:
    submission_df = submission_df.merge(predictions_mobilenet, on="image_id")

if "vit" in used_models_pytorch:
    submission_df = submission_df.merge(predictions_vit, on="image_id")

In [17]:
def _ensemble_label(row):
    """Return the index of the class with the largest probability summed over all models."""
    per_model = [row[name] for name in list(used_models_pytorch.keys())]
    # zip(*...) regroups the vectors by class; each group is then summed.
    class_totals = [np.sum(scores) for scores in zip(*per_model)]
    return np.argmax(class_totals)


submission_df["label"] = submission_df.apply(_ensemble_label, axis=1)

In [18]:
# Sanity check: show the first row with the chosen label next to the per-model probability columns.
submission_df.head(1)

Unnamed: 0,label,image_id,resnext,mobilenet,vit
0,4,2216849948.jpg,"[0.019677162, 0.02100126, 0.1185554, 0.0425402...","[0.03621518, 0.028394544, 0.22439097, 0.137256...","[0.000100169986, 0.0006113247, 0.9786169, 0.00..."


In [19]:
# Write only the image id and predicted label (no index column), then show
# the first lines of the file that was written.
submission_df[["image_id","label"]].to_csv("submission.csv", index=False)
!head submission.csv

image_id,label
2216849948.jpg,4
