# Image classification baseline
This notebook contains the code for our final multi-modal model training.

## 1 - Setup

In [None]:
import pandas as pd

import torch
from torch import from_numpy
from torch.nn import MSELoss, Linear, ReLU, Dropout, Sequential
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import Dataset, DataLoader, random_split

import numpy as np

from sklearn.preprocessing import StandardScaler

import imageio.v3 as imageio

from lightning import LightningModule, Trainer, seed_everything
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
seed_everything(42)

from torcheval.metrics import R2Score

from timm import create_model

import wandb

import albumentations as A
from albumentations.pytorch import ToTensorV2

import gc

In [None]:
# Root directory of the data; the training images live directly beneath it.
BASE_PATH = "../data"
IMAGES_PATH = f"{BASE_PATH}/train_images"

## 2 - Custom Dataset
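We refined the custom dataset from the image model so that it also incorporates the tabular data. On creation, the dataset min-max scales the tabular features and log10-transforms and standardizes the targets. When indexed, it returns the transformed image, the tabular feature vector and the target vector.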

In [None]:
class MultiModalDataset(Dataset):
    """Dataset pairing each image with its tabular features and targets.

    On construction the CSV at ``dataset_path`` is read, every image file is
    loaded into memory as raw (still encoded) bytes, the target columns are
    log10-transformed and standardised, and the remaining feature columns are
    min-max scaled.

    Args:
        root: Directory containing the ``<id>.jpeg`` image files.
        dataset_path: Path of the CSV file. It must contain an ``id`` column
            and the target columns, either plain (``X4``, ...) or with a
            ``_mean`` suffix (``X4_mean``, ...).
        transforms: Callable invoked as ``transforms(image=array)`` returning
            a mapping with an ``"image"`` entry.

    Attributes:
        image_data: pandas Series holding the raw bytes of every image file.
        target_columns: Names of the target columns found in the CSV.
        scaler: ``StandardScaler`` fitted on the log10-transformed targets.
        y: Standardised targets, one row per sample.
        tab_data: DataFrame of min-max scaled tabular features.
    """

    def __init__(self, root, dataset_path, transforms):
        super().__init__()
        self.transforms = transforms
        dataset = pd.read_csv(dataset_path)
        dataset["file_path"] = root + "/" + dataset["id"].astype(str) + ".jpeg"
        # Keep the encoded bytes in memory; decoding happens in __getitem__.
        self.image_data = dataset["file_path"].apply(self._read_bytes)

        self.target_columns = ["X4", "X11", "X18", "X26", "X50", "X3112"]
        if self.target_columns[0] not in dataset.columns:
            self.target_columns = [f"{col}_mean" for col in self.target_columns]

        # Every target is log10-transformed before standardisation.
        # NOTE(review): log10 yields -inf/NaN for non-positive values; this
        # assumes all targets are strictly positive - confirm against the CSV.
        raw_targets = dataset[self.target_columns].to_numpy(dtype=np.float64)
        y = np.log10(raw_targets).astype(np.float32)

        self.scaler = StandardScaler()
        self.y = self.scaler.fit_transform(y)

        sd_columns = [col for col in dataset if col.endswith("_sd")]
        dropped = ["id", "file_path"] + self.target_columns + sd_columns
        self.tab_data = self._min_max_scale(dataset.drop(columns=dropped))

    @staticmethod
    def _read_bytes(file_path):
        """Return the full content of ``file_path`` and close the file."""
        with open(file_path, "rb") as handle:
            return handle.read()

    @staticmethod
    def _min_max_scale(frame):
        """Scale every column of ``frame`` to ``[0, 1]``.

        Constant columns have a zero range; they are mapped to 0 instead of
        being divided by zero (which would fill the column with NaN).
        """
        lowest = frame.min()
        span = frame.max() - lowest
        span = span.where(span != 0, 1)
        return (frame - lowest) / span

    def __len__(self):
        """Return the number of samples (rows of the tabular data)."""
        return len(self.tab_data)

    def __getitem__(self, index):
        """Return ``(image, tab, targets)`` for the sample at ``index``.

        The image bytes are decoded and passed through ``self.transforms``;
        ``tab`` is a float32 tensor of the scaled tabular features and
        ``targets`` is a tensor view of the matching row of ``self.y``.
        """
        image = self.transforms(image=imageio.imread(self.image_data[index]))["image"]
        tab = torch.tensor(self.tab_data.iloc[index].values, dtype=torch.float32)
        targets = from_numpy(self.y[index])

        return image, tab, targets


## 3 - Model
Our multi-modal model loads the pretrained Swin Transformer, reads its output dimension, and then creates both the tabular data network and the regressor network.

In [None]:
class MultiModel(LightningModule):
    """Regression model that fuses image features with tabular features.

    A pretrained timm backbone embeds the image, a small MLP projects the
    tabular features to the same width, and the concatenation of both is fed
    to a regressor that predicts 6 targets. Training minimises the MSE and
    additionally reports the R2 score.

    Args:
        model_name: timm model name passed to ``create_model``.
        image_dropout: Dropout probability applied to the backbone output.
        image_learning_rate_modifier: Factor applied to ``learning_rate`` for
            the backbone parameters.
        image_decay: Weight decay of the backbone parameters.
        tab_features: Number of tabular input features.
        tab_hidden_dim: Hidden width of the tabular network.
        tab_activation: Activation class (called without arguments) for the
            tabular network, or ``None`` for no activation.
        tab_dropout: Dropout probability inside the tabular network.
        tab_decay: Weight decay of the tabular network parameters.
        regressor_hidden_dim: Stored as a hyperparameter but not used when
            building layers; the regressor keeps the concatenated width.
        regressor_activation: Activation class for the regressor, or ``None``.
        regressor_dropout: Dropout probability inside the regressor.
        regressor_n_layers: Number of hidden blocks before the output layer.
        regressor_decay: Weight decay of the regressor parameters.
        learning_rate: Peak learning rate of the one-cycle schedule.
        n_steps: Total number of scheduler steps (``OneCycleLR.total_steps``).
    """

    def __init__(
        self,
        model_name,
        image_dropout,
        image_learning_rate_modifier,
        image_decay,
        tab_features,
        tab_hidden_dim,
        tab_activation,
        tab_dropout,
        tab_decay,
        regressor_hidden_dim,
        regressor_activation,
        regressor_dropout,
        regressor_n_layers,
        regressor_decay,
        learning_rate,
        n_steps,
    ):
        super().__init__()
        self.save_hyperparameters()

        # num_classes=0 removes the classification head of the backbone.
        self.image_model = Sequential(
            create_model(model_name, num_classes=0, pretrained=True),
            Dropout(image_dropout),
        )

        # Width of the backbone output, read from its final norm layer.
        image_model_output_shape = self.image_model[0].norm.normalized_shape[0]

        self.tab_model = Sequential(
            Linear(tab_features, tab_hidden_dim),
            self._make_activation(tab_activation),
            Dropout(tab_dropout),
            Linear(tab_hidden_dim, image_model_output_shape),
        )

        # Image and tabular embeddings have the same width and are concatenated.
        concat_size = image_model_output_shape * 2

        self.regressor = Sequential()
        for _ in range(regressor_n_layers):
            self.regressor.append(Linear(concat_size, concat_size))
            self.regressor.append(self._make_activation(regressor_activation))
            self.regressor.append(Dropout(regressor_dropout))
        self.regressor.append(Linear(concat_size, 6))

        self.loss = MSELoss()

    @staticmethod
    def _make_activation(activation):
        """Instantiate ``activation``; ``None`` becomes a pass-through layer."""
        return torch.nn.Identity() if activation is None else activation()

    def forward(self, batch):
        """Run the model on ``(image, tab, y)`` and return ``(loss, r2)``."""
        image, tab, y = batch
        image_output = self.image_model(image)

        tab_output = self.tab_model(tab)

        combined = torch.cat([image_output, tab_output], dim=1)

        output = self.regressor(combined)
        loss = self.loss(output, y)

        # A fresh metric per call: the score covers this batch only.
        metric = R2Score()
        metric.update(output, y)

        return loss, metric.compute()

    def training_step(self, batch):
        """Log the training loss and R2 and return the loss to optimise."""
        loss, r2 = self(batch)
        self.log("Train loss", loss, prog_bar=True)
        self.log("Train R2", r2, prog_bar=True)

        # The scheduler is stepped by Lightning (interval="step" in
        # configure_optimizers), so it must not be stepped manually here.
        return loss

    def validation_step(self, batch):
        """Log the validation loss and R2 for one batch."""
        loss, r2 = self(batch)
        self.log("Validation loss", loss)
        self.log("Validation R2", r2)

    def configure_optimizers(self):
        """Build AdamW with one parameter group per sub-network.

        All three sub-networks are optimised. The backbone uses
        ``learning_rate * image_learning_rate_modifier``; each group has its
        own weight decay. A one-cycle schedule is stepped after every
        optimizer step.
        """
        hparams = self.hparams
        image_lr = hparams.learning_rate * hparams.image_learning_rate_modifier
        group_lrs = [image_lr, hparams.learning_rate, hparams.learning_rate]

        optimizer = AdamW(
            [
                {
                    "params": self.image_model.parameters(),
                    "lr": group_lrs[0],
                    "weight_decay": hparams.image_decay,
                },
                {
                    "params": self.tab_model.parameters(),
                    "lr": group_lrs[1],
                    "weight_decay": hparams.tab_decay,
                },
                {
                    "params": self.regressor.parameters(),
                    "lr": group_lrs[2],
                    "weight_decay": hparams.regressor_decay,
                },
            ]
        )

        scheduler = OneCycleLR(
            optimizer=optimizer,
            max_lr=group_lrs,  # one peak learning rate per parameter group
            total_steps=hparams.n_steps,
            pct_start=0.1,
            anneal_strategy='cos',
            div_factor=1e1,
            final_div_factor=1e1,
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
        }

## 4 - Training
### 4.1 - Constants

In [None]:
# Per-channel normalisation statistics handed to A.Normalize in the training cell.
MEAN = [
    0.485,
    0.456,
    0.406,
]
STD = [
    0.229,
    0.224,
    0.225,
]

### 4.2 - Training Loop

In [None]:
# Log in to Weights & Biases before the training run is created.
wandb.login()

In [None]:
# One complete training run: build the data pipeline and the model, fit with
# Lightning, log everything to Weights & Biases, then free the GPU memory.
with wandb.init(config={
    "model_name": "swin_large_patch4_window12_384.ms_in22k_ft_in1k",
    "image_dropout": 0,
    "image_learning_rate_modifier": 1/10,
    "image_decay": 0.01,
    "tab_hidden_dim": 512,
    "tab_activation": "ReLU",
    "tab_dropout": 0,
    "tab_decay": 0,
    "regressor_hidden_dim": 512,
    "regressor_activation": "ReLU",
    "regressor_dropout": 0,
    "regressor_decay": 0,
    "regressor_n_layers": 1,
    "learning_rate": 1e-4,
}, project="aicomp") as run:
    wandb.define_metric("Validation R2", summary="max")
    config = wandb.config
    image_size = 384

    transforms = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomSizedCrop(
            [448, 512],
            image_size, image_size, w2h_ratio=1.0, p=0.75),
        A.Resize(image_size, image_size),
        A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.25),
        A.ImageCompression(quality_lower=85, quality_upper=100, p=0.25),
        A.ToFloat(),
        A.Normalize(mean=MEAN, std=STD, max_pixel_value=1),
        ToTensorV2(),
    ])

    # The CSV path needs its own separator: BASE_PATH has no trailing slash.
    dataset = MultiModalDataset(
        IMAGES_PATH,
        BASE_PATH + "/cleaned/cleaned_train.csv",
        transforms=transforms,
    )
    # NOTE: both splits share one dataset object, so validation samples pass
    # through the same random augmentations as training samples.
    train, val = random_split(dataset, [0.9, 0.1])

    max_epochs = 10
    batch_size = 10
    # Upper bound for the scheduler steps, computed from the full dataset
    # length (the training split is smaller, so it is never exceeded).
    n_steps = ((len(dataset) // batch_size) * max_epochs) + 1

    # Resolve the configured activation names to torch.nn classes; an empty
    # name yields None.
    tab_activation = (
        getattr(torch.nn, config["tab_activation"])
        if config["tab_activation"] else None
    )
    regressor_activation = (
        getattr(torch.nn, config["regressor_activation"])
        if config["regressor_activation"] else None
    )

    model = MultiModel(
        model_name=config["model_name"],
        image_dropout=config["image_dropout"],
        image_learning_rate_modifier=config["image_learning_rate_modifier"],
        image_decay=config["image_decay"],
        tab_features=dataset.tab_data.shape[1],
        tab_hidden_dim=config["tab_hidden_dim"],
        tab_activation=tab_activation,
        tab_dropout=config["tab_dropout"],
        tab_decay=config["tab_decay"],
        regressor_hidden_dim=config["regressor_hidden_dim"],
        regressor_activation=regressor_activation,
        regressor_dropout=config["regressor_dropout"],
        regressor_decay=config["regressor_decay"],
        regressor_n_layers=config["regressor_n_layers"],
        learning_rate=config["learning_rate"],
        n_steps=n_steps
    )

    logger = WandbLogger(project="aicomp", log_model="all")

    # Keep the checkpoint with the best validation R2 and stop early when a
    # monitored metric stalls or the training metrics reach their threshold.
    callbacks = [
        ModelCheckpoint(monitor="Validation R2", mode="max"),
        EarlyStopping(monitor="Validation loss", mode="min", patience=5),
        EarlyStopping(monitor="Validation R2", mode="max", patience=5),
        EarlyStopping(monitor="Train loss", mode="min", patience=5, stopping_threshold=0.01, check_on_train_epoch_end=True),
        EarlyStopping(monitor="Train R2", mode="max", patience=5, stopping_threshold=0.99, check_on_train_epoch_end=True),
        LearningRateMonitor(logging_interval="step"),
    ]

    trainer = Trainer(max_epochs=max_epochs, logger=logger, num_sanity_val_steps=0, callbacks=callbacks)

    train_dataloader = DataLoader(train, batch_size=batch_size, shuffle=True, num_workers=batch_size)
    val_dataloader = DataLoader(val, batch_size=batch_size, num_workers=batch_size)

    trainer.fit(model, train_dataloader, val_dataloader)

    run.finish()

    # Drop the references and release cached GPU memory.
    del model
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Tear down wandb once the training run above has finished.
wandb.teardown()