In [1]:
import os
import tempfile
from typing import List, Tuple

import mlflow
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from dotenv import load_dotenv
from git import Diff, Repo
from pytorch_lightning.callbacks import (
    EarlyStopping,
    LearningRateMonitor,
    ModelCheckpoint,
)
from pytorch_lightning.loggers.mlflow import MLFlowLogger
from torch import nn
from torch.utils.data import DataLoader, random_split
from torchmetrics.classification import MulticlassAccuracy

from libs.models.resnet import ResNet

load_dotenv("./.env")

if os.environ.get("MLFLOW_TRACKING_TOKEN"):
    print("Token set!")

  from .autonotebook import tqdm as notebook_tqdm


Token set!


In [2]:
transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

transform_val = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

train_set = torchvision.datasets.CIFAR10(
    root="./data", train=True, download=True, transform=transform
)
train_size = int(len(train_set) * 0.8)
val_size = len(train_set) - train_size
train_set, val_set = torch.utils.data.random_split(
    train_set, [train_size, val_size], generator=torch.Generator().manual_seed(42)
)
test_set = torchvision.datasets.CIFAR10(
    root="./data", train=False, download=True, transform=transform_val
)

classes = (
    "plane",
    "car",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
)

Files already downloaded and verified
Files already downloaded and verified


In [3]:
class LitResNet(pl.LightningModule):
    def __init__(self, num_classes=10):
        super().__init__()
        self.__num_classes = num_classes
        self.resnet = ResNet.make_resnet101(num_classes=num_classes, init_kernel_size=3)
        self.softmax = nn.Softmax(1)
        self.__metric = MulticlassAccuracy(num_classes=num_classes)
    
    def forward(self, x) -> torch.Tensor:
        out = self.resnet(x)
        out = self.softmax(out)
        return out
    
    def __get_dirty_files(self, repo: Repo) -> List[str]:
        dirty_files = []
        dirty_files += repo.untracked_files
        dirty_files += [diff.b_path for diff in repo.index.diff("HEAD", create_patch=True, R=True)]
        dirty_files += [diff.b_path for diff in repo.index.diff(None)]
        return dirty_files
    
    def __get_file_content(self, file_path: str) -> str:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        return text
    
    def __log_dirty_files(self, repo: Repo):
        if not isinstance(self.logger, MLFlowLogger):
            return
        
        dirty_files = self.__get_dirty_files(repo)

        for dirty_file in dirty_files:
            self.logger._mlflow_client.log_artifact(self.logger.run_id, dirty_file, os.path.join("uncommit-files", dirty_file))
        
        comment = "# Changes\n"
        comment += "- \n"
        comment += "# Uncommit files\n"
        for dirty_file in dirty_files:
            comment += "- {}\n".format(dirty_file)
        if len(dirty_files) == 0:
            comment += "(empty)\n"
        comment += "# Git info\n"
        comment += "- commit hash: {}\n".format(repo.head.commit.hexsha)
        comment += "- branch: {}\n".format(repo.active_branch.name)
        comment += "- repository: {}\n".format(repo.remotes[0].url)
        
        self.logger._mlflow_client.set_tag(self.logger.run_id, "mlflow.note.content", comment)
    
    def on_train_start(self):
        if not isinstance(self.logger, MLFlowLogger):
            return
        run_id = self.logger.run_id
        self.logger.log_hyperparams(self.hparams)
        repo = Repo()
        self.logger._mlflow_client.set_tag(run_id, "mlflow.source.git.commit", repo.head.commit.hexsha)
        self.logger._mlflow_client.set_tag(run_id, "mlflow.source.git.branch", repo.active_branch.name)
        remotes = repo.remotes
        if len(remotes) >= 1:
            self.logger._mlflow_client.set_tag(run_id, "mlflow.source.git.repoURL", repo.remotes[0].url)
        
        self.__log_dirty_files(repo)

    def training_step(self, batch, batch_idx):
        loss, acc = self._common_step(batch, batch_idx)
        self.log_dict({"loss": loss, "acc": acc})
        return loss

    def validation_step(self, batch, batch_idx):
        loss, acc = self._common_step(batch, batch_idx)
        self.log_dict({"val_loss": loss, "val_acc": acc})

    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0) -> torch.Tensor:
        X_common, y_common = batch[0], batch[1]
        y_hat = self.forward(X_common)
        return y_hat.argmax(dim=1)
    
    def _common_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int):
        X_common, y_common = batch[0], batch[1]
        y_hat = self.forward(X_common)
        y_common_one_hot = F.one_hot(y_common, num_classes=self.__num_classes)
        loss = F.cross_entropy(y_hat.type(torch.FloatTensor), y_common_one_hot.type(torch.FloatTensor))
        y_hat_cat = torch.argmax(y_hat, dim=1)
        acc = self.__metric(y_hat_cat, y_common)
        return loss, acc

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=1e-1, momentum=0.9, weight_decay=1e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
        return [optimizer], [{"scheduler": scheduler, "interval": "epoch"}]

In [4]:
torch.set_float32_matmul_precision('medium')

train_loader = DataLoader(train_set, shuffle=True, batch_size=256, num_workers=8)
val_loader = DataLoader(val_set, shuffle=False, batch_size=256, num_workers=8)

In [5]:
mlf_logger = MLFlowLogger(experiment_name="cifar10-practice/resnet-101-32x32", tracking_uri=os.environ.get("MLFLOW_TRACKING_URI"), log_model=False)
model = LitResNet()
trainer = pl.Trainer(
    max_epochs=500,
    accelerator="gpu",
    logger=mlf_logger,
    callbacks=[
            ModelCheckpoint(
                dirpath=os.path.join(
                    "lightning_logs/", mlf_logger._experiment_name, mlf_logger.run_id
                ),
                save_top_k=2,
                monitor="val_loss",
                mode="min",
                filename="checkpoint-{epoch:02d}-{val_loss:.5f}",
            ),
            EarlyStopping("val_loss", patience=10),
            LearningRateMonitor('step', log_momentum=True)
        ],
    check_val_every_n_epoch=1,
)
trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)

Experiment with name cifar10-practice/resnet-101-32x32 not found. Creating it.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type               | Params
----------------------------------------------------------
0 | resnet             | ResNet             | 41.4 M
1 | softmax            | Softmax            | 0     
2 | _LitResNet__metric | MulticlassAccuracy | 0     
----------------------------------------------------------
41.4 M    Trainable params
0         Non-trainable params
41.4 M    Total params
165.414   Total estimated model params size (MB)


Epoch 203: 100%|██████████| 197/197 [00:10<00:00, 18.70it/s, loss=1.59, v_num=d4c7]


## Validate using testing set

### ResNet-18

In [6]:
test_loader = DataLoader(test_set, shuffle=False, batch_size=256, num_workers=8)
model = LitResNet.load_from_checkpoint("lightning_logs/cifar10-practice/resnet-18-32x32/68b3a17d3c174acc8f58e9502b83e00b/checkpoint-epoch=196-val_loss=1.61641.ckpt")
trainer = pl.Trainer(accelerator='gpu')
predictions = trainer.validate(model, test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  "Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning`"
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: 100%|██████████| 40/40 [00:00<00:00, 79.11it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc            0.8449007868766785
        val_loss            1.6157256364822388
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


### ResNet-101

In [6]:
test_loader = DataLoader(test_set, shuffle=False, batch_size=256, num_workers=8)
model = LitResNet.load_from_checkpoint("lightning_logs/cifar10-practice/resnet-101-32x32/1d34933a8b1a4defa7f17e98562fd4c7/checkpoint-epoch=193-val_loss=1.63769.ckpt")
trainer = pl.Trainer(accelerator='gpu')
predictions = trainer.validate(model, test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  "Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning`"
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: 100%|██████████| 40/40 [00:01<00:00, 39.77it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc            0.8277215957641602
        val_loss            1.6315324306488037
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
