In [None]:
# Record the GPU model, driver and CUDA version of the machine running this benchmark.
! nvidia-smi

Fri Dec 10 00:10:23 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P8    12W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Record the host CPU details of the machine running this benchmark.
! cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2199.998
cache size	: 56320 KB
physical id	: 0
siblings	: 4
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa
bogomips	: 4399.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 b

In [None]:
# Install/upgrade every library the benchmark uses (quietly, via -qq).
# NOTE(review): no versions are pinned, so re-running this cell later may pull different
# releases than the ones that produced the recorded outputs — consider pinning.
! pip install fastcore --upgrade -qq
! pip install fastai --upgrade -qq
! pip install transformers --upgrade -qq
! pip install datasets --upgrade -qq
! pip install pytorch_lightning --upgrade -qq
! pip install wandb --upgrade -qq
! pip install ohmeow-blurr --upgrade -qq
! pip install timm --upgrade -qq
# fastai_snippets is installed straight from GitHub; it provides the profiler imported below.
! pip install git+https://github.com/warner-benjamin/fastai_snippets.git -qq

In [None]:
# Display the installed PyTorch build so the timings below can be tied to a specific version.
import torch
torch.__version__

'1.10.0+cu111'

In [None]:
# Set WANDB_SILENT before wandb is imported.
# NOTE(review): presumably this suppresses wandb's console output during the runs — confirm.
%env WANDB_SILENT=true

env: WANDB_SILENT=true


In [None]:
# Authenticate with Weights & Biases; the training helpers below log each run there.
import wandb
wandb.login()

True

In [None]:
import os
# Disable parallelism in the Hugging Face tokenizers library via its environment variable.
# NOTE(review): presumably to avoid fork warnings/deadlocks once DataLoader worker
# processes are used — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Imports

In [None]:
import gc

# import fastai
from fastai.vision.all import *
from fastai.text.all import *
from fastai.callback.wandb import WandbCallback
from fastai_snippets.callback import simpleprofiler
from fastai_snippets.utils import simpleprofiler_wandb

# import blurr/huggingface
from blurr.data.all import *
from blurr.modeling.all import *
from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

#import pytorch lightning
import timm
import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from torch.optim.lr_scheduler import OneCycleLR
from torch.optim import AdamW
import torchvision.transforms as tvt
from torchvision.datasets import ImageFolder
from pytorch_lightning.loggers import WandbLogger
import torchmetrics

# Setup

In [None]:
# Benchmark configuration shared by the training helpers below.
plat = 'Colab Pro High RAM'  # platform label, embedded in every W&B run name
GPU = 'T4'                   # GPU label, embedded in every W&B run name
img_bs = 64                  # batch size for the Imagenette (vision) runs
nlp_bs = 16                  # batch size for the fp16 IMDB runs; other precisions use half of it

In [None]:
# Fetch both Imagenette variants (320px and full size) up front.
# NOTE(review): `source` is overwritten by the second call and is not read by any later
# cell visible here (the helpers call `untar_data` themselves); presumably these calls only
# exist to warm the download cache so it does not count toward training time — confirm.
source = untar_data(URLs.IMAGENETTE_320)
source = untar_data(URLs.IMAGENETTE)

## Modify SimpleProfiler to Log Results to wandb

In [None]:
from pytorch_lightning.profiler.simple import SimpleProfiler

@patch
def summary(self:SimpleProfiler):
    """Log the profiler report to Weights & Biases as a table.

    Replaces `SimpleProfiler.summary` through the `@patch` decorator. When durations have been
    recorded, the table gets one "Total" row followed by one row per action from
    `self._make_report()`, holding the mean, standard deviation, call count, summed time and
    percentage for that action. The table is logged under the key "simple_profiler" even when
    nothing was recorded (it is then empty).

    Returns:
        None. Nothing is returned for the profiler to print.
    """
    output_table = wandb.Table(columns=["Action", "Mean duration (s)", "Duration StDev (s)", "Num calls", "Total time (s)", "Percentage %"])
    if len(self.recorded_durations) > 0:
        report, total_duration = self._make_report()
        # The total row has no per-call statistics, so those columns hold placeholders.
        output_table.add_data("Total", "-", "-", "_", f"{total_duration:.5}", "100 %")
        for action, durations, duration_per in report:
            output_table.add_data(
                action,
                f"{np.mean(durations):.5}",
                f"{np.std(durations):.5}",
                f"{len(durations):}",
                f"{np.sum(durations):.5}",
                f"{duration_per:.5}",
            )

    wandb.log({"simple_profiler": output_table})

## PyTorch Lightning Imagenette

In [None]:
class ImagenetteDataModule(LightningDataModule):
    """LightningDataModule serving Imagenette or Imagewoof through `ImageFolder` datasets.

    Args:
        size: Crop size given to `RandomResizedCrop` (train) and `CenterCrop` (valid). Sizes up
            to 224 use the 320px dataset variant; larger sizes use the full-size variant.
        woof: If truthy, use Imagewoof and its normalization stats; otherwise Imagenette.
        bs: Batch size for both the training and validation dataloaders.
        train_transform: Optional sequence of transforms that replaces the default training
            pipeline; it is wrapped in `tvt.Compose`.
        valid_transform: Optional sequence of transforms that replaces the default validation
            pipeline; it is wrapped in `tvt.Compose`.
    """
    def __init__(self, size, woof, bs, train_transform=None, valid_transform=None):
        super().__init__()
        self.size, self.woof, self.bs = size, woof, bs
        imagewoof_stats  = ([0.496,0.461,0.399],[0.257,0.249,0.258])
        imagenette_stats = ([0.465,0.458,0.429],[0.285,0.28,0.301])
        # (mean, std) pair unpacked into `tvt.Normalize` for the selected dataset.
        stats = imagewoof_stats if woof else imagenette_stats

        if train_transform is None:
            self.train_transform = tvt.Compose([
                tvt.RandomResizedCrop(size, scale=(0.35, 1)),
                tvt.RandomHorizontalFlip(),
                tvt.ToTensor(),
                tvt.Normalize(*stats)])
        else:
            # torchvision.transforms is imported as `tvt` in this notebook; the bare name
            # `transforms` is not imported explicitly, so `tvt.Compose` is used here.
            self.train_transform = tvt.Compose(train_transform)

        if valid_transform is None:
            self.valid_transform = tvt.Compose([
                tvt.CenterCrop(size),
                tvt.ToTensor(),
                tvt.Normalize(*stats)])
        else:
            self.valid_transform = tvt.Compose(valid_transform)

    def prepare_data(self):
        """Download/extract the dataset variant matching `size` and `woof`; store its path in `self.source`."""
        if self.size<=224: path = URLs.IMAGEWOOF_320 if self.woof else URLs.IMAGENETTE_320
        else             : path = URLs.IMAGEWOOF     if self.woof else URLs.IMAGENETTE
        self.source = untar_data(path)

    def setup(self, stage=None):
        """Create the datasets: the training set only for the "fit" stage (or no stage), the validation set always."""
        if stage == "fit" or stage is None:
            self.train = ImageFolder(self.source/'train', self.train_transform)
        self.val = ImageFolder(self.source/'val', self.valid_transform)

    # NOTE(review): `DataLoader` is not imported explicitly in this notebook; it presumably
    # resolves to the class exported by the fastai star imports rather than
    # torch.utils.data.DataLoader — confirm that this is intended.
    def train_dataloader(self):
        """Return the shuffled training dataloader, using at most 8 worker processes."""
        return DataLoader(self.train, batch_size=self.bs, shuffle=True, pin_memory=True, num_workers=min(8, num_cpus()))

    def val_dataloader(self):
        """Return the validation dataloader, using at most 8 worker processes."""
        return DataLoader(self.val, batch_size=self.bs, pin_memory=True, num_workers=min(8, num_cpus()))

    def teardown(self, stage=None):
        """Drop the dataset references so they can be garbage collected."""
        self.train = None
        self.val = None

In [None]:
class ResNet(LightningModule):
    """LightningModule wrapping an image classifier trained with AdamW and a one-cycle LR schedule.

    Args:
        model: Zero-argument callable that builds the network (e.g. a `partial` of a constructor).
        lr: Peak learning rate; used as the AdamW `lr` and as the OneCycleLR `max_lr`.
        mom: Stored in `self.hparams` but not applied to the optimizer (see
            `configure_optimizers`).
        wd: Weight decay passed to AdamW.
    """
    def __init__(self, model, lr=3e-3, mom=0.9, wd=1e-2):
        super().__init__()

        # Keep lr/mom/wd in self.hparams; the model factory itself is excluded.
        self.save_hyperparameters(ignore='model')
        self.model = model()
        self.loss_fn = LabelSmoothingCrossEntropy()
        self.accuracy = torchmetrics.Accuracy()

    def forward(self, x):
        """Run the wrapped model on a batch of inputs."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        self.log("train_loss", loss)
        return loss

    def evaluate(self, batch, stage=None):
        """Compute loss and accuracy for one batch; log them as `{stage}_loss`/`{stage}_acc` when `stage` is set."""
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        # Softmax is monotonic, so the argmax of the raw model outputs already gives the
        # predicted class; the extra softmax pass is unnecessary.
        preds = logits.argmax(dim=-1)
        self.accuracy(preds, y)

        if stage:
            self.log(f"{stage}_loss", loss, prog_bar=True, on_epoch=True)
            self.log(f"{stage}_acc", self.accuracy, prog_bar=True, on_epoch=True)

    def validation_step(self, batch, batch_idx):
        self.evaluate(batch, "val")

    def test_step(self, batch, batch_idx):
        self.evaluate(batch, "test")

    def configure_optimizers(self):
        """Build AdamW plus a per-step OneCycleLR schedule spanning `num_training_steps`."""
        # `wd` is now forwarded; the default (1e-2) equals AdamW's own default, so default
        # behavior is unchanged.
        # NOTE(review): `mom` is intentionally not wired to `betas` — OneCycleLR cycles the
        # optimizer momentum (beta1 for Adam-style optimizers) itself when `cycle_momentum`
        # is left at its default, so a value set here would be overwritten. Confirm whether
        # `mom` should instead drive OneCycleLR's base/max momentum.
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.wd, eps=1e-5)

        scheduler_dict = {
            "scheduler": OneCycleLR(
                optimizer=optimizer,
                max_lr =self.hparams.lr,
                total_steps=self.num_training_steps
            ),
            "interval": "step",
            "frequency": 1
        }
        return {"optimizer": optimizer, "lr_scheduler": scheduler_dict}

    # lightly modified from rohitgr7 https://github.com/PyTorchLightning/pytorch-lightning/issues/10760
    @property
    def num_training_steps(self) -> int:
        """Total training steps inferred from datamodule and devices."""
        if self.trainer.num_training_batches != float('inf'):
            dataset_size = self.trainer.num_training_batches
        else:
            # The trainer has not counted the batches yet, so build the dataloader to measure it.
            print('Requesting dataloader...')
            dataset_size = len(self.trainer._data_connector._train_dataloader_source.dataloader())

        # An int limit is an absolute batch count; otherwise it is treated as a fraction.
        if isinstance(self.trainer.limit_train_batches, int):
            dataset_size = min(dataset_size, self.trainer.limit_train_batches)
        else:
            dataset_size = int(dataset_size * self.trainer.limit_train_batches)

        accelerator_connector = self.trainer._accelerator_connector
        if accelerator_connector.use_ddp2 or accelerator_connector.use_dp:
            effective_devices = 1
        else:
            effective_devices = self.trainer.devices

        effective_devices = effective_devices * self.trainer.num_nodes
        effective_batch_size = self.trainer.accumulate_grad_batches * effective_devices
        # True division before ceil: `math.ceil(a // b)` floored first, so a trailing partial
        # accumulation window was not counted and the schedule could come up short.
        steps_per_epoch = math.ceil(dataset_size / effective_batch_size)
        max_estimated_steps = steps_per_epoch * self.trainer.max_epochs

        # max_steps == -1 means "no step limit"; otherwise it caps the estimate.
        max_estimated_steps = min(max_estimated_steps, self.trainer.max_steps) if self.trainer.max_steps != -1 else max_estimated_steps
        return max_estimated_steps

In [None]:
def train_pl(model, epochs, name, size):
    """Train `model` on Imagenette with PyTorch Lightning in fp16 and log the run to W&B.

    Args:
        model: Zero-argument callable that builds the network; passed to `ResNet`.
        epochs: Number of epochs (`max_epochs`) to train.
        name: Model label used as the first part of the W&B run name.
        size: Image size passed to `ImagenetteDataModule`.

    The W&B run is finished and GPU memory is released even when training raises, so a failed
    benchmark (for example an out-of-memory error) does not leak into the next one.
    """
    resnet = ResNet(model)
    imagenette = ImagenetteDataModule(size, False, img_bs)

    wandb_logger = WandbLogger(project="sagecolab", name=f'{name} {plat} {GPU} fp16', log_model=False)
    trainer = Trainer(gpus=1, precision=16, max_epochs=epochs, num_sanity_val_steps=0,
                      benchmark=True, profiler="simple", logger=wandb_logger, enable_checkpointing=False)
    try:
        trainer.fit(resnet, imagenette)
        wandb.log({}) # ensure sync of last step
    finally:
        wandb.finish()

        # Drop the references before collecting so the CUDA cache can actually be emptied.
        trainer, resnet, imagenette= None, None, None
        gc.collect()
        torch.cuda.empty_cache()

## Fastai Imagenette

In [None]:
# Per-channel (mean, std) pairs used to normalize each dataset.
imagewoof_stats = ([0.496,0.461,0.399],[0.257,0.249,0.258])
imagenette_stats = ([0.465,0.458,0.429],[0.285,0.28,0.301])

def get_imagenette_dls(size, woof, bs, sh=0., augs=None, workers=None, stats=True):
    """Create fastai DataLoaders for Imagenette (or Imagewoof when `woof` is truthy).

    Args:
        size: Target size for `RandomResizedCrop`; sizes up to 224 load the 320px dataset
            variant, larger sizes load the full-size variant.
        woof: Select Imagewoof instead of Imagenette.
        bs: Batch size.
        sh: When truthy, append `RandomErasing(p=0.3, max_count=4, sh=sh)` to the batch transforms.
        augs: Optional extra batch transforms, added after normalization.
        workers: Number of dataloader workers; defaults to `min(8, num_cpus())`.
        stats: Whether to normalize with the dataset's stats.

    Returns:
        The DataLoaders built from a `DataBlock` split on the `train`/`val` folders.
    """
    use_small = size <= 224
    if woof:
        url = URLs.IMAGEWOOF_320 if use_small else URLs.IMAGEWOOF
    else:
        url = URLs.IMAGENETTE_320 if use_small else URLs.IMAGENETTE
    source = untar_data(url)

    if workers is None:
        workers = min(8, num_cpus())

    # Assemble batch transforms in order: normalization, caller augmentations, random erasing.
    batch_tfms = []
    if stats:
        chosen_stats = imagewoof_stats if woof else imagenette_stats
        batch_tfms.append(Normalize.from_stats(*chosen_stats))
    if augs:
        batch_tfms += augs
    if sh:
        batch_tfms.append(RandomErasing(p=0.3, max_count=4, sh=sh))

    item_tfms = [RandomResizedCrop(size, min_scale=0.35), FlipItem(0.5)]
    dblock = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        splitter=GrandparentSplitter(valid_name='val'),
        get_items=get_image_files,
        get_y=parent_label,
        item_tfms=item_tfms,
        batch_tfms=batch_tfms,
    )
    return dblock.dataloaders(source, path=source, bs=bs, num_workers=workers)

In [None]:
def train_imagenette_fastai(model, epochs, name, size, precision=[16, 32], augs=None):
    """Run one fastai Imagenette training per entry in `precision`.

    Args:
        model: Zero-argument callable that builds a fresh network for each run.
        epochs: Number of epochs per run.
        name: Model label passed to `train_fastai` for the W&B run name.
        size: Image size passed to `get_imagenette_dls`.
        precision: Iterable of precisions to benchmark, in order.
        augs: Optional extra batch transforms passed to `get_imagenette_dls`.
    """
    for bits in precision:
        # Fresh dataloaders and a fresh model for every precision.
        loaders = get_imagenette_dls(size, False, img_bs, augs=augs)
        network = model()
        train_fastai(loaders, network, epochs, name, precision=bits)

## Fastai-Blurr IMDB

In [None]:
def get_imdb_dls(model_name, bs):
    """Build a Hugging Face sequence classifier and fastai DataLoaders for an IMDB subsample.

    A 20% sample of the IMDB `train` split becomes the training set and a 20% sample of the
    `test` split becomes the validation set (both with `random_state=42`).

    Args:
        model_name: Hugging Face model identifier passed to `AutoConfig.from_pretrained` and
            `BLURR.get_hf_objects`.
        bs: Batch size for the dataloaders.

    Returns:
        Tuple `(hf_model, dls)`: the Hugging Face model and the fastai DataLoaders.
    """
    dataset = load_dataset('imdb')
    train_df = pd.DataFrame(dataset['train'])
    train_df['is_valid'] = False
    train_df = train_df.sample(frac=0.2, random_state=42)
    valid_df = pd.DataFrame(dataset['test'])
    valid_df['is_valid'] = True
    valid_df = valid_df.sample(frac=0.2, random_state=42)
    # `DataFrame.append` is deprecated and removed in pandas 2.0; `pd.concat` is the replacement.
    df = pd.concat([train_df, valid_df], ignore_index=True)

    config = AutoConfig.from_pretrained(model_name)
    config.num_labels = len(df['label'].unique())
    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=AutoModelForSequenceClassification, config=config)

    dblock = DataBlock(blocks=(HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), CategoryBlock),
                      get_x=ColReader('text'),
                      get_y=ColReader('label'),
                      splitter=ColSplitter())

    # The dataloader keyword is `num_workers` (as in `get_imagenette_dls`); the previous
    # `workers=` keyword is not a documented dataloader argument and appears to have been
    # silently ignored — TODO confirm.
    return hf_model, dblock.dataloaders(df, bs=bs, num_workers=min(8, num_cpus()))

In [None]:
def train_imdb_fastai(model_name, epochs, name, precision=[16, 32]):
    """Run one fastai/blurr IMDB training per entry in `precision`.

    Args:
        model_name: Hugging Face model identifier passed to `get_imdb_dls`.
        epochs: Number of epochs per run.
        name: Model label passed to `train_fastai` for the W&B run name.
        precision: Iterable of precisions to benchmark, in order.
    """
    for bits in precision:
        # fp16 runs use the full NLP batch size; any other precision uses half of it.
        if bits == 16:
            batch_size = nlp_bs
        else:
            batch_size = int(nlp_bs/2)
        hf_model, loaders = get_imdb_dls(model_name, batch_size)
        wrapped = HF_BaseModelWrapper(hf_model)
        train_fastai(
            loaders,
            wrapped,
            epochs,
            name,
            splitter=hf_splitter,
            cbs=[HF_BaseModelCallback],
            precision=bits,
        )

## Generic Fastai Training

In [None]:
def train_fastai(dls, model, epochs, name, splitter=trainable_params, cbs=[], precision=16):
    """Train `model` with fastai's one-cycle policy, profile the run and log it to W&B.

    Args:
        dls: fastai DataLoaders to train on.
        model: The model instance to train.
        epochs: Number of epochs for `fit_one_cycle`.
        name: Model label used as the first part of the W&B run name.
        splitter: Parameter-group splitter passed to `Learner`.
        cbs: Callbacks passed to `Learner`.
        precision: 16 switches the learner to mixed precision via `to_fp16()`; any other value
            leaves the learner as constructed. Also appended to the run name as `fp{precision}`.

    The W&B run is finished and GPU memory is released even when training raises, so a failed
    benchmark (for example an out-of-memory error) does not leak into the next one.
    """
    run = wandb.init(project="sagecolab", name=f'{name} {plat} {GPU} fp{precision}')
    learn = None
    try:
        # NOTE(review): `.profile()` is presumably added to Learner by the fastai_snippets
        # profiler import — confirm.
        learn = Learner(dls, model, cbs=cbs, splitter=splitter).profile()
        if precision==16:
            learn.to_fp16()

        learn.fit_one_cycle(epochs, 3e-3, cbs=[WandbCallback(log=None, log_preds=False, log_model=False)])
    finally:
        run.finish()

        # Drop the references before collecting so the CUDA cache can actually be emptied.
        if learn is not None:
            learn.dls = None
        learn = None
        gc.collect()
        torch.cuda.empty_cache()

# Training

## Train Imagenette Fastai

In [None]:
# Benchmark fastai XSE-ResNet50 on Imagenette at 224px for 4 epochs.
# `precision` is left at its default list, so this trains in fp16 and then again in fp32.
model = partial(xse_resnet50, n_out=10)
train_imagenette_fastai(model, 4, name='XSEResNet50', size=224)

epoch,train_loss,valid_loss,time
0,1.608497,1.588055,02:26
1,1.280614,1.822295,02:17
2,0.972301,0.955547,02:17
3,0.734233,0.737856,02:17


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,559.8 s,100%
,epoch,139.9 s,3.826 s,4,559.5 s,100%
,train,123.6 s,3.145 s,4,494.3 s,88%
,validate,16.30 s,682.8ms,4,65.18 s,12%
train,batch,805.1ms,355.4ms,588,473.4 s,85%
,step,460.9ms,19.84ms,588,271.0 s,48%
,backward,279.8ms,207.7ms,588,164.5 s,29%
,pred,46.47ms,124.9ms,588,27.33 s,5%
,draw,14.42ms,69.03ms,588,8.482 s,2%
,zero_grad,2.324ms,476.0µs,588,1.367 s,0%


epoch,train_loss,valid_loss,time
0,1.601415,2.034862,05:54
1,1.30327,1.737314,05:44
2,0.957176,0.997784,05:45
3,0.750223,0.747892,05:45


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,1.391ks,100%
,epoch,347.6 s,3.918 s,4,1.390ks,100%
,train,309.1 s,3.423 s,4,1.236ks,89%
,validate,38.49 s,497.5ms,4,153.9 s,11%
train,batch,2.061 s,293.3ms,588,1.212ks,87%
,step,1.437 s,59.45ms,588,844.9 s,61%
,backward,556.0ms,190.2ms,588,326.9 s,24%
,pred,50.19ms,120.5ms,588,29.51 s,2%
,draw,14.41ms,69.64ms,588,8.471 s,1%
,zero_grad,2.418ms,478.3µs,588,1.422 s,0%


In [None]:
# Benchmark fastai XResNet18 (Mish activation) on Imagenette at 128px for 4 epochs, fp16 only.
model = partial(xresnet18, n_out=10, act_cls=nn.Mish)
train_imagenette_fastai(model, 4, name='XResNet18 128', size=128, precision=[16])

epoch,train_loss,valid_loss,time
0,1.495402,1.70708,00:29
1,1.140327,1.066972,00:28
2,0.857017,0.776542,00:28
3,0.709464,0.684288,00:28


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,114.6 s,100%
,epoch,28.58 s,403.5ms,4,114.3 s,100%
,train,20.01 s,210.7ms,4,80.02 s,70%
,validate,8.566 s,206.7ms,4,34.26 s,30%
train,batch,124.0ms,102.3ms,588,72.90 s,64%
,step,35.46ms,7.684ms,588,20.85 s,18%
,draw,34.64ms,76.97ms,588,20.37 s,18%
,pred,32.93ms,23.89ms,588,19.36 s,17%
,backward,16.49ms,23.73ms,588,9.697 s,8%
,zero_grad,2.086ms,2.964ms,588,1.226 s,1%


In [None]:
# Benchmark fastai XSE-ResNeXt50 (Mish activation) on Imagenette at 256px for 4 epochs, fp16 only.
# A size above 224 makes the data helper load the full-size Imagenette variant.
model = partial(xse_resnext50, n_out=10, act_cls=nn.Mish)
train_imagenette_fastai(model, 4, name='XSEResNeXt50', size=256, precision=[16])

epoch,train_loss,valid_loss,time
0,1.53302,2.289062,02:03
1,1.176044,1.257967,01:55
2,0.898517,0.971946,01:54
3,0.689543,0.704734,01:54


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,468.9 s,100%
,epoch,117.1 s,3.596 s,4,468.5 s,100%
,train,99.44 s,3.124 s,4,397.8 s,85%
,validate,17.67 s,518.3ms,4,70.68 s,15%
train,batch,671.3ms,345.4ms,588,394.7 s,84%
,step,418.5ms,16.76ms,588,246.1 s,52%
,backward,180.8ms,191.8ms,588,106.3 s,23%
,pred,50.69ms,114.9ms,588,29.80 s,6%
,draw,17.62ms,92.15ms,588,10.36 s,2%
,zero_grad,2.348ms,511.9µs,588,1.380 s,0%


## Train IMDB

In [None]:
# Benchmark roberta-base on the IMDB subsample for 4 epochs, in fp16 and then fp32
# (the default precision list).
# NOTE(review): in the recorded output valid_loss stays at ~0.693 (ln 2, i.e. chance level for
# two classes), so the model does not appear to be learning; `train_fastai` uses a fixed 3e-3
# peak learning rate. Confirm whether this matters for a timing-only benchmark.
train_imdb_fastai('roberta-base', 4, 'Roberta')

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Could not gather input dimensions


epoch,train_loss,valid_loss,time
0,0.768247,0.698493,02:46
1,0.710149,0.699225,02:40
2,0.697134,0.693992,02:40
3,0.693717,0.69313,02:40


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,647.7 s,100%
,epoch,161.9 s,2.517 s,4,647.7 s,100%
,train,119.7 s,2.443 s,4,478.8 s,74%
,validate,42.22 s,76.20ms,4,168.9 s,26%
train,batch,362.0ms,203.6ms,1248,451.8 s,70%
,backward,265.0ms,227.5ms,1248,330.7 s,51%
,step,63.36ms,43.12ms,1248,79.07 s,12%
,pred,22.55ms,5.166ms,1248,28.14 s,4%
,draw,5.802ms,25.29ms,1248,7.241 s,1%
,zero_grad,3.912ms,390.6µs,1248,4.883 s,1%


Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Could not gather input dimensions


epoch,train_loss,valid_loss,time
0,0.759657,0.700448,06:42
1,0.726348,0.706605,06:40
2,0.698382,0.699847,06:39
3,0.693999,0.693163,06:40


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,1.602ks,100%
,epoch,400.6 s,991.9ms,4,1.602ks,100%
,train,300.8 s,347.8ms,4,1.203ks,75%
,validate,99.77 s,660.9ms,4,399.1 s,25%
train,batch,459.6ms,244.5ms,2500,1.149ks,72%
,backward,244.0ms,308.3ms,2500,610.0 s,38%
,step,186.6ms,120.7ms,2500,466.6 s,29%
,pred,18.62ms,3.679ms,2500,46.55 s,3%
,draw,5.038ms,17.33ms,2500,12.59 s,1%
,zero_grad,4.091ms,300.1µs,2500,10.23 s,1%


## Train Imagenette PyTorch Lightning

In [None]:
# Benchmark a randomly initialized timm ResNet50 on Imagenette at 256px for 4 epochs using
# PyTorch Lightning (`train_pl` always trains in fp16).
model = partial(timm.create_model, model_name='resnet50', pretrained=False, num_classes=10)
train_pl(model, 4, name='ResNet50', size=256)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Requesting dataloader...



  | Name     | Type                       | Params
--------------------------------------------------------
0 | model    | ResNet                     | 23.5 M
1 | loss_fn  | LabelSmoothingCrossEntropy | 0     
2 | accuracy | Accuracy                   | 0     
--------------------------------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
47.057    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]