In [None]:
# Record which GPU (and driver / CUDA version) this runtime was assigned.
! nvidia-smi

Fri Dec  3 18:14:25 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Record the host CPU details for this runtime.
! cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2199.998
cache size	: 56320 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa
bogomips	: 4399.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 b

In [None]:
! pip install fastcore --upgrade -qq
! pip install fastai --upgrade -qq
! pip install transformers --upgrade -qq
! pip install datasets --upgrade -qq
! pip install pytorch_lightning --upgrade -qq
! pip install wandb --upgrade -qq
! pip install ohmeow-blurr --upgrade -qq
! pip install timm --upgrade -qq
! pip install git+https://github.com/warner-benjamin/fastai_snippets.git -qq

In [None]:
# Show the installed PyTorch build (version and CUDA variant) for the record.
import torch
torch.__version__

'1.10.0+cu111'

In [None]:
# Set WANDB_SILENT before wandb is imported (presumably to keep wandb's console output out of the cells).
%env WANDB_SILENT=true

env: WANDB_SILENT=true


In [None]:
# Authenticate with Weights & Biases; every training run below logs there.
import wandb
wandb.login()

True

In [None]:
import os
# Turn off Hugging Face tokenizers' internal parallelism
# (presumably to avoid its fork warning when DataLoader workers are used — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Imports

In [None]:
import gc

# import fastai
from fastai.vision.all import *
from fastai.text.all import *
from fastai.callback.wandb import WandbCallback
from fastai_snippets.callback import simpleprofiler
from fastai_snippets.utils import simpleprofiler_wandb

# import blurr/huggingface
from blurr.data.all import *
from blurr.modeling.all import *
from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

#import pytorch lightning
import timm
import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from torch.optim.lr_scheduler import OneCycleLR
from torch.optim import AdamW
import torchvision.transforms as tvt
from torchvision.datasets import ImageFolder
from pytorch_lightning.loggers import WandbLogger
import torchmetrics

# Setup

In [None]:
# Labels for the hardware being benchmarked; both are interpolated into the wandb run names.
plat = 'Colab Pro'
GPU = 'P100'
# Batch sizes: img_bs feeds the Imagenette loaders, nlp_bs the IMDB loaders
# (train_imdb_fastai halves nlp_bs when the precision is not 16).
img_bs = 64
nlp_bs = 16

In [None]:
# Fetch both Imagenette variants up front
# (presumably so the download does not happen inside a timed training run — confirm).
# Note that `source` ends up holding the full-size path: the second call overwrites the first result.
source = untar_data(URLs.IMAGENETTE_320)
source = untar_data(URLs.IMAGENETTE)

## Modify SimpleProfiler to Log Results to wandb

In [None]:
from pytorch_lightning.profiler.simple import SimpleProfiler

@patch
def summary(self:SimpleProfiler):
    """Log the profiler report to Weights & Biases as a table.

    Patched onto `SimpleProfiler`, replacing its `summary` method. Builds a `wandb.Table` with a
    "Total" row followed by one row per entry of `self._make_report()` (mean, standard deviation,
    number of calls, total time and percentage of each action's durations), then logs it under the
    key "simple_profiler".

    The table is logged even when no durations were recorded, in which case it has no rows.
    Nothing is returned.
    """
    output_table = wandb.Table(columns=["Action", "Mean duration (s)", "Duration StDev (s)", "Num calls", "Total time (s)", "Percentage %"])

    if len(self.recorded_durations) > 0:
        report, total_duration = self._make_report()
        # Summary row first; per-action statistics are not meaningful for the total.
        output_table.add_data("Total", "-", "-", "_", f"{total_duration:.5}", "100 %")
        for action, durations, duration_per in report:
            output_table.add_data(
                action,
                f"{np.mean(durations):.5}",
                f"{np.std(durations):.5}",
                f"{len(durations)}",
                f"{np.sum(durations):.5}",
                f"{duration_per:.5}",
            )

    wandb.log({"simple_profiler": output_table})

## PyTorch Lightning Imagenette

In [None]:
class ImagenetteDataModule(LightningDataModule):
    """LightningDataModule serving Imagenette or Imagewoof from `ImageFolder` datasets.

    Args:
        size: Crop size passed to `RandomResizedCrop` (train) and `CenterCrop` (validation). It also
            selects the dataset variant: the 320px archive when `size <= 224`, the full-size one otherwise.
        woof: If truthy use Imagewoof (and its normalisation stats), otherwise Imagenette.
        bs: Batch size for both dataloaders.
        train_transform: Optional sequence of transforms; when given it is wrapped in `Compose` and
            replaces the default training pipeline.
        valid_transform: Optional sequence of transforms replacing the default validation pipeline.
    """
    def __init__(self, size, woof, bs, train_transform=None, valid_transform=None):
        super().__init__()
        self.size, self.woof, self.bs = size, woof, bs
        imagewoof_stats  = ([0.496,0.461,0.399],[0.257,0.249,0.258])
        imagenette_stats = ([0.465,0.458,0.429],[0.285,0.28,0.301])
        # (mean, std) per channel for whichever dataset was requested.
        stats = imagewoof_stats if woof else imagenette_stats

        if train_transform is None:
            self.train_transform = tvt.Compose([
                tvt.RandomResizedCrop(size, scale=(0.35, 1)),
                tvt.RandomHorizontalFlip(),
                tvt.ToTensor(),
                tvt.Normalize(*stats)])
        else:
            # torchvision.transforms is imported as `tvt` in this notebook.
            self.train_transform = tvt.Compose(train_transform)

        if valid_transform is None:
            self.valid_transform = tvt.Compose([
                tvt.CenterCrop(size),
                tvt.ToTensor(),
                tvt.Normalize(*stats)])
        else:
            self.valid_transform = tvt.Compose(valid_transform)

    def prepare_data(self):
        """Download/extract the selected dataset and remember its path on `self.source`."""
        if self.size<=224: path = URLs.IMAGEWOOF_320 if self.woof else URLs.IMAGENETTE_320
        else             : path = URLs.IMAGEWOOF     if self.woof else URLs.IMAGENETTE
        self.source = untar_data(path)

    def setup(self, stage=None):
        """Create the datasets; the training set is built only for the "fit" stage (or when no stage is given)."""
        if stage == "fit" or stage is None:
            self.train = ImageFolder(self.source/'train', self.train_transform)
        # The validation set is created for every stage.
        self.val = ImageFolder(self.source/'val', self.valid_transform)

    def train_dataloader(self):
        """Shuffled loader over the training set, using at most 8 workers."""
        return DataLoader(self.train, batch_size=self.bs, shuffle=True, pin_memory=True, num_workers=min(8, num_cpus()))

    def val_dataloader(self):
        """Unshuffled loader over the validation set, using at most 8 workers."""
        return DataLoader(self.val, batch_size=self.bs, pin_memory=True, num_workers=min(8, num_cpus()))

    def teardown(self, stage=None):
        """Drop the dataset references."""
        self.train = None
        self.val = None

In [None]:
class ResNet(LightningModule):
    """LightningModule that trains an image classifier with AdamW and a per-step one-cycle schedule.

    Args:
        model: Zero-argument callable returning the network to train (e.g. a `partial`); it is
            called once in `__init__`.
        lr: Learning rate given to AdamW and used as `max_lr` of the one-cycle schedule.
        mom: Stored in `self.hparams` but not read by `configure_optimizers`.
        wd: Weight decay given to AdamW.
    """
    def __init__(self, model, lr=3e-3, mom=0.9, wd=1e-2):
        super().__init__()

        # Records lr / mom / wd on self.hparams; the model factory is excluded.
        self.save_hyperparameters(ignore='model')
        self.model = model()
        self.loss_fn = LabelSmoothingCrossEntropy()
        self.accuracy = torchmetrics.Accuracy()

    def forward(self, x):
        """Run the wrapped network on `x`."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one `(inputs, targets)` batch."""
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        self.log("train_loss", loss)
        return loss

    def evaluate(self, batch, stage=None):
        """Update the accuracy metric for one batch and, when `stage` is given, log `{stage}_loss` / `{stage}_acc`."""
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        # softmax is monotonic, so the arg-max of the raw logits is already the predicted class.
        preds = logits.argmax(dim=-1)
        self.accuracy(preds, y)

        if stage:
            self.log(f"{stage}_loss", loss, prog_bar=True, on_epoch=True)
            self.log(f"{stage}_acc", self.accuracy, prog_bar=True, on_epoch=True)

    def validation_step(self, batch, batch_idx):
        self.evaluate(batch, "val")

    def test_step(self, batch, batch_idx):
        self.evaluate(batch, "test")

    def configure_optimizers(self):
        """Return AdamW plus a OneCycleLR schedule that is stepped after every optimizer step."""
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.wd, eps=1e-5)

        scheduler_dict = {
            "scheduler": OneCycleLR(
                optimizer=optimizer,
                max_lr =self.hparams.lr,
                total_steps=self.num_training_steps
            ),
            "interval": "step",
            "frequency": 1
        }
        return {"optimizer": optimizer, "lr_scheduler": scheduler_dict}

    # lightly modified from rohitgr7 https://github.com/PyTorchLightning/pytorch-lightning/issues/10760
    @property
    def num_training_steps(self) -> int:
        """Total training steps inferred from datamodule and devices."""
        if self.trainer.num_training_batches != float('inf'):
            dataset_size = self.trainer.num_training_batches
        else:
            # The trainer has not counted the batches yet, so measure the dataloader directly.
            print('Requesting dataloader...')
            dataset_size = len(self.trainer._data_connector._train_dataloader_source.dataloader())

        # limit_train_batches is either an absolute batch count (int) or a fraction of the batches.
        if isinstance(self.trainer.limit_train_batches, int):
            dataset_size = min(dataset_size, self.trainer.limit_train_batches)
        else:
            dataset_size = int(dataset_size * self.trainer.limit_train_batches)

        accelerator_connector = self.trainer._accelerator_connector
        if accelerator_connector.use_ddp2 or accelerator_connector.use_dp:
            effective_devices = 1
        else:
            effective_devices = self.trainer.devices

        effective_devices = effective_devices * self.trainer.num_nodes
        effective_batch_size = self.trainer.accumulate_grad_batches * effective_devices
        # True division so that ceil() actually rounds up: a trailing partial group of batches still
        # counts as a step, and OneCycleLR raises if it is stepped more often than `total_steps`.
        max_estimated_steps = math.ceil(dataset_size / effective_batch_size) * self.trainer.max_epochs

        # max_steps == -1 means "no step limit".
        max_estimated_steps = min(max_estimated_steps, self.trainer.max_steps) if self.trainer.max_steps != -1 else max_estimated_steps
        return max_estimated_steps

In [None]:
def train_pl(model, epochs, name, size):
    """Train `model` on Imagenette with PyTorch Lightning in fp16 and log the run to wandb.

    Args:
        model: Zero-argument callable that builds the network (forwarded to `ResNet`).
        epochs: Number of epochs to train (`max_epochs`).
        name: Prefix of the wandb run name; platform, GPU and "fp16" are appended.
        size: Image size handed to `ImagenetteDataModule`.

    The wandb run is finished and GPU memory is released even when training raises.
    """
    resnet = ResNet(model)
    imagenette = ImagenetteDataModule(size, False, img_bs)

    wandb_logger = WandbLogger(project="sagecolab", name=f'{name} {plat} {GPU} fp16', log_model=False)
    trainer = Trainer(gpus=1, precision=16, max_epochs=epochs, num_sanity_val_steps=0,
                      benchmark=True, profiler="simple", logger=wandb_logger, enable_checkpointing=False)
    try:
        trainer.fit(resnet, imagenette)
        wandb.log({}) # ensure sync of last step
    finally:
        # Close the run so that a failed benchmark cannot leak into the next one.
        wandb.finish()

        # Drop the references before collecting so the cached GPU memory can be returned.
        trainer, resnet, imagenette = None, None, None
        gc.collect()
        torch.cuda.empty_cache()

## Fastai Imagenette

In [None]:
# Per-channel (mean, std) pairs used to normalise each dataset.
imagewoof_stats = ([0.496, 0.461, 0.399], [0.257, 0.249, 0.258])
imagenette_stats = ([0.465, 0.458, 0.429], [0.285, 0.28, 0.301])

def get_imagenette_dls(size, woof, bs, sh=0., augs=None, workers=None, stats=True):
    """Build fastai DataLoaders for Imagenette (or Imagewoof when `woof` is truthy).

    Args:
        size: Target size for `RandomResizedCrop`; sizes up to 224 use the 320px archive,
            larger sizes the full-size one.
        woof: Select Imagewoof instead of Imagenette.
        bs: Batch size.
        sh: When non-zero, append `RandomErasing` with this `sh` value to the batch transforms.
        augs: Optional extra batch transforms, added after normalisation.
        workers: Number of loader workers; defaults to `min(8, num_cpus())`.
        stats: Whether to normalise with the dataset's channel statistics.

    Returns:
        The DataLoaders created by the DataBlock, split by the `train` / `val` folders.
    """
    small_enough = size <= 224
    if small_enough:
        url = URLs.IMAGEWOOF_320 if woof else URLs.IMAGENETTE_320
    else:
        url = URLs.IMAGEWOOF if woof else URLs.IMAGENETTE
    source = untar_data(url)

    if workers is None:
        workers = min(8, num_cpus())

    # Assemble the batch-level pipeline: normalisation, caller augmentations, random erasing.
    batch_tfms = []
    if stats:
        channel_stats = imagewoof_stats if woof else imagenette_stats
        batch_tfms += [Normalize.from_stats(*channel_stats)]
    if augs:
        batch_tfms += augs
    if sh:
        batch_tfms.append(RandomErasing(p=0.3, max_count=4, sh=sh))

    dblock = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        splitter=GrandparentSplitter(valid_name='val'),
        get_items=get_image_files,
        get_y=parent_label,
        item_tfms=[RandomResizedCrop(size, min_scale=0.35), FlipItem(0.5)],
        batch_tfms=batch_tfms,
    )
    return dblock.dataloaders(source, path=source, bs=bs, num_workers=workers)

In [None]:
def train_imagenette_fastai(model, epochs, name, size, precision=[16, 32], augs=None):
    """Train a freshly built model on Imagenette once per entry in `precision`.

    `model` is a zero-argument factory; new DataLoaders and a new model are created for every
    precision so the runs do not share state.
    """
    for bits in precision:
        imagenette_dls = get_imagenette_dls(size, False, img_bs, augs=augs)
        fresh_model = model()
        train_fastai(imagenette_dls, fresh_model, epochs, name, precision=bits)

## Fastai-Blurr IMDB

In [None]:
def get_imdb_dls(model_name, bs):
    """Build blurr/fastai DataLoaders over a 20% sample of IMDB for sequence classification.

    Args:
        model_name: Hugging Face checkpoint name used for the config, tokenizer and model.
        bs: Batch size.

    Returns:
        `(hf_model, dls)`: the Hugging Face model and the DataLoaders. The 'train' split supplies
        the training rows and the 'test' split the validation rows (flagged through `is_valid`).
    """
    dataset = load_dataset('imdb')

    # Take a fixed-seed 20% sample of each split and mark which rows are for validation.
    train_df = pd.DataFrame(dataset['train'])
    train_df['is_valid'] = False
    train_df = train_df.sample(frac=0.2, random_state=42)
    valid_df = pd.DataFrame(dataset['test'])
    valid_df['is_valid'] = True
    valid_df = valid_df.sample(frac=0.2, random_state=42)
    # pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
    df = pd.concat([train_df, valid_df], ignore_index=True)

    config = AutoConfig.from_pretrained(model_name)
    config.num_labels = len(df['label'].unique())
    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=AutoModelForSequenceClassification, config=config)

    dblock = DataBlock(blocks=(HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), CategoryBlock),
                      get_x=ColReader('text'),
                      get_y=ColReader('label'),
                      splitter=ColSplitter())

    # `num_workers` matches the keyword get_imagenette_dls passes to `dataloaders`.
    # NOTE(review): the previous `workers=` keyword looks like it was not a recognised loader argument — confirm.
    return hf_model, dblock.dataloaders(df, bs=bs, num_workers=min(8, num_cpus()))

In [None]:
def train_imdb_fastai(model_name, epochs, name, precision=[16, 32]):
    """Fine-tune `model_name` on the IMDB sample once per entry in `precision`.

    Runs at precision 16 use `nlp_bs` as batch size; any other precision uses half of it.
    The model and DataLoaders are rebuilt for every run.
    """
    for bits in precision:
        if bits == 16:
            batch_size = nlp_bs
        else:
            batch_size = int(nlp_bs/2)
        hf_model, imdb_dls = get_imdb_dls(model_name, batch_size)
        wrapped = HF_BaseModelWrapper(hf_model)
        train_fastai(imdb_dls, wrapped, epochs, name,
                     splitter=hf_splitter, cbs=[HF_BaseModelCallback], precision=bits)

## Generic Fastai Training

In [None]:
def train_fastai(dls, model, epochs, name, splitter=trainable_params, cbs=[], precision=16, lr=3e-3):
    """Train `model` with fastai's one-cycle policy, profiling the run and logging it to wandb.

    Args:
        dls: DataLoaders to train on.
        model: The model instance to train.
        epochs: Number of epochs for `fit_one_cycle`.
        name: Prefix of the wandb run name; platform, GPU and precision are appended.
        splitter: Parameter-group splitter passed to `Learner`.
        cbs: Extra callbacks passed to `Learner`.
        precision: 16 switches the learner to mixed precision; any other value leaves it unchanged.
        lr: Maximum learning rate for `fit_one_cycle` (defaults to the previously hard-coded 3e-3).

    The wandb run is finished and GPU memory is released even when training raises.
    """
    run = wandb.init(project="sagecolab", name=f'{name} {plat} {GPU} fp{precision}')
    learn = None
    try:
        learn = Learner(dls, model, cbs=cbs, splitter=splitter).profile()
        if precision==16:
            learn.to_fp16()

        learn.fit_one_cycle(epochs, lr, cbs=[WandbCallback(log=None, log_preds=False, log_model=False)])
    finally:
        # Close the run so that a failed benchmark cannot leak into the next one.
        run.finish()

        # Drop the references before collecting so the cached GPU memory can be returned.
        if learn is not None:
            learn.dls = None
        learn = None
        gc.collect()
        torch.cuda.empty_cache()

# Training

## Train Imagenette Fastai

In [None]:
# fastai benchmark: XSE-ResNet50 with 10 outputs, 4 epochs at size 224.
# `precision` is left at its default, so this trains twice: fp16 first, then fp32.
model = partial(xse_resnet50, n_out=10)
train_imagenette_fastai(model, 4, name='XSEResNet50', size=224)

epoch,train_loss,valid_loss,time
0,1.678537,1.926547,02:57
1,1.286759,1.319642,02:50
2,0.974167,0.997008,02:50
3,0.76534,0.752537,02:49


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,689.1 s,100%
,epoch,172.2 s,3.227 s,4,688.7 s,100%
,train,147.5 s,2.045 s,4,590.2 s,86%
,validate,24.62 s,1.198 s,4,98.48 s,14%
train,batch,990.2ms,217.5ms,588,582.3 s,84%
,step,612.3ms,23.88ms,588,360.1 s,52%
,backward,309.1ms,129.6ms,588,181.7 s,26%
,pred,50.30ms,80.19ms,588,29.57 s,4%
,draw,14.86ms,52.75ms,588,8.738 s,1%
,zero_grad,2.087ms,330.6µs,588,1.227 s,0%


epoch,train_loss,valid_loss,time
0,1.643364,2.185736,02:49
1,1.291796,1.384938,02:44
2,0.995839,1.032389,02:44
3,0.737897,0.718119,02:44


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,663.4 s,100%
,epoch,165.8 s,2.120 s,4,663.1 s,100%
,train,142.7 s,1.657 s,4,570.8 s,86%
,validate,23.07 s,465.2ms,4,92.30 s,14%
train,batch,953.3ms,190.7ms,588,560.5 s,84%
,step,626.6ms,24.29ms,588,368.4 s,56%
,backward,262.4ms,116.9ms,588,154.3 s,23%
,pred,45.87ms,66.68ms,588,26.97 s,4%
,draw,14.84ms,51.51ms,588,8.727 s,1%
,zero_grad,2.153ms,362.9µs,588,1.266 s,0%


In [None]:
# fastai benchmark: XResNet18 with Mish activations, 4 epochs at size 128, fp16 only.
model = partial(xresnet18, n_out=10, act_cls=nn.Mish)
train_imagenette_fastai(model, 4, name='XResNet18 128', size=128, precision=[16])

epoch,train_loss,valid_loss,time
0,1.48103,1.699179,00:51
1,1.113742,1.020362,00:50
2,0.863878,0.798347,00:51
3,0.702355,0.695104,00:51


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,204.8 s,100%
,epoch,51.13 s,170.1ms,4,204.5 s,100%
,train,35.69 s,52.29ms,4,142.8 s,70%
,validate,15.43 s,129.2ms,4,61.72 s,30%
train,batch,230.6ms,144.8ms,588,135.6 s,66%
,draw,146.8ms,135.8ms,588,86.30 s,42%
,step,39.17ms,6.441ms,588,23.03 s,11%
,pred,26.36ms,18.81ms,588,15.50 s,8%
,backward,13.67ms,13.98ms,588,8.040 s,4%
,loss,2.178ms,2.744ms,588,1.281 s,1%


In [None]:
# fastai benchmark: XSE-ResNeXt50 with Mish activations, 4 epochs at size 256, fp16 only.
# A size above 224 makes get_imagenette_dls use the full-size dataset.
model = partial(xse_resnext50, n_out=10, act_cls=nn.Mish)
train_imagenette_fastai(model, 4, name='XSEResNeXt50', size=256, precision=[16])

epoch,train_loss,valid_loss,time
0,1.542084,4.308547,02:29
1,1.133814,1.741885,02:25
2,0.895865,1.079704,02:25
3,0.704864,0.679494,02:24


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,585.3 s,100%
,epoch,146.2 s,1.842 s,4,584.7 s,100%
,train,114.6 s,1.567 s,4,458.4 s,78%
,validate,31.57 s,307.3ms,4,126.3 s,22%
train,batch,773.2ms,207.7ms,588,454.6 s,78%
,step,461.6ms,17.76ms,588,271.4 s,46%
,backward,229.8ms,119.7ms,588,135.1 s,23%
,pred,58.08ms,59.86ms,588,34.15 s,6%
,draw,19.44ms,74.00ms,588,11.43 s,2%
,zero_grad,2.485ms,881.7µs,588,1.461 s,0%


## Train IMDB

In [None]:
# blurr benchmark: roberta-base on the IMDB sample for 4 epochs.
# `precision` is left at its default, so this trains in fp16 and then in fp32 (the latter at half the batch size).
train_imdb_fastai('roberta-base', 4, 'Roberta')

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Could not gather input dimensions


epoch,train_loss,valid_loss,time
0,0.739953,0.695064,03:26
1,0.708759,0.69844,03:27
2,0.699194,0.695637,03:27
3,0.693485,0.693143,03:27


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,828.9 s,100%
,epoch,207.2 s,151.0ms,4,828.9 s,100%
,train,151.7 s,131.4ms,4,606.9 s,73%
,validate,55.49 s,35.57ms,4,222.0 s,27%
train,batch,477.3ms,249.9ms,1248,595.6 s,72%
,backward,368.4ms,298.4ms,1248,459.8 s,55%
,step,75.53ms,70.60ms,1248,94.27 s,11%
,pred,23.59ms,2.843ms,1248,29.44 s,4%
,draw,6.212ms,12.92ms,1248,7.752 s,1%
,zero_grad,1.813ms,377.3µs,1248,2.263 s,0%


Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Could not gather input dimensions


epoch,train_loss,valid_loss,time
0,0.742616,0.697485,03:39
1,0.707911,0.743573,03:39
2,0.695463,0.693137,03:39
3,0.69287,0.693132,03:39


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,877.0 s,100%
,epoch,219.2 s,122.3ms,4,877.0 s,100%
,train,163.8 s,94.80ms,4,655.0 s,75%
,validate,55.48 s,41.20ms,4,221.9 s,25%
train,batch,250.3ms,123.9ms,2500,625.9 s,71%
,step,114.7ms,78.62ms,2500,286.9 s,33%
,backward,108.9ms,163.2ms,2500,272.2 s,31%
,pred,17.98ms,2.377ms,2500,44.96 s,5%
,draw,5.693ms,8.306ms,2500,14.23 s,2%
,zero_grad,1.830ms,350.8µs,2500,4.575 s,1%


## Train Imagenette PyTorch Lightning

In [None]:
# PyTorch Lightning benchmark: timm ResNet50 without pretrained weights, 10 classes,
# 4 epochs at size 256 (train_pl always runs with precision=16).
model = partial(timm.create_model, model_name='resnet50', pretrained=False, num_classes=10)
train_pl(model, 4, name='ResNet50', size=256)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Requesting dataloader...



  | Name     | Type                       | Params
--------------------------------------------------------
0 | model    | ResNet                     | 23.5 M
1 | loss_fn  | LabelSmoothingCrossEntropy | 0     
2 | accuracy | Accuracy                   | 0     
--------------------------------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
47.057    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]