In [None]:
! nvidia-smi

Fri Dec  3 18:14:28 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   46C    P0    28W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
! cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 85
model name	: Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz
stepping	: 7
microcode	: 0x5003103
cpu MHz		: 3209.358
cache size	: 36608 KB
physical id	: 0
siblings	: 4
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves ida arat pku ospke avx512_vnni
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf

In [None]:
! pip install fastcore --upgrade -qq
! pip install fastai --upgrade -qq
! pip install transformers --upgrade -qq
! pip install datasets --upgrade -qq
! pip install pytorch_lightning --upgrade -qq
! pip install wandb --upgrade -qq
! pip install ohmeow-blurr --upgrade -qq
! pip install timm --upgrade -qq
! pip install git+https://github.com/warner-benjamin/fastai_snippets.git -qq

In [None]:
import torch
torch.__version__

'1.10.0+cu102'

In [None]:
%env WANDB_SILENT=true

env: WANDB_SILENT=true


In [None]:
import wandb
wandb.login()

True

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Imports

In [None]:
import gc

# import fastai
from fastai.vision.all import *
from fastai.text.all import *
from fastai.callback.wandb import WandbCallback
from fastai_snippets.callback import simpleprofiler
from fastai_snippets.utils import simpleprofiler_wandb

# import blurr/huggingface
from blurr.data.all import *
from blurr.modeling.all import *
from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

#import pytorch lightning
import timm
import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from torch.optim.lr_scheduler import OneCycleLR
from torch.optim import AdamW
import torchvision.transforms as tvt
from torchvision.datasets import ImageFolder
from pytorch_lightning.loggers import WandbLogger
import torchmetrics

# Setup

In [None]:
# Platform and GPU labels; both are interpolated into the wandb run names
# built by train_pl and train_fastai.
plat = 'SageMaker'
GPU = 'T4'
# Batch sizes: img_bs is used for the Imagenette runs, nlp_bs for the IMDB runs
# (train_imdb_fastai halves nlp_bs when the precision is not 16).
img_bs = 64
nlp_bs = 16

In [None]:
# Fetch both Imagenette archives (320px and full-size). `source` is overwritten
# by the second call, so only the full-size path is kept afterwards.
# NOTE(review): presumably this cell exists to download the data before the
# timed training runs start — confirm.
source = untar_data(URLs.IMAGENETTE_320)
source = untar_data(URLs.IMAGENETTE)

## Modify SimpleProfiler to Log Results to wandb

In [None]:
from pytorch_lightning.profiler.simple import SimpleProfiler

@patch
def summary(self:SimpleProfiler):
    """Log the profiler's recorded durations to Weights & Biases as a table.

    Replaces ``SimpleProfiler.summary`` so that the report is written to a
    ``wandb.Table`` logged under the ``simple_profiler`` key: one "Total" row
    followed by one row per recorded action with its mean, standard deviation,
    call count, total time and percentage. When nothing was recorded the table
    is logged with headers only. Returns ``None``.
    """
    output_table = wandb.Table(columns=["Action", "Mean duration (s)", "Duration StDev (s)", "Num calls", "Total time (s)", "Percentage %"])
    if len(self.recorded_durations) > 0:
        report, total_duration = self._make_report()
        # Summary row first; per-action statistics do not apply to the total.
        output_table.add_data("Total", "-", "-", "_", f"{total_duration:.5}", "100 %")
        for action, durations, duration_per in report:
            output_table.add_data(
                action,
                f"{np.mean(durations):.5}",
                f"{np.std(durations):.5}",
                f"{len(durations)}",
                f"{np.sum(durations):.5}",
                f"{duration_per:.5}",
            )

    wandb.log({"simple_profiler": output_table})

## PyTorch Lightning Imagenette

In [None]:
class ImagenetteDataModule(LightningDataModule):
    """Lightning data module serving Imagenette or Imagewoof via torchvision ``ImageFolder``.

    Args:
        size: Crop size used by the default transforms. It also selects the
            archive: sizes up to 224 use the 320px variant, larger sizes the
            full-size variant.
        woof: If truthy use Imagewoof (and its normalisation stats), otherwise Imagenette.
        bs: Batch size for the train and validation dataloaders.
        train_transform: Optional list of transforms to compose for training.
            When ``None`` a random-resized-crop / flip / normalise pipeline is used.
        valid_transform: Optional list of transforms to compose for validation.
            When ``None`` a center-crop / normalise pipeline is used.
    """
    def __init__(self, size, woof, bs, train_transform=None, valid_transform=None):
        super().__init__()
        self.size, self.woof, self.bs = size, woof, bs
        imagewoof_stats  = ([0.496,0.461,0.399],[0.257,0.249,0.258])
        imagenette_stats = ([0.465,0.458,0.429],[0.285,0.28,0.301])
        stats = imagewoof_stats if woof else imagenette_stats

        if train_transform is None:
            self.train_transform = tvt.Compose([
                tvt.RandomResizedCrop(size, scale=(0.35, 1)),
                tvt.RandomHorizontalFlip(),
                tvt.ToTensor(),
                tvt.Normalize(*stats)])
        else:
            # torchvision.transforms is imported as `tvt` in this notebook
            self.train_transform = tvt.Compose(train_transform)

        if valid_transform is None:
            self.valid_transform = tvt.Compose([
                tvt.CenterCrop(size),
                tvt.ToTensor(),
                tvt.Normalize(*stats)])
        else:
            self.valid_transform = tvt.Compose(valid_transform)

    def prepare_data(self):
        """Download/extract the dataset archive and remember its local path in ``self.source``."""
        # NOTE(review): setup() relies on self.source being assigned here; Lightning's
        # docs advise against setting state in prepare_data for multi-process
        # training — confirm before running on more than one device.
        if self.size<=224: path = URLs.IMAGEWOOF_320 if self.woof else URLs.IMAGENETTE_320
        else             : path = URLs.IMAGEWOOF     if self.woof else URLs.IMAGENETTE
        self.source = untar_data(path)

    def setup(self, stage=None):
        """Create the datasets: train only for the fit stage (or no stage), val always."""
        if stage == "fit" or stage is None:
            self.train = ImageFolder(self.source/'train', self.train_transform)
        self.val = ImageFolder(self.source/'val', self.valid_transform)

    def train_dataloader(self):
        """Shuffled training loader using up to 8 worker processes."""
        return DataLoader(self.train, batch_size=self.bs, shuffle=True, pin_memory=True, num_workers=min(8, num_cpus()))

    def val_dataloader(self):
        """Unshuffled validation loader using up to 8 worker processes."""
        return DataLoader(self.val, batch_size=self.bs, pin_memory=True, num_workers=min(8, num_cpus()))

    def teardown(self, stage=None):
        """Drop the dataset references so they can be garbage collected."""
        self.train = None
        self.val = None

In [None]:
class ResNet(LightningModule):
    """LightningModule wrapping an image classifier for the Imagenette benchmark.

    Trains with label-smoothing cross entropy, AdamW and a per-step one-cycle
    learning-rate schedule, and tracks accuracy during validation/testing.

    Args:
        model: Zero-argument callable that builds the network; called once here.
        lr: Learning rate for AdamW and peak learning rate of the schedule.
        mom: Stored in ``hparams`` only; ``configure_optimizers`` does not read it.
        wd: Weight decay passed to AdamW.
    """
    def __init__(self, model, lr=3e-3, mom=0.9, wd=1e-2):
        super().__init__()

        # `model` is a factory rather than a value, so keep it out of the saved hparams
        self.save_hyperparameters(ignore='model')
        self.model = model()
        self.loss_fn = LabelSmoothingCrossEntropy()
        self.accuracy = torchmetrics.Accuracy()

    def forward(self, x):
        """Run the wrapped network on a batch of inputs."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute, log and return the training loss for one batch."""
        x, y = batch
        x = self(x)
        loss = self.loss_fn(x, y)
        self.log("train_loss", loss)
        return loss

    def evaluate(self, batch, stage=None):
        """Shared validation/test step: update accuracy and log ``{stage}_loss`` / ``{stage}_acc``.

        Nothing is logged when ``stage`` is falsy, but the accuracy metric is still updated.
        """
        x, y = batch
        x = self(x)
        loss = self.loss_fn(x, y)
        preds = F.softmax(x, dim=-1).argmax(dim=-1)
        self.accuracy(preds, y)

        if stage:
            self.log(f"{stage}_loss", loss, prog_bar=True, on_epoch=True)
            self.log(f"{stage}_acc", self.accuracy, prog_bar=True, on_epoch=True)

    def validation_step(self, batch, batch_idx):
        self.evaluate(batch, "val")

    def test_step(self, batch, batch_idx):
        self.evaluate(batch, "test")

    def configure_optimizers(self):
        """Return AdamW plus a OneCycleLR schedule stepped after every optimizer step."""
        # Pass the configured weight decay through; previously `wd` was accepted
        # by __init__ but never reached the optimizer.
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.wd, eps=1e-5)

        scheduler_dict = {
            "scheduler": OneCycleLR(
                optimizer=optimizer,
                max_lr =self.hparams.lr,
                total_steps=self.num_training_steps
            ),
            "interval": "step",
            "frequency": 1
        }
        return {"optimizer": optimizer, "lr_scheduler": scheduler_dict}

    # lightly modified from rohitgr7 https://github.com/PyTorchLightning/pytorch-lightning/issues/10760
    @property
    def num_training_steps(self) -> int:
        """Total training steps inferred from datamodule and devices."""
        if self.trainer.num_training_batches != float('inf'):
            dataset_size = self.trainer.num_training_batches
        else:
            # Batch count not known to the trainer yet: build the train dataloader to measure it
            print('Requesting dataloader...')
            dataset_size = len(self.trainer._data_connector._train_dataloader_source.dataloader())

        # limit_train_batches is either an absolute batch count (int) or a fraction
        if isinstance(self.trainer.limit_train_batches, int):
            dataset_size = min(dataset_size, self.trainer.limit_train_batches)
        else:
            dataset_size = int(dataset_size * self.trainer.limit_train_batches)

        accelerator_connector = self.trainer._accelerator_connector
        if accelerator_connector.use_ddp2 or accelerator_connector.use_dp:
            effective_devices = 1
        else:
            effective_devices = self.trainer.devices

        effective_devices = effective_devices * self.trainer.num_nodes
        effective_batch_size = self.trainer.accumulate_grad_batches * effective_devices
        # True division so the ceil actually rounds a partial final group up;
        # `ceil(a // b)` floored first and could under-count the scheduler's steps.
        max_estimated_steps = math.ceil(dataset_size / effective_batch_size) * self.trainer.max_epochs

        # max_steps == -1 means "no step limit"
        max_estimated_steps = min(max_estimated_steps, self.trainer.max_steps) if self.trainer.max_steps != -1 else max_estimated_steps
        return max_estimated_steps

In [None]:
def train_pl(model, epochs, name, size):
    """Train `model` on Imagenette with PyTorch Lightning in fp16 and log the run to wandb.

    Args:
        model: Zero-argument callable building the network (passed to ``ResNet``).
        epochs: Number of epochs to train.
        name: Model label used in the wandb run name.
        size: Image size passed to ``ImagenetteDataModule``.

    The wandb run is finished and GPU memory released even if training raises,
    so a failed run does not leak into the next benchmark.
    """
    resnet = ResNet(model)
    imagenette = ImagenetteDataModule(size, False, img_bs)

    wandb_logger = WandbLogger(project="sagecolab", name=f'{name} {plat} {GPU} fp16', log_model=False)
    trainer = Trainer(gpus=1, precision=16, max_epochs=epochs, num_sanity_val_steps=0,
                      benchmark=True, profiler="simple", logger=wandb_logger, enable_checkpointing=False)
    try:
        trainer.fit(resnet, imagenette)
        wandb.log({}) # ensure sync of last step
    finally:
        wandb.finish()

        # Drop references before collecting so the cached CUDA memory can be freed
        trainer, resnet, imagenette = None, None, None
        gc.collect()
        torch.cuda.empty_cache()

## Fastai Imagenette

In [None]:
# Normalisation statistics as (first, second) argument lists for Normalize.from_stats.
imagewoof_stats = (
    [0.496, 0.461, 0.399],
    [0.257, 0.249, 0.258],
)
imagenette_stats = (
    [0.465, 0.458, 0.429],
    [0.285, 0.28, 0.301],
)

def get_imagenette_dls(size, woof, bs, sh=0., augs=None, workers=None, stats=True):
    """Build fastai DataLoaders for Imagenette (or Imagewoof when `woof` is truthy).

    Sizes up to 224 read from the 320px archive, larger sizes from the full-size
    one. Batch transforms are, in order: optional normalisation (`stats`), any
    extra `augs`, and `RandomErasing` when `sh` is non-zero. `workers` defaults
    to at most 8 worker processes.
    """
    if size <= 224:
        url = URLs.IMAGEWOOF_320 if woof else URLs.IMAGENETTE_320
    else:
        url = URLs.IMAGEWOOF if woof else URLs.IMAGENETTE
    source = untar_data(url)

    if workers is None:
        workers = min(8, num_cpus())

    batch_tfms = []
    if stats:
        chosen_stats = imagewoof_stats if woof else imagenette_stats
        batch_tfms += [Normalize.from_stats(*chosen_stats)]
    if augs:
        batch_tfms += augs
    if sh:
        batch_tfms.append(RandomErasing(p=0.3, max_count=4, sh=sh))

    item_tfms = [RandomResizedCrop(size, min_scale=0.35), FlipItem(0.5)]
    dblock = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        splitter=GrandparentSplitter(valid_name='val'),
        get_items=get_image_files,
        get_y=parent_label,
        item_tfms=item_tfms,
        batch_tfms=batch_tfms,
    )
    return dblock.dataloaders(source, path=source, bs=bs, num_workers=workers)

In [None]:
def train_imagenette_fastai(model, epochs, name, size, precision=[16, 32], augs=None):
    """Run one fastai Imagenette training per entry in `precision`.

    `model` is a zero-argument factory; fresh DataLoaders and a fresh network
    are created for every run before handing off to `train_fastai`.
    """
    for bits in precision:
        loaders = get_imagenette_dls(size, False, img_bs, augs=augs)
        net = model()
        train_fastai(loaders, net, epochs, name, precision=bits)

## Fastai-Blurr IMDB

In [None]:
def get_imdb_dls(model_name, bs):
    """Load a 20% sample of IMDB and build blurr DataLoaders for `model_name`.

    The train split becomes the training set and the test split the validation
    set (flagged through the `is_valid` column); each is sampled at 20% with a
    fixed seed.

    Args:
        model_name: Hugging Face model identifier.
        bs: Batch size for the DataLoaders.

    Returns:
        Tuple of (Hugging Face sequence-classification model, DataLoaders).
    """
    dataset = load_dataset('imdb')
    df = pd.DataFrame(dataset['train'])
    df['is_valid'] = False
    df = df.sample(frac=0.2, random_state=42)
    df2 = pd.DataFrame(dataset['test'])
    df2['is_valid'] = True
    df2 = df2.sample(frac=0.2, random_state=42)
    # DataFrame.append is deprecated (removed in pandas 2.0); concat is the equivalent
    df = pd.concat([df, df2], ignore_index=True)

    config = AutoConfig.from_pretrained(model_name)
    config.num_labels = len(df['label'].unique())
    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=AutoModelForSequenceClassification, config=config)

    dblock = DataBlock(blocks=(HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), CategoryBlock),  
                      get_x=ColReader('text'), 
                      get_y=ColReader('label'), 
                      splitter=ColSplitter())

    # Use `num_workers`, the keyword get_imagenette_dls also passes to dataloaders()
    return hf_model, dblock.dataloaders(df, bs=bs, num_workers=min(8, num_cpus()))

In [None]:
def train_imdb_fastai(model_name, epochs, name, precision=[16, 32]):
    """Fine-tune `model_name` on IMDB once per entry in `precision`.

    Runs at precision 16 use `nlp_bs`; every other precision uses half of it.
    The model and DataLoaders are rebuilt for each run.
    """
    for bits in precision:
        batch_size = nlp_bs if bits==16 else int(nlp_bs/2)
        hf_model, loaders = get_imdb_dls(model_name, batch_size)
        wrapped = HF_BaseModelWrapper(hf_model)
        train_fastai(
            loaders, wrapped, epochs, name,
            splitter=hf_splitter, cbs=[HF_BaseModelCallback], precision=bits,
        )

## Generic Fastai Training

In [None]:
def train_fastai(dls, model, epochs, name, splitter=trainable_params, cbs=[], precision=16):
    """Train `model` with fastai's one-cycle policy and log the run to wandb.

    Args:
        dls: fastai DataLoaders to train and validate on.
        model: The network to train.
        epochs: Number of epochs.
        name: Model label used in the wandb run name.
        splitter: Parameter-group splitter passed to ``Learner``.
        cbs: Extra callbacks passed to ``Learner``.
        precision: 16 enables mixed precision via ``to_fp16``; any other value
            trains without it. Also used in the run name.

    The wandb run is finished and GPU memory released even if training raises,
    so a failed run does not leak into the next benchmark.
    """
    run = wandb.init(project="sagecolab", name=f'{name} {plat} {GPU} fp{precision}')
    learn = None
    try:
        learn = Learner(dls, model, cbs=cbs, splitter=splitter).profile()
        if precision==16:
            learn.to_fp16()

        learn.fit_one_cycle(epochs, 3e-3, cbs=[WandbCallback(log=None, log_preds=False, log_model=False)])
    finally:
        run.finish()

        # Drop references before collecting so the cached CUDA memory can be freed
        if learn is not None:
            learn.dls = None
        learn = None
        gc.collect()
        torch.cuda.empty_cache()

# Training

## Train Imagenette Fastai

In [None]:
# XSE-ResNet50 on Imagenette at 224px for 4 epochs; the default precision list
# gives one fp16 run followed by one fp32 run.
model = partial(xse_resnet50, n_out=10)
train_imagenette_fastai(model, 4, name='XSEResNet50', size=224)

epoch,train_loss,valid_loss,time
0,1.631888,2.424804,02:21
1,1.306894,1.693266,02:16
2,1.02328,0.97449,02:18
3,0.788857,0.761688,02:19


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,556.4 s,100%
,epoch,139.0 s,1.708 s,4,556.1 s,100%
,train,124.3 s,1.390 s,4,497.4 s,89%
,validate,14.66 s,386.6ms,4,58.64 s,11%
train,batch,809.8ms,307.8ms,588,476.1 s,86%
,step,423.3ms,18.55ms,588,248.9 s,45%
,backward,339.3ms,188.5ms,588,199.5 s,36%
,pred,33.54ms,106.8ms,588,19.72 s,4%
,draw,10.70ms,49.37ms,588,6.290 s,1%
,zero_grad,1.899ms,72.35µs,588,1.117 s,0%


epoch,train_loss,valid_loss,time
0,1.647899,1.96353,05:26
1,1.317292,1.292668,05:20
2,0.987836,0.939724,04:59
3,0.762647,0.753206,05:17


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,1.265ks,100%
,epoch,316.3 s,9.994 s,4,1.265ks,100%
,train,282.7 s,8.503 s,4,1.131ks,89%
,validate,33.61 s,1.580 s,4,134.4 s,11%
train,batch,1.881 s,291.1ms,588,1.106ks,87%
,step,1.222 s,68.18ms,588,718.3 s,57%
,backward,615.1ms,188.3ms,588,361.7 s,29%
,pred,30.54ms,116.9ms,588,17.96 s,1%
,draw,10.75ms,49.31ms,588,6.324 s,0%
,zero_grad,2.042ms,74.03µs,588,1.200 s,0%


In [None]:
# XResNet18 with Mish activations on Imagenette at 128px for 4 epochs, fp16 only.
model = partial(xresnet18, n_out=10, act_cls=nn.Mish)
train_imagenette_fastai(model, 4, name='XResNet18 128', size=128, precision=[16])

epoch,train_loss,valid_loss,time
0,1.487867,1.625338,00:24
1,1.120165,1.095696,00:24
2,0.852057,0.833531,00:24
3,0.707498,0.699554,00:24


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,97.38 s,100%
,epoch,24.27 s,266.9ms,4,97.10 s,100%
,train,16.91 s,216.9ms,4,67.63 s,69%
,validate,7.366 s,59.39ms,4,29.46 s,30%
train,batch,102.7ms,89.08ms,588,60.37 s,62%
,step,34.92ms,6.544ms,588,20.53 s,21%
,draw,34.08ms,66.18ms,588,20.04 s,21%
,pred,19.72ms,19.87ms,588,11.60 s,12%
,backward,10.81ms,22.24ms,588,6.354 s,7%
,loss,1.525ms,2.732ms,588,896.5ms,1%


In [None]:
# XSE-ResNeXt50 with Mish activations on Imagenette at 256px for 4 epochs, fp16 only.
model = partial(xse_resnext50, n_out=10, act_cls=nn.Mish)
train_imagenette_fastai(model, 4, name='XSEResNeXt50', size=256, precision=[16])

epoch,train_loss,valid_loss,time
0,1.571577,2.244416,02:01
1,1.21076,1.496254,01:56
2,0.917405,0.870722,01:57
3,0.709971,0.727698,01:57


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,473.5 s,100%
,epoch,118.2 s,1.794 s,4,473.0 s,100%
,train,101.8 s,1.504 s,4,407.1 s,86%
,validate,16.46 s,295.2ms,4,65.85 s,14%
train,batch,688.3ms,269.2ms,588,404.7 s,85%
,step,415.7ms,16.85ms,588,244.4 s,52%
,backward,219.4ms,161.3ms,588,129.0 s,27%
,pred,36.57ms,79.68ms,588,21.50 s,5%
,draw,13.62ms,72.80ms,588,8.009 s,2%
,zero_grad,1.941ms,135.6µs,588,1.141 s,0%


## Train IMDB

In [None]:
# roberta-base on the IMDB sample for 4 epochs; the default precision list
# gives one fp16 run followed by one fp32 run.
train_imdb_fastai('roberta-base', 4, 'Roberta')

Reusing dataset imdb (/home/studio-lab-user/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Could not gather input dimensions


epoch,train_loss,valid_loss,time
0,0.896274,0.979292,02:28
1,0.70712,0.86746,02:28
2,0.697132,0.699118,02:29
3,0.695166,0.693299,02:29


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,596.8 s,100%
,epoch,149.2 s,458.8ms,4,596.7 s,100%
,train,110.7 s,329.2ms,4,442.9 s,74%
,validate,38.45 s,508.5ms,4,153.8 s,26%
train,batch,334.6ms,181.1ms,1248,417.6 s,70%
,backward,248.8ms,205.1ms,1248,310.5 s,52%
,step,60.07ms,38.08ms,1248,74.97 s,13%
,pred,16.65ms,3.348ms,1248,20.78 s,3%
,zero_grad,4.176ms,122.2µs,1248,5.212 s,1%
,draw,3.615ms,15.83ms,1248,4.512 s,1%


Reusing dataset imdb (/home/studio-lab-user/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Could not gather input dimensions


epoch,train_loss,valid_loss,time
0,0.729274,0.748159,06:25
1,0.722065,0.701497,06:31
2,0.702482,0.696236,06:30
3,0.693729,0.693131,05:57


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,1.525ks,100%
,epoch,381.1 s,13.67 s,4,1.525ks,100%
,train,287.9 s,10.89 s,4,1.152ks,76%
,validate,93.20 s,2.794 s,4,372.8 s,24%
train,batch,440.2ms,232.9ms,2500,1.101ks,72%
,backward,236.5ms,295.3ms,2500,591.3 s,39%
,step,181.6ms,116.5ms,2500,453.9 s,30%
,pred,13.60ms,2.383ms,2500,33.99 s,2%
,zero_grad,4.246ms,95.83µs,2500,10.62 s,1%
,draw,3.104ms,9.663ms,2500,7.760 s,1%


## Train Imagenette PyTorch Lightning

In [None]:
# timm ResNet50 without pretrained weights, trained through the Lightning loop
# (train_pl fixes precision to 16) at size 256 for 4 epochs.
model = partial(timm.create_model, model_name='resnet50', pretrained=False, num_classes=10)
train_pl(model, 4, name='ResNet50', size=256)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Requesting dataloader...



  | Name     | Type                       | Params
--------------------------------------------------------
0 | model    | ResNet                     | 23.5 M
1 | loss_fn  | LabelSmoothingCrossEntropy | 0     
2 | accuracy | Accuracy                   | 0     
--------------------------------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
47.057    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]