In [1]:
! nvidia-smi

Sat Dec 11 11:42:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Graphics Device     Off  | 00000000:01:00.0  On |                  N/A |
|  0%   45C    P3    32W / 170W |    165MiB / 12053MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
! cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 165
model name	: Intel(R) Core(TM) i5-10400F CPU @ 2.90GHz
stepping	: 5
microcode	: 0xe2
cpu MHz		: 802.039
cache size	: 12288 KB
physical id	: 0
siblings	: 12
core id		: 0
cpu cores	: 6
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 22
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid mpx rdseed adx smap clflushopt intel_pt 

In [3]:
! pip install fastcore --upgrade -qq
! pip install fastai --upgrade -qq
! pip install transformers --upgrade -qq
! pip install datasets --upgrade -qq
! pip install pytorch_lightning --upgrade -qq
! pip install wandb --upgrade -qq
! pip install ohmeow-blurr --upgrade -qq
! pip install timm --upgrade -qq
! pip install git+https://github.com/warner-benjamin/fastai_snippets.git -qq

In [4]:
import torch
torch.__version__

'1.7.1+cu110'

In [5]:
%env WANDB_SILENT=true

env: WANDB_SILENT=true


In [6]:
import wandb
wandb.login()

True

In [7]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Imports

In [8]:
import gc

# import fastai
from fastai.vision.all import *
from fastai.text.all import *
from fastai.callback.wandb import WandbCallback
from fastai_snippets.callback import simpleprofiler
from fastai_snippets.utils import simpleprofiler_wandb

# import blurr/huggingface
from blurr.data.all import *
from blurr.modeling.all import *
from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

#import pytorch lightning
import timm
import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from torch.optim.lr_scheduler import OneCycleLR
from torch.optim import AdamW
import torchvision.transforms as tvt
from torchvision.datasets import ImageFolder
from pytorch_lightning.loggers import WandbLogger
import torchmetrics

## Setup

In [9]:
# Labels interpolated into every W&B run name by the training helpers below.
plat, GPU = 'Colab local runtime rtx3060-12GB', 'rtx3060'
# Batch sizes for fp16 runs (images / text); the helpers halve them for other precisions.
img_bs, nlp_bs = 48, 12


In [10]:
# Download/extract both Imagenette variants up front (presumably to warm the local
# cache before the timed runs). `source` ends up pointing at the full-size dataset.
untar_data(URLs.IMAGENETTE_320)
source = untar_data(URLs.IMAGENETTE)

## Modify SimpleProfiler to Log Results to wandb

In [11]:
from pytorch_lightning.profiler.simple import SimpleProfiler

@patch
def summary(self:SimpleProfiler):
    """Log the profiler report to Weights & Biases as a table.

    Patched onto `SimpleProfiler` (via `@patch`), replacing its `summary` method.
    Builds a `wandb.Table` containing one "Total" row followed by one row per
    entry of `self._make_report()`, then logs it under the key "simple_profiler".
    When no durations were recorded the table is logged empty.

    Returns:
        None. The table is sent to the active wandb run as a side effect.
    """
    output_table = wandb.Table(columns=["Action", "Mean duration (s)", "Duration StDev (s)", "Num calls", "Total time (s)", "Percentage %"])
    if len(self.recorded_durations) > 0:
        report, total_duration = self._make_report()
        # Per-action statistics do not apply to the total row, so use "-" placeholders
        output_table.add_data("Total", "-", "-", "-", f"{total_duration:.5}", "100 %")
        for action, durations, duration_per in report:
            output_table.add_data(
                action,
                f"{np.mean(durations):.5}",
                f"{np.std(durations):.5}",
                f"{len(durations)}",
                f"{np.sum(durations):.5}",
                f"{duration_per:.5}",
            )

    wandb.log({"simple_profiler": output_table})

## PyTorch Lightning Imagenette

In [12]:
class ImagenetteDataModule(LightningDataModule):
    """LightningDataModule serving Imagenette or Imagewoof through `ImageFolder` datasets.

    Args:
        size: crop size used by the default transforms; also selects which
            download `prepare_data` fetches (320px variant when `size <= 224`).
        woof: if truthy use Imagewoof (URLs and normalisation stats), else Imagenette.
        bs: batch size for both dataloaders.
        train_transform: optional sequence of transforms to compose for training.
            Defaults to RandomResizedCrop + horizontal flip + ToTensor + Normalize.
        valid_transform: optional sequence of transforms to compose for validation.
            Defaults to CenterCrop + ToTensor + Normalize.
    """
    def __init__(self, size, woof, bs, train_transform=None, valid_transform=None):
        super().__init__()
        self.size, self.woof, self.bs = size, woof, bs
        imagewoof_stats  = ([0.496,0.461,0.399],[0.257,0.249,0.258])
        imagenette_stats = ([0.465,0.458,0.429],[0.285,0.28,0.301])
        stats = imagewoof_stats if woof else imagenette_stats

        if train_transform is None:
            self.train_transform = tvt.Compose([
                tvt.RandomResizedCrop(size, scale=(0.35, 1)),
                tvt.RandomHorizontalFlip(),
                tvt.ToTensor(),
                tvt.Normalize(*stats)])
        else:
            # torchvision.transforms is imported as `tvt` in this notebook
            self.train_transform = tvt.Compose(train_transform)

        if valid_transform is None:
            self.valid_transform = tvt.Compose([
                tvt.CenterCrop(size),
                tvt.ToTensor(),
                tvt.Normalize(*stats)])
        else:
            self.valid_transform = tvt.Compose(valid_transform)

    def prepare_data(self):
        """Download/extract the dataset and remember its root folder in `self.source`."""
        # NOTE(review): `setup` relies on `self.source` being set here, which presumably
        # only holds for single-process training - confirm before using multiple devices.
        if self.size<=224: path = URLs.IMAGEWOOF_320 if self.woof else URLs.IMAGENETTE_320
        else             : path = URLs.IMAGEWOOF     if self.woof else URLs.IMAGENETTE
        self.source = untar_data(path)

    def setup(self, stage=None):
        """Create the datasets: train only for the "fit"/None stage, val for every stage."""
        if stage == "fit" or stage is None:
            self.train = ImageFolder(self.source/'train', self.train_transform)
        self.val = ImageFolder(self.source/'val', self.valid_transform)

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        # NOTE(review): `DataLoader` resolves through the notebook's star imports -
        # confirm it is the intended class.
        return DataLoader(self.train, batch_size=self.bs, shuffle=True, pin_memory=True, num_workers=min(8, num_cpus()))

    def val_dataloader(self):
        """Return the (unshuffled) validation dataloader."""
        return DataLoader(self.val, batch_size=self.bs, pin_memory=True, num_workers=min(8, num_cpus()))

    def teardown(self, stage=None):
        """Drop dataset references so they can be garbage collected between runs."""
        self.train = None
        self.val = None

In [13]:
class ResNet(LightningModule):
    """LightningModule wrapping an image classifier trained with AdamW and a one-cycle schedule.

    Args:
        model: zero-argument callable that builds the network (called once here).
        lr: peak learning rate; used as AdamW's `lr` and OneCycleLR's `max_lr`.
        mom: saved in `hparams` but not read by this class.
            NOTE(review): presumably intended as the optimizer momentum - confirm.
        wd: weight decay passed to AdamW.
    """
    def __init__(self, model, lr=3e-3, mom=0.9, wd=1e-2):
        super().__init__()

        # `model` is a factory rather than a hyperparameter, so keep it out of hparams
        self.save_hyperparameters(ignore='model')
        self.model = model()
        self.loss_fn = LabelSmoothingCrossEntropy()
        self.accuracy = torchmetrics.Accuracy()

    def forward(self, x):
        """Run the wrapped network on `x`."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute, log ("train_loss") and return the loss for one training batch."""
        x, y = batch
        x = self(x)
        loss = self.loss_fn(x, y)
        self.log("train_loss", loss)
        return loss

    def evaluate(self, batch, stage=None):
        """Shared validation/test step: update accuracy and, if `stage` is given, log
        `{stage}_loss` and `{stage}_acc`."""
        x, y = batch
        x = self(x)
        loss = self.loss_fn(x, y)
        preds = F.softmax(x, dim=-1).argmax(dim=-1)
        self.accuracy(preds, y)

        if stage:
            self.log(f"{stage}_loss", loss, prog_bar=True, on_epoch=True)
            self.log(f"{stage}_acc", self.accuracy, prog_bar=True, on_epoch=True)

    def validation_step(self, batch, batch_idx):
        self.evaluate(batch, "val")

    def test_step(self, batch, batch_idx):
        self.evaluate(batch, "test")

    def configure_optimizers(self):
        """Return AdamW plus a OneCycleLR scheduler that is stepped after every optimizer step."""
        # Pass the stored `wd` hyperparameter explicitly instead of silently relying on
        # AdamW's built-in default.
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr, eps=1e-5, weight_decay=self.hparams.wd)

        scheduler_dict = {
            "scheduler": OneCycleLR(
                optimizer=optimizer,
                max_lr =self.hparams.lr,
                total_steps=self.num_training_steps
            ),
            "interval": "step",
            "frequency": 1
        }
        return {"optimizer": optimizer, "lr_scheduler": scheduler_dict}

    # lightly modified from rohitgr7 https://github.com/PyTorchLightning/pytorch-lightning/issues/10760
    @property
    def num_training_steps(self) -> int:
        """Total training steps inferred from datamodule and devices."""
        if self.trainer.num_training_batches != float('inf'):
            dataset_size = self.trainer.num_training_batches
        else:
            print('Requesting dataloader...')
            dataset_size = len(self.trainer._data_connector._train_dataloader_source.dataloader())

        # limit_train_batches is either an absolute batch count (int) or a fraction (float)
        if isinstance(self.trainer.limit_train_batches, int):
            dataset_size = min(dataset_size, self.trainer.limit_train_batches)
        else:
            dataset_size = int(dataset_size * self.trainer.limit_train_batches)

        accelerator_connector = self.trainer._accelerator_connector
        if accelerator_connector.use_ddp2 or accelerator_connector.use_dp:
            effective_devices = 1
        else:
            effective_devices = self.trainer.devices

        effective_devices = effective_devices * self.trainer.num_nodes
        effective_batch_size = self.trainer.accumulate_grad_batches * effective_devices
        # True division before ceil: a trailing partial accumulation window still counts
        # as a step, so rounding down would give OneCycleLR too few total steps.
        max_estimated_steps = math.ceil(dataset_size / effective_batch_size) * self.trainer.max_epochs

        # An explicit max_steps (anything other than -1) caps the estimate
        max_estimated_steps = min(max_estimated_steps, self.trainer.max_steps) if self.trainer.max_steps != -1 else max_estimated_steps
        return max_estimated_steps


In [14]:
def train_pl(model, epochs, name, size, precision=(16,)):
    """Benchmark a PyTorch Lightning training run on Imagenette, once per precision.

    Args:
        model: zero-argument callable that builds the network (handed to `ResNet`).
        epochs: number of epochs for each run.
        name: label used as the prefix of the W&B run name.
        size: image size handed to `ImagenetteDataModule`.
        precision: iterable of precisions to run in sequence (default: fp16 only).
            fp16 runs use `img_bs`; any other precision uses half of it.
    """
    for fp in precision:
        resnet = ResNet(model)
        imagenette = ImagenetteDataModule(size, False, img_bs if fp==16 else int(img_bs/2))

        wandb_logger = WandbLogger(project="sagecolab", name=f'{name} {plat} {GPU} fp{fp}', log_model=False)
        trainer = Trainer(gpus=1, precision=fp, max_epochs=epochs, num_sanity_val_steps=0,
                          benchmark=True, profiler="simple", logger=wandb_logger, enable_checkpointing=False)
        trainer.fit(resnet, imagenette)
        wandb.log({}) # ensure sync of last step
        wandb.finish()

        # Drop references and clear the CUDA cache before the next run
        trainer, resnet, imagenette= None, None, None
        gc.collect()
        torch.cuda.empty_cache()


## Fastai Imagenette

In [15]:
# Normalisation statistics, unpacked into `Normalize.from_stats` below.
imagewoof_stats =  ([0.496,0.461,0.399],[0.257,0.249,0.258])
imagenette_stats = ([0.465,0.458,0.429],[0.285,0.28,0.301])

def get_imagenette_dls(size, woof, bs, sh=0., augs=None, workers=None, stats=True):
    """Build fastai DataLoaders for Imagenette, or Imagewoof when `woof` is truthy.

    Args:
        size: target size for `RandomResizedCrop`; sizes up to 224 use the 320px download.
        woof: select Imagewoof instead of Imagenette.
        bs: batch size (overridden to 64 when `size == 128`).
        sh: if non-zero, append `RandomErasing` with this `sh` value.
        augs: optional list of extra batch transforms.
        workers: dataloader worker count; defaults to `min(8, num_cpus())`.
        stats: whether to add dataset-specific normalisation.
    """
    if woof: url = URLs.IMAGEWOOF_320 if size<=224 else URLs.IMAGEWOOF
    else:    url = URLs.IMAGENETTE_320 if size<=224 else URLs.IMAGENETTE
    source = untar_data(url)

    workers = min(8, num_cpus()) if workers is None else workers

    # Assemble batch transforms in order: normalise, user augmentations, random erasing
    batch_tfms = []
    if stats:
        norm_stats = imagewoof_stats if woof else imagenette_stats
        batch_tfms.append(Normalize.from_stats(*norm_stats))
    if augs:
        batch_tfms.extend(augs)
    if sh:
        batch_tfms.append(RandomErasing(p=0.3, max_count=3, sh=sh))

    item_tfms = [RandomResizedCrop(size, min_scale=0.35), FlipItem(0.5)]
    dblock = DataBlock(blocks=(ImageBlock, CategoryBlock),
                       splitter=GrandparentSplitter(valid_name='val'),
                       get_items=get_image_files, get_y=parent_label,
                       item_tfms=item_tfms,
                       batch_tfms=batch_tfms)

    # 128px runs always use a fixed batch size of 64, regardless of the `bs` argument
    bs = 64 if size==128 else bs
    return dblock.dataloaders(source, path=source, bs=bs, num_workers=workers)

In [16]:
def train_imagenette_fastai(model, epochs, name, size, precision=[16, 32], augs=None):
    """Train a fresh `model()` on Imagenette with fastai, once per entry of `precision`."""
    for prec in precision:
        # fp16 runs get the full image batch size; every other precision gets half
        batch_size = img_bs if prec==16 else int(img_bs/2)
        train_fastai(get_imagenette_dls(size, False, batch_size, augs=augs),
                     model(), epochs, name, precision=prec)


## Fastai-Blurr IMDB

In [17]:
def get_imdb_dls(model_name, bs):
    """Build a Hugging Face model and blurr/fastai DataLoaders on a 10% sample of IMDB.

    Args:
        model_name: Hugging Face checkpoint name passed to `AutoConfig` and `BLURR.get_hf_objects`.
        bs: batch size for the dataloaders.

    Returns:
        Tuple `(hf_model, dls)`: the sequence-classification model and the DataLoaders.
    """
    dataset = load_dataset('imdb')

    # 10% of the train split is used for training (is_valid=False)...
    train_df = pd.DataFrame(dataset['train'])
    train_df['is_valid'] = False
    train_df = train_df.sample(frac=0.1, random_state=42)

    # ...and 10% of the test split for validation (is_valid=True)
    valid_df = pd.DataFrame(dataset['test'])
    valid_df['is_valid'] = True
    valid_df = valid_df.sample(frac=0.1, random_state=42)

    # pd.concat replaces DataFrame.append, which is deprecated and removed in newer pandas
    df = pd.concat([train_df, valid_df], ignore_index=True)

    config = AutoConfig.from_pretrained(model_name)
    config.num_labels = len(df['label'].unique())
    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=AutoModelForSequenceClassification, config=config)

    dblock = DataBlock(blocks=(HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), CategoryBlock),
                      get_x=ColReader('text'),
                      get_y=ColReader('label'),
                      splitter=ColSplitter())

    # The fastai dataloader argument is `num_workers` (as used in get_imagenette_dls)
    return hf_model, dblock.dataloaders(df, bs=bs, num_workers=min(8, num_cpus()))


In [18]:
def train_imdb_fastai(model_name, epochs, name, precision=(16,)):
    """Benchmark fine-tuning a Hugging Face model on IMDB with fastai + blurr.

    Args:
        model_name: Hugging Face checkpoint name passed to `get_imdb_dls`.
        epochs: number of epochs for each run.
        name: label used as the prefix of the W&B run name.
        precision: iterable of precisions to run in sequence (default: fp16 only).
            fp16 runs use `nlp_bs`; any other precision uses half of it.
    """
    for fp in precision:
        hf_model, dls = get_imdb_dls(model_name, nlp_bs if fp==16 else int(nlp_bs/2))
        model = HF_BaseModelWrapper(hf_model)
        train_fastai(dls, model, epochs, name, splitter=hf_splitter, cbs=[HF_BaseModelCallback], precision=fp)


## Generic Fastai Training

In [19]:
def train_fastai(dls, model, epochs, name, splitter=trainable_params, cbs=[], precision=16):
    """Run one profiled fastai one-cycle training and log it to a dedicated W&B run.

    Args:
        dls: fastai DataLoaders to train on.
        model: instantiated model to wrap in a `Learner`.
        epochs: number of epochs for `fit_one_cycle`.
        name: label used as the prefix of the W&B run name.
        splitter: parameter-group splitter passed to `Learner`.
        cbs: callbacks passed to `Learner`.
        precision: 16 switches the learner to fp16; any other value leaves it unchanged.
    """
    run_name = f'{name} {plat} {GPU} fp{precision}'
    run = wandb.init(project="sagecolab", name=run_name)

    # `.profile()` is presumably the Learner method added by fastai_snippets' simpleprofiler
    learn = Learner(dls, model, cbs=cbs, splitter=splitter).profile()
    if precision==16: learn.to_fp16()

    wandb_cb = WandbCallback(log=None, log_preds=False, log_model=False)
    learn.fit_one_cycle(epochs, 3e-3, cbs=[wandb_cb])

    run.finish()

    # Drop references and clear the CUDA cache before the next run
    learn.dls = None
    learn = None
    gc.collect()
    torch.cuda.empty_cache()

# Training
## Train Imagenette Fastai

In [20]:
# fastai benchmark: XSE-ResNet50 on 224px Imagenette, 2 epochs, fp16 only
model = partial(xse_resnet50, n_out=10)
train_imagenette_fastai(model, epochs=2, name='XSEResNet50', size=224, precision=[16])

epoch,train_loss,valid_loss,time
0,1.555547,1.96274,02:40
1,1.095908,1.020958,02:36


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,317.9 s,100%
,epoch,158.8 s,1.949 s,2,317.7 s,100%
,train,144.5 s,1.706 s,2,289.0 s,91%
,validate,14.34 s,242.5ms,2,28.68 s,9%
train,batch,714.6ms,236.2ms,394,281.6 s,89%
,step,492.6ms,22.98ms,394,194.1 s,61%
,backward,183.4ms,165.4ms,394,72.26 s,23%
,pred,28.72ms,72.04ms,394,11.32 s,4%
,draw,5.568ms,28.41ms,394,2.194 s,1%
,zero_grad,3.498ms,194.1µs,394,1.378 s,0%


In [21]:
# fastai benchmark: XResNet18 on 128px Imagenette, 2 epochs, fp16 only
model = partial(xresnet18, n_out=10)
train_imagenette_fastai(model, epochs=2, name='XResNet18 128', size=128, precision=[16])

epoch,train_loss,valid_loss,time
0,1.457662,1.685928,00:13
1,1.070624,0.977206,00:12


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,25.82 s,100%
,epoch,12.82 s,364.8ms,2,25.64 s,99%
,train,9.924 s,369.0ms,2,19.85 s,77%
,validate,2.893 s,4.527ms,2,5.785 s,22%
train,batch,65.61ms,75.05ms,294,19.29 s,75%
,step,40.80ms,1.817ms,294,12.00 s,46%
,pred,8.972ms,13.83ms,294,2.638 s,10%
,backward,7.594ms,25.65ms,294,2.233 s,9%
,draw,6.301ms,40.94ms,294,1.853 s,7%
,zero_grad,1.267ms,232.9µs,294,372.5ms,1%


## Train IMDB

In [22]:
# fastai + blurr benchmark: roberta-base on the IMDB sample, 1 epoch
train_imdb_fastai(model_name='roberta-base', epochs=1, name='Roberta')


Reusing dataset imdb (/home/dev2/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Could not gather input dimensions


epoch,train_loss,valid_loss,time
0,0.715082,0.692518,01:09


Phase,Action,Mean Duration,Duration Std Dev,Number of Calls,Total Time,Percent of Total
fit,fit,-,-,1,69.97 s,100%
,epoch,69.93 s,-,1,69.93 s,100%
,train,52.94 s,-,1,52.94 s,76%
,validate,16.98 s,-,1,16.98 s,24%
train,batch,240.3ms,126.1ms,208,49.97 s,71%
,backward,188.2ms,119.0ms,208,39.14 s,56%
,step,28.74ms,2.388ms,208,5.977 s,9%
,pred,15.72ms,1.713ms,208,3.270 s,5%
,draw,3.676ms,30.35ms,208,764.6ms,1%
,zero_grad,3.096ms,140.0µs,208,643.9ms,1%


## Train Imagenette PyTorch Lightning

In [23]:
# PyTorch Lightning benchmark: timm ResNet50 (randomly initialised) on 224px Imagenette, 2 epochs
model = partial(timm.create_model, model_name='resnet50', pretrained=False, num_classes=10)
train_pl(model, epochs=2, name='ResNet50', size=224)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Requesting dataloader...



  | Name     | Type                       | Params
--------------------------------------------------------
0 | model    | ResNet                     | 23.5 M
1 | loss_fn  | LabelSmoothingCrossEntropy | 0     
2 | accuracy | Accuracy                   | 0     
--------------------------------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
47.057    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]