# PyTorch 


In [None]:
! pip install torch torchtext 

Keep a tab open for pytorch docs : https://pytorch.org/docs/stable/index.html

In [2]:
import logging as log
from pathlib import Path

import torch

log.basicConfig(level=log.INFO)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
log.info(f'device={device}')

INFO:root:device=cuda


# torchtext
torchtext docs refer to https://pytorch.org/text/experimental_datasets.html

In [4]:
from torchtext.experimental.datasets import YahooAnswers, DBpedia, AG_NEWS

root = 'cache-torchtext'
#train, test = YahooAnswers(root=root)
#train, valid = DBpedia(root=root)
train, valid = AG_NEWS(root=root)   # using this because it is small
type(train), type(valid)

INFO:root:Downloading from Google Drive; may take a few minutes
INFO:root:File cache-torchtext/ag_news_csv.tar.gz already exists.
INFO:root:Opening tar file cache-torchtext/ag_news_csv.tar.gz.
INFO:root:cache-torchtext/ag_news_csv/train.csv already extracted.
INFO:root:cache-torchtext/ag_news_csv/test.csv already extracted.
INFO:root:cache-torchtext/ag_news_csv/classes.txt already extracted.
INFO:root:cache-torchtext/ag_news_csv/readme.txt already extracted.
120000lines [00:01, 111867.30lines/s]


(torchtext.experimental.datasets.text_classification.TextClassificationDataset,
 torchtext.experimental.datasets.text_classification.TextClassificationDataset)

# DataLoader with Mini Batches

- Torch's [datatypes](https://pytorch.org/docs/stable/tensors.html#torch-tensor)
- [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader)


In [3]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    texts, labels = [], []
    for label, txt in batch:
        texts.append(txt)
        labels.append(label)        
    return texts, labels

train_loader = DataLoader(train, batch_size=3, collate_fn=collate_fn, shuffle=True)

for idx, (texts, labels) in enumerate(train_loader):
    print(idx, labels)
    print(texts)
    break

0 [tensor(3), tensor(3), tensor(4)]
[tensor([ 2695,   370, 10586,  1095,    26,    13,    10,     6,  5300, 13907,
           13,    10,  9045,  9358,  4345,   378,   370,    60,     7,     6,
        14337,  5258,     9,  2385,   253,     3,  5874,     2,   903,   234,
           31,  6861,     4, 22318,  2347,    18,    94,   970]), tensor([  307,   976,  1150,   233,     5,  2834,   113,   226,    50,  1150,
           13,   279,    21,   976,   324,  1062,    27,    70,   116,    18,
           31,    45, 10844,   238,    45,   438,     7,  1150,   974,     5,
         2834,   259,     8,   451,   303,     7,   761,    69,     3,   788,
           13,    10, 10500,   414,  2365,     2]), tensor([49562,   617,     5,   355,  5697,     4,   163,     8, 10297,   755,
            6,   734,     7, 49562,  2007,    99,    65,     2,   299,    58,
            5,   355,  5697,     4,   163,     5,     3,   195,   101,     5,
         1105,  1872,    18,     3, 13627,   286,  1374,   213, 1

We have unequal length sequences:
- use padding
- keep track of lengths


In [5]:
# vocabul
train.vocab.itos[:10]
PAD_IDX = train.vocab.stoi['<pad>']
UNK_IDX = train.vocab.stoi['<unk>']
log.info(f'vocabulary size= {len(train.vocab):,}')
log.info(f'<unk>={UNK_IDX}')
log.info(f'<pad>={PAD_IDX}')

from collections import Counter
all_labels = Counter(label for label, txt in train.data)
print(all_labels)

INFO:root:vocabulary size= 95,812
INFO:root:<unk>=0
INFO:root:<pad>=1


Counter({3: 30000, 4: 30000, 2: 30000, 1: 30000})


In [6]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    texts, labels = [], []
    for label, txt in batch:
        texts.append(txt)
        labels.append(label)

    labels = torch.tensor(labels, dtype=torch.uint8)    
    lengths = [len(txt) for txt in texts]
    lengths = torch.tensor(lengths, dtype=torch.short)

    seqs = torch.full(size=(len(texts), lengths.max()),
                      fill_value=PAD_IDX, dtype=torch.long)
    for idx, txt in enumerate(texts):
        seqs[idx, :len(txt)] = txt
    
    return seqs, labels, lengths
train_loader = DataLoader(train, batch_size=3, collate_fn=collate_fn, shuffle=True)

for idx, (texts, labels, lengths) in enumerate(train_loader):
    print(idx, labels, lengths)
    print(texts)
    break

0 tensor([2, 1, 3], dtype=torch.uint8) tensor([27, 35, 35], dtype=torch.int16)
tensor([[    6,    17,    10,   916,  9506,  9426,  3603,  5293,  1720,  9506,
            29, 13413,     6, 29183,    18,    94,  1319,   284,    30,     3,
          1601,  2817,     5,     3,  1387,  2382,     2,     1,     1,     1,
             1,     1,     1,     1,     1],
        [ 1634,   673,    12,    24,  2039,     5,  1218,  1220,     3,  1558,
           214,     4,   334,  2606,  1634,     4,    29,   441,    12,     6,
            24,  2039,     5,  5633,     3, 21941,  1220,    19,   197,    39,
             3,  2410,     7,  1218,     2],
        [   37,   254,  1800,    67,  6355, 20892,     6,    37,   930,  2979,
            99,  1376,    27,    26,    87,  2074,     6,  2051,   746,     8,
          3561,  8028,  5720,  6355, 20892,     8,     6,   122, 14697,     3,
            55,    21,  9588,    83,     2]])


In [6]:
from torch.utils.data import DataLoader

from dataclasses import dataclass


@dataclass
class Batch:

    texts: torch.Tensor
    labels: torch.Tensor
    lengths: torch.Tensor

    def __len__(self):
        return len(seqs) 

    def tok_count(self):
        return self.lengths.sum()

    def to(self, device):
        # there is an option called pin_memory for advanced users
        self.texts = self.texts.to(device)
        self.labels = self.labels.to(device)
        self.lengths = self.lengths.to(device)        
        return self

    @classmethod
    def collate_fn(cls, batch, max_len=80) -> 'Batch':
        # handle edge cases, define max_len 
        texts, labels = [], []
        for label, txt in batch:
            texts.append(txt[:max_len])
            labels.append(label)

        labels = torch.tensor(labels, dtype=torch.uint8)    
        lengths = [len(txt) for txt in texts]
        lengths = torch.tensor(lengths, dtype=torch.short)

        seqs = torch.full(size=(len(texts), lengths.max()),
                          fill_value=PAD_IDX, dtype=torch.long)
        for idx, txt in enumerate(texts):
            seqs[idx, :len(txt)] = txt
    
        return cls(texts=seqs, labels=labels, lengths=lengths)

train_loader = DataLoader(train, batch_size=3, collate_fn=Batch.collate_fn, shuffle=True)

for idx, batch in enumerate(train_loader):
    print(idx, batch.labels, batch.lengths)
    print(batch.texts)
    break

0 tensor([2, 3, 1], dtype=torch.uint8) tensor([38, 48, 19], dtype=torch.int16)
tensor([[77866,   159,   243,   621,    44,  1376,     8,     3,    90, 17327,
            20,     6,   446,   111,   231,  4091,  1882,  8017,  5643,  3778,
             9,  4029,  4592,     4,     3,   434,  4080,     3,  1501,    41,
           250,  1376,    40,     6,  1982,     8,  1933,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1],
        [  341,   109,  2707,   338,  1380,  1043,  1531,   873,  1364,  2707,
            27,    26,   603, 68465,    83,    14, 93379,    83,    15,     8,
             3,    48,   512,     7,     3,    72,     4,   685,  1113,     3,
           788,    13,    10, 68451,    83,    14, 93339,    83,    15,  1043,
            12,     3,   730,  1740,     3,  1602,    72,     2],
        [ 1029,  4734, 11178,   146,   197,    17,    10,  5777,  4734,  1507,
             3,   146,    12,   290,  7693,     4,    64,  1945,     2,     1,

In [5]:
!gpustat

[1m[37msaga22                [m  Tue Oct 20 17:16:07 2020  [1m[30m440.44[m
[36m[0][m [34mGeForce GTX 1080 Ti[m |[31m 34'C[m, [32m  0 %[m | [36m[1m[33m   10[m / [33m11178[m MB |


# `tqdm`

tqdm provides progress bar https://github.com/tqdm/tqdm

In [6]:
from tqdm import tqdm
from time import time as time_now

steps = 100000

count = 0
epoch = 0
total_toks = 0
start_time = time_now()
while count < steps:
    # https://github.com/tqdm/tqdm#documentation 
    with tqdm(train_loader) as pbar:
        for batch in pbar:
            count += 1
            total_toks += batch.tok_count()
            toks_speed = int(total_toks / (time_now() - start_time))
            pbar.set_postfix(dict(updates=count, epoch=epoch, toks_speed=f'{toks_speed:,}'))
            if count % 100 == 0:  # testing 
                break
    log.info(f'Epoch {epoch} completed')
    epoch += 1
    if epoch > 2:
        break

  0%|          | 99/40000 [00:00<01:14, 538.09it/s, updates=100, epoch=0, toks_speed=69,922]
INFO:root:Epoch 0 completed
  0%|          | 99/40000 [00:00<00:58, 677.43it/s, updates=200, epoch=1, toks_speed=77,262]
INFO:root:Epoch 1 completed
  0%|          | 99/40000 [00:00<00:46, 856.26it/s, updates=300, epoch=2, toks_speed=85,590]
INFO:root:Epoch 2 completed


In [8]:
from tqdm import tqdm
from time import time as time_now

def looping_iter(data, total):
    """Flattens multiple epoch loops into one """
    count = 0
    epoch = 0
    while count < total:
        for item in data:
            yield item
            count += 1
            if count >= total:
                break
        if count < total:
            log.info(f"End of epoch {epoch}")
        epoch += 1

steps = 100_000
total_toks = 0
start_time = time_now()
with tqdm(looping_iter(train_loader, total=steps), total=steps) as pbar:
    for count, batch in enumerate(pbar):
        total_toks += batch.tok_count()
        toks_speed = int(total_toks / (time_now() - start_time))
        pbar.set_postfix_str(f'speed={toks_speed:,}toks/sec')
        if (count + 1) % 1000 == 0:  # testing 
            break

  1%|          | 999/100000 [00:01<02:03, 802.92it/s, speed=103,930toks/sec]


In [9]:
import torch.nn as nn                     # neural networks
import torch.nn.functional as F           # layers, activations and more
import torch.optim as optim

@dataclass
class Trainer:
    model: nn.Module
    opt: optim.Optimizer
    #loss_func = nn.CrossEntropyLoss()      # object oriented API
    loss_func = F.cross_entropy             # functional API 

# I use object-oriented API for components with states
#    and functional API for stateless components

In [10]:
from tqdm import tqdm
from time import time as time_now

  
@dataclass
class Trainer:
    model: nn.Module
    opt: optim.Optimizer = None
    loss_func = F.cross_entropy
    device = device

    def train(self, train_loader: DataLoader, valid_loader: DataLoader, steps: int, checkpoint: int):
        total_toks = 0
        start_time = time_now()
        train_data = looping_iter(train_loader, total=steps)
        with tqdm(train_data, total=steps) as databar:
            for step, batch in enumerate(databar, start=1):
                total_toks += batch.tok_count()
                toks_speed = int(total_toks / (time_now() - start_time))

                databar.set_postfix_str(f'{toks_speed:,}toks/sec')
                # TODO: train 

                if step % checkpoint == 0:
                    log.info(f"TODO: checkpoint at step: {step:,}")

                if step >= steps:
                    break

BATCH_SIZE = 40
train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=Batch.collate_fn)
valid_loader = DataLoader(valid, batch_size=BATCH_SIZE, shuffle=False, collate_fn=Batch.collate_fn)

trainer = Trainer(model=None, opt=None)
trainer.train(train_loader, valid_loader, steps=1000, checkpoint=200)

 19%|█▊        | 187/1000 [00:00<00:03, 219.74it/s, 351,262toks/sec]INFO:root:TODO: checkpoint at step: 200
 39%|███▉      | 394/1000 [00:01<00:02, 233.77it/s, 382,230toks/sec]INFO:root:TODO: checkpoint at step: 400
 60%|█████▉    | 599/1000 [00:02<00:01, 215.95it/s, 377,730toks/sec]INFO:root:TODO: checkpoint at step: 600
 79%|███████▊  | 787/1000 [00:03<00:00, 239.25it/s, 381,976toks/sec]INFO:root:TODO: checkpoint at step: 800
100%|█████████▉| 995/1000 [00:04<00:00, 211.97it/s, 378,886toks/sec]INFO:root:TODO: checkpoint at step: 1,000
100%|█████████▉| 999/1000 [00:04<00:00, 220.32it/s, 378,886toks/sec]


In [12]:
%load_ext autoreload
%autoreload 2

In [17]:
from model import TextClassifier

vocab = train.vocab
n_classes = max(all_labels) + 1
model_args = dict(vocab_size=len(vocab), n_classes=n_classes,
                  model_dim=128, n_heads=2, n_layers=2, ff_dim=256,
                  padding_idx=PAD_IDX)
# Note: save model_args somewhere
model = TextClassifier(**model_args)

In [13]:
from tqdm import tqdm
from time import time as time_now

class Trainer:

    def __init__(self, model, lr=5e-4, device=device,
                 opt=None, loss_func=None, lr_scheduler=None):
        self.device = device        
        self.model = model.to(self.device)
        self.opt = opt or optim.Adam(self.model.parameters(), lr=lr)
        self.loss_func = loss_func or F.cross_entropy
        self.lr_scheduler = lr_scheduler
        
    def validate(self, valid_loader) -> float:
        total = 0.
        count = 0
        for batch in valid_loader:
            batch.to(device)
            scores = self.model(texts=batch.texts, lengths=batch.lengths, out='raw')
            loss = self.loss_func(input=scores, target=batch.labels.long(), reduction='mean')
            total += loss.item()
            count += 1
        return total / count

    def checkpoint(self):
        pass

    def train(self, train_loader: DataLoader, valid_loader: DataLoader, steps: int, checkpoint: int):
        train_loss = 0.
        total_toks = 0
        start_time = time_now()
        self.model.train(True) #Training mode
        train_data = looping_iter(train_loader, total=steps)
        with tqdm(train_data, total=steps) as databar:
            for step, batch in enumerate(databar, start=1):
                batch.to(self.device)

                scores = self.model(texts=batch.texts, lengths=batch.lengths, out='raw')
                # NOTE: loss_func accepts long values for target 
                loss = self.loss_func(input=scores, target=batch.labels.long(), reduction='mean')

                loss.backward()
                self.opt.step()
                self.opt.zero_grad()

                loss_val = loss.detach().item()
                train_loss += loss_val
                total_toks += batch.tok_count()
                toks_speed = int(total_toks / (time_now() - start_time))

                databar.set_postfix_str(f'loss:{loss_val:.4f} speed:{toks_speed:,}toks/sec', refresh=False)

                if step % checkpoint == 0:
                    with torch.no_grad():
                        self.model.train(False)
                        val_loss = self.validate(valid_loader)
                        train_loss /= checkpoint
                        log.info(f'\nCheckpoint at {step}; train_loss={train_loss:.4f} valid_loss={val_loss:.4f}')
                        # TODO: checkpoint
                        self.model.train(True)
                        train_loss = 0.
                        start_time = time_now()
                        total_toks = 0


In [32]:
BATCH_SIZE = 600
train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=Batch.collate_fn)
valid_loader = DataLoader(valid, batch_size=BATCH_SIZE, shuffle=False, collate_fn=Batch.collate_fn)


# TIP: Check if the model can overfit to smaller dataset

 _smaller dataset_ => 1 batch

In [None]:
trainer = Trainer(model=model)
batch1 = next(iter(train_loader))
trainer.train(train_loader=[batch1], valid_loader=valid_loader, steps=10000, checkpoint=100)

# Speed on CPU vs GPU

In [None]:
# Check speed on CPU 
trainer2 = Trainer(model=model, device='cpu')
trainer2.train(train_loader=train_loader, valid_loader=valid_loader, steps=10000, checkpoint=100)

In [None]:
trainer = Trainer(model=model, device='cuda:0')
trainer.train(train_loader=train_loader, valid_loader=valid_loader, steps=10000, checkpoint=100)

In [None]:
BATCH_SIZE = 600
train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=Batch.collate_fn)
valid_loader = DataLoader(valid, batch_size=BATCH_SIZE, shuffle=False, collate_fn=Batch.collate_fn)
trainer = Trainer(model=model, device='cuda:0')
trainer.train(train_loader=train_loader, valid_loader=valid_loader, steps=10000, checkpoint=100)

---

# Tensorboard

part of `torch.utils` package
```python
from torch.utils.tensorboard import SummaryWriter
```
The class takes these args
```python
SummaryWriter(log_dir=None, comment='', purge_step=None, max_queue=10, flush_secs=120, filename_suffix='')
```

and offers these functions:

```python
add_scalar(tag, scalar_value, global_step=None, walltime=None)
add_scalars(main_tag, tag_scalar_dict, global_step=None, walltime=None)
add_graph(model, input_to_model=None, verbose=False)
add_embedding(mat, metadata=None, label_img=None, global_step=None, tag='default', metadata_header=None)

add_text(tag, text_string, global_step=None, walltime=None)
add_histogram(tag, values, global_step=None, bins='tensorflow', walltime=None, max_bins=None)
add_image(tag, img_tensor, global_step=None, walltime=None, dataformats='CHW')
add_images(tag, img_tensor, global_step=None, walltime=None, dataformats='NCHW')
add_figure(tag, figure, global_step=None, close=True, walltime=None)
add_video(tag, vid_tensor, global_step=None, fps=4, walltime=None)
add_audio(tag, snd_tensor, global_step=None, sample_rate=44100, walltime=None)
add_pr_curve(tag, labels, predictions, global_step=None, num_thresholds=127, weights=None, walltime=None)
add_hparams(hparam_dict=None, metric_dict=None)

```

Install tensorboard: `pip install tensorboard`

To launch tensorboard 
```bash
tensorboard --logdir <dirname> # --host 0.0.0.0
```

In [19]:
from torch.utils.tensorboard import SummaryWriter

class Trainer:

    def __init__(self, model, lr=5e-4, device=device,
                 opt=None, loss_func=None, lr_scheduler=None):
        self.device = device        
        self.model = model.to(self.device)
        self.opt = opt or optim.Adam(self.model.parameters(), lr=lr)
        self.loss_func = loss_func or F.cross_entropy
        self.lr_scheduler = lr_scheduler
        self.tbd = SummaryWriter(log_dir='tensorboard-logs')    # 👈 
        
    def validate(self, valid_loader) -> float:
        total = 0.
        count = 0
        for batch in valid_loader:
            batch.to(device)
            scores = self.model(texts=batch.texts, lengths=batch.lengths, out='raw')
            loss = self.loss_func(input=scores, target=batch.labels.long(), reduction='mean')
            total += loss.item()
            count += 1
        return total / count
    
    def log_model_graph(self, batch):
        # input is needed to extract the dynamic graph 
        self.tbd.add_graph(self.model, (batch.texts, batch.lengths))  #👈


    def train(self, train_loader: DataLoader, valid_loader: DataLoader, steps: int, checkpoint: int):        
        inp_emb_names = train_loader.dataset.vocab.itos
        train_loss = 0.
        total_toks = 0
        start_time = time_now()
        self.model.train(True) #Training mode
        train_data = looping_iter(train_loader, total=steps)
        with tqdm(train_data, total=steps) as databar:
            for step, batch in enumerate(databar, start=1):
                batch.to(self.device)
                if step == 1:                            # 👈
                    self.log_model_graph(batch) 

                scores = self.model(texts=batch.texts, lengths=batch.lengths, out='raw')
                # NOTE: loss_func accepts long values for target 
                loss = self.loss_func(input=scores, target=batch.labels.long(), reduction='mean')

                loss.backward()
                self.opt.step()
                self.opt.zero_grad()

                loss_val = loss.detach().item()
                train_loss += loss_val 
                self.tbd.add_scalar('train_loss', loss_val, step)    # 👈

                step += 1
                databar.set_postfix(dict(updates=count, loss=loss_val), refresh=False)
                if step % checkpoint == 0:
                    with torch.no_grad():
                        self.model.train(False)
                        val_loss = self.validate(valid_loader)
                        train_loss /= checkpoint
                        #👇
                        self.tbd.add_scalars('chkpt_losses', {'train': train_loss, 'valid': val_loss}, step)  
                        #self.tbd.add_embedding(mat=self.model.embeddings[0].weight, global_step=step, metadata=inp_emb_names,
                        #                       tag='input_embedding')
                        self.tbd.add_embedding(mat=self.model.cls_proj.weight, global_step=step, tag='output_embedding')

                        log.info(f'\nCheckpoint at {step}; train_loss={train_loss:.4f} valid_loss={val_loss:.4f}')
                        # TODO: checkpoint
                        self.model.train(True)
                        train_loss = 0

In [20]:
BATCH_SIZE = 20
train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=Batch.collate_fn)
valid_loader = DataLoader(valid, batch_size=BATCH_SIZE, shuffle=False, collate_fn=Batch.collate_fn)

trainer = Trainer(model=model)
trainer.train(train_loader=train_loader, valid_loader=valid_loader, steps=10000, checkpoint=1000)

  1%|          | 92/10000 [00:01<03:30, 47.17it/s, updates=999, loss=1.12] INFO:root:
Checkpoint at 100; train_loss=1.1610 valid_loss=1.0044
  2%|▏         | 195/10000 [00:04<02:13, 73.54it/s, updates=999, loss=0.724]INFO:root:
Checkpoint at 200; train_loss=0.9588 valid_loss=0.8613
  3%|▎         | 295/10000 [00:07<02:11, 74.04it/s, updates=999, loss=0.67] INFO:root:
Checkpoint at 300; train_loss=0.8816 valid_loss=0.7810
  4%|▍         | 391/10000 [00:10<02:18, 69.59it/s, updates=999, loss=0.746]INFO:root:
Checkpoint at 400; train_loss=0.8305 valid_loss=0.7270
  5%|▍         | 496/10000 [00:13<02:01, 77.96it/s, updates=999, loss=0.727]INFO:root:
Checkpoint at 500; train_loss=0.7148 valid_loss=0.6803
  6%|▌         | 595/10000 [00:16<02:00, 77.88it/s, updates=999, loss=0.551]INFO:root:
Checkpoint at 600; train_loss=0.6872 valid_loss=0.6246
  7%|▋         | 692/10000 [00:18<02:06, 73.44it/s, updates=999, loss=0.634]INFO:root:
Checkpoint at 700; train_loss=0.6682 valid_loss=0.6210
  8%|▊ 

KeyboardInterrupt: 