In [1]:
# read csv with pandas
import pandas as pd

data_path = '../data_input/train.csv'
df = pd.read_csv(data_path, usecols=['id', 'text', 'label'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157789 entries, 0 to 157788
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      157789 non-null  object
 1   text    157789 non-null  object
 2   label   157789 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 3.6+ MB


In [2]:
import lightning as L
import torch.nn as nn
import torch
import torch.nn.functional as F

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Config:
    vocab_size: int
    n_embd: int
    n_hidden: int
    batch_size: int
    context_length: int
    n_workers: int
    # /path/to/save/checkpoints
    ckpt_path = None
    dropout = 0.1
    checkpoint_dir: Path = Path("../output/checkpoints/").absolute()


def get_default_config() -> Config:
    cfg =  Config(
        n_workers=1,
        vocab_size=5001,
        n_embd=16,
        n_hidden=64,
        batch_size=32,
        context_length=32,
    )

    # Get the latest checkpoint file path
    latest_ckpt_file = sorted(cfg.checkpoint_dir.glob('*.ckpt'), key=lambda x: x.stat().st_mtime, reverse=True)
    cfg.ckpt_path = None if latest_ckpt_file == [] else latest_ckpt_file[0]
    return cfg

cfg = get_default_config()

In [4]:
def get_trainable_layers(parent_module, parent_name='root'):
    res = {}
    modules = list(parent_module.named_children())
    if len(modules) == 0 and any(param.requires_grad for param in parent_module.parameters()):
        # Base case: If no children and has trainable params, return itself
        res[parent_name] = parent_module
    else:
        for name, module in modules:
            # Construct the full module name by appending current name to parent's
            full_name = f'{parent_name}.{name}' if parent_name else name
            # Recursively get trainable layers
            sub_layers = get_trainable_layers(module, full_name)
            if len(sub_layers) == 0 and any(param.requires_grad for param in module.parameters()):
                # If the module has parameters but no sub-layers returned, add the module
                res[full_name] = module
            res.update(sub_layers)
    return res

In [5]:
def get_activations(parent_module, parent_name='root'):
    res = {}
    modules = list(parent_module.named_children())
    if len(modules) == 0 and isinstance(parent_module, (nn.ReLU, nn.Sigmoid, nn.Tanh)):
        # Base case: If no children and has trainable params, return itself
        res[parent_name] = parent_module
    else:
        for name, module in modules:
            # Construct the full module name by appending current name to parent's
            full_name = f'{parent_name}.{name}' if parent_name else name
            # Recursively get trainable layers
            sub_layers = get_activations(module, full_name)
            if len(sub_layers) == 0 and isinstance(module, (nn.ReLU, nn.Sigmoid, nn.Tanh)):
                # If the module has parameters but no sub-layers returned, add the module
                res[full_name] = module
            res.update(sub_layers)
    return res

In [32]:
from lightning.pytorch.loggers import TensorBoardLogger

class Logger(TensorBoardLogger):
    def __init__(self, model, *args, **kwargs):
        super().__init__(".", name="lightning_logs", *args, **kwargs)
        self._log_graph = True
        self.model = model
        self.experiment.add_custom_scalars(self._layout())
    
    def _layout(self):
        activation_params = []
        discrepancy_params = []
        for i, item in enumerate(self.model.get_activations().items()):
            name, layer = item
            if isinstance(layer, (nn.ReLU, nn.Sigmoid, nn.Tanh)):
                # Adding activation stats layout
                # activation_params.append(f'act_{name}_out')
                activation_params.append(f'act_{name}_out_sat')
    
        for name, p in self.model.named_parameters():
            if p.ndim == 2:
                # Assuming you only log update discrepancies for 2D params
                formatted_name = 'ud_' + name.replace('.', '_')
                discrepancy_params.append(formatted_name)
    
        layout = {
            "Layer Metrics": {
                "Activation Out": ["Multiline", activation_params],
                "Update Discrepancy by Layer": ["Multiline", discrepancy_params]
            }
        }
        return layout
    
    # logging
    def log_ud(self):
        # Get the current learning rate
        lr = self.model.optimizers().param_groups[0]['lr']

        # Iterate through named parameters to calculate and log metrics
        for name, p in self.model.named_parameters():
            if p.ndim == 2 and p.grad is not None:
                # Calculate the standard deviation of the gradients adjusted by the learning rate
                grad_std = (lr * p.grad).std()
                # Calculate the standard deviation of the parameter values
                param_std = p.data.std()
                # Calculate the Update Discrepancy (ud) metric and take the log10
                metric = (grad_std / param_std).log10().item()
                # Create a formatted name that corresponds to the naming convention in the TensorBoard layout
                formatted_name = 'ud_' + name.replace('.', '_')
                # Log the metric using the formatted name
                self.model.log(formatted_name, metric, on_step=False, on_epoch=True)
                
    def log_activation_out(self):
        for i, item in enumerate(self.model.get_activations().items()):
            name, layer = item
            # if isinstance(layer, (nn.ReLU, nn.Sigmoid, nn.Tanh)):
            #     t = layer.out  # Make sure outputs are stored during forward pass
            #     self.log(f'act_{name}_out', t.mean().item(), on_step=False, on_epoch=True)
            if isinstance(layer, nn.ReLU):
                t = layer.out  # Make sure outputs are stored during forward pass
                saturation = (t < 0.05).float().mean() * 100
                self.log(f'act_{name}_out_sat', saturation, on_step=False, on_epoch=True)
            if isinstance(layer, nn.Tanh):
                t = layer.out  # Make sure outputs are stored during forward pass
                saturation = (t.abs() > 0.025).float().mean() * 100
                self.log(f'act_{name}_out_sat', saturation, on_step=False, on_epoch=True)

In [48]:

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(cfg.n_embd, head_size, bias=False)
        self.query = nn.Linear(cfg.n_embd, head_size, bias=False)
        self.value = nn.Linear(cfg.n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(cfg.context_length, cfg.context_length)))
        self.dropout = nn.Dropout(cfg.dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(cfg.n_embd, cfg.n_embd)
        self.dropout = nn.Dropout(cfg.dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(cfg.dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class TransfEncModel(L.LightningModule):
    def __init__(self, cfg: Config, n_classes=5):
        super().__init__()
        # Important: This property activates manual optimization.
        self.automatic_optimization = False
        self.lossi = []
        
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(cfg.vocab_size, cfg.n_embd)
        self.position_embedding_table = nn.Embedding(cfg.context_length, cfg.n_embd)
        self.blocks = nn.Sequential(*[Block(cfg.n_embd, n_head=1) for _ in range(1)])
        self.ln_f = nn.LayerNorm(cfg.n_embd) # final layer norm
        self.lm_head = nn.Linear(cfg.context_length * cfg.n_embd, n_classes)
        
        # register forward hook
        self._register_forward_hook()
    
    def _register_forward_hook(self):
        # Define the forward hook function
        def forward_hook(module, input, output):
            module.out = output  # Store output in the module itself
            module.out.retain_grad()  # Ensure that the output gradients are stored
                
        activations = self.get_activations().values()
        for act in activations:
            act.register_forward_hook(forward_hook)
        
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=0.001, weight_decay=0.01)

    def forward(self, idx):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=self.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        B, T, C = x.shape
        x = x.view(B, T*C)  # (B,T,C) -> (B, T*C)
        logits = self.lm_head(x) # (B,n_classes)

        return logits

    def training_step(self, batch, batch_idx):
        inputs, target = batch
        opt = self.optimizers()
        opt.zero_grad()
        output = self(inputs)
        loss = torch.nn.functional.cross_entropy(output, target.view(-1))
        # Call backward with retain_graph=True
        self.manual_backward(loss, retain_graph=True)
        # Ensure that logger has the log_ud method
        self.training_step_log()
        opt.step()
        self.log(
            "train_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True, sync_dist=True
        )
        return loss
    
    def training_step_log(self):
        if hasattr(self.logger, 'log_ud'):
            self.logger.log_ud()
        if hasattr(self.logger, 'log_activation_out'):
            self.logger.log_activation_out()
    
    def on_fit_end(self) -> None:
        return super().on_fit_end()

    def validation_step(self, batch, batch_idx):
        inputs, target = batch
        output = self(inputs)
        loss = torch.nn.functional.cross_entropy(output, target.view(-1))
        self.log("val_loss", loss, on_epoch=True, prog_bar=True, sync_dist=True)
        return loss
    
    def get_trainable_layers(self):
        return get_trainable_layers(self)
    
    def get_activations(self):
        return get_activations(self)

In [49]:
# split X and Y into train and val, stratify by Y
import sys
sys.path.append(".")

from script.data import get_train_test_split

Xtr, Xval, Ytr, Yval = get_train_test_split(
    df["text"], df["label"], context_length=cfg.context_length, test_size=0.2
)

Vocab size: 5001


In [50]:
model = TransfEncModel(cfg)
sum([p.nelement() for p in model.parameters()])

86357

In [51]:
modules = model.get_trainable_layers()
modules

{'root.token_embedding_table': Embedding(5001, 16),
 'root.position_embedding_table': Embedding(32, 16),
 'root.blocks.0.sa.heads.0.key': Linear(in_features=16, out_features=16, bias=False),
 'root.blocks.0.sa.heads.0.query': Linear(in_features=16, out_features=16, bias=False),
 'root.blocks.0.sa.heads.0.value': Linear(in_features=16, out_features=16, bias=False),
 'root.blocks.0.sa.proj': Linear(in_features=16, out_features=16, bias=True),
 'root.blocks.0.ffwd.net.0': Linear(in_features=16, out_features=64, bias=True),
 'root.blocks.0.ffwd.net.2': Linear(in_features=64, out_features=16, bias=True),
 'root.blocks.0.ln1': LayerNorm((16,), eps=1e-05, elementwise_affine=True),
 'root.blocks.0.ln2': LayerNorm((16,), eps=1e-05, elementwise_affine=True),
 'root.ln_f': LayerNorm((16,), eps=1e-05, elementwise_affine=True),
 'root.lm_head': Linear(in_features=512, out_features=5, bias=True)}

In [52]:
for i, item in enumerate(model.get_activations().items()):
    name, layer = item
    print(name, layer)

root.blocks.0.ffwd.net.1 ReLU()


In [53]:
# train with pytorch lightning
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(Xtr, Ytr)
val_dataset = TensorDataset(Xval, Yval)

train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, num_workers=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=cfg.batch_size, num_workers=2, shuffle=False)

In [54]:
from lightning.pytorch.loggers import TensorBoardLogger

# Example usage with a model instance
logger = Logger(model)

In [58]:
trainer = L.Trainer(max_epochs=2, accelerator='cpu', logger=logger)

trainer.fit(model, train_loader)#, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
C:\Users\paul-\anaconda3\envs\ml_angew_programm\lib\site-packages\lightning\pytorch\trainer\configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.

  | Name                     | Type       | Params
--------------------------------------------------------
0 | token_embedding_table    | Embedding  | 80.0 K
1 | position_embedding_table | Embedding  | 512   
2 | blocks                   | Sequential | 3.2 K 
3 | ln_f                     | LayerNorm  | 32    
4 | lm_head                  | Linear     | 2.6 K 
--------------------------------------------------------
86.4 K    Trainable params
0         Non-trainable params
86.4 K    Total params
0.345     Total estimated model params size (MB)
C:\Users\paul-\anaconda3\envs\ml_angew_programm\lib\site-packages\lightning\pytorch\loggers\tensor

Training: |          | 0/? [00:00<?, ?it/s]

AttributeError: 'Logger' object has no attribute 'log'

In [None]:
import matplotlib.pyplot as plt

# visualize histograms
plt.figure(figsize=(20, 4)) # width and height of the plot
legends = []
for i, layer in enumerate(model.get_activations().values()): # note: exclude the output layer
  print(layer)
  if isinstance(layer, nn.ReLU):
    t = layer.out
    print('layer %d (%s): mean %+.2f, std %.2f, saturated: %.2f%%' % (i, layer.__class__.__name__, t.mean(), t.std(), (t.abs() > 0.97).float().mean()*100))
    hy, hx = torch.histogram(t, density=True)
    plt.plot(hx[:-1].detach(), hy.detach())
    legends.append(f'layer {i} ({layer.__class__.__name__})')
plt.legend(legends);
plt.title('activation distribution')

In [None]:
# visualize histograms
plt.figure(figsize=(20, 4)) # width and height of the plot
legends = []
layer2 = None
for i, layer in enumerate(model.get_activations().values()): # note: exclude the output layer
  if isinstance(layer, nn.ReLU):
    layer2 = layer
    layer.out.retain_grad()
    t = layer.out.grad
    print('layer %d (%10s): mean %+f, std %e' % (i, layer.__class__.__name__, t.mean(), t.std()))
    hy, hx = torch.histogram(t, density=True)
    plt.plot(hx[:-1].detach(), hy.detach())
    legends.append(f'layer {i} ({layer.__class__.__name__}')
plt.legend(legends);
plt.title('gradient distribution')

In [None]:
layer2.out.grad

In [None]:
layer2.out.retains_grad

In [None]:
# visualize histograms
plt.figure(figsize=(20, 4)) # width and height of the plot
legends = []
for i,p in enumerate(model.parameters()):
  t = p.grad
  if p.ndim == 2:
    print('weight %10s | mean %+f | std %e | grad:data ratio %e' % (tuple(p.shape), t.mean(), t.std(), t.std() / p.std()))
    hy, hx = torch.histogram(t, density=True)
    plt.plot(hx[:-1].detach(), hy.detach())
    legends.append(f'{i} {tuple(p.shape)}')
plt.legend(legends)
plt.title('weights gradient distribution');

In [None]:
# with torch.no_grad():
    # train validation loss
# outputs1 = model(Xtr)
# loss = F.cross_entropy(outputs1, Ytr)
# print(f'Training loss: {loss.item():.4f}')

outputs = model(Xval)
loss = torch.functional.F.cross_entropy(outputs, Yval)
print(f'Validation loss: {loss.item():.4f}')

tensor(1.1228, grad_fn=<NllLossBackward0>)  
tensor(1.0828, grad_fn=<NllLossBackward0>)  
tensor(1.0767, grad_fn=<NllLossBackward0>) on mlpc  
Validation loss: 1.0657 on wavenet

In [None]:
from sklearn.metrics import classification_report

Y_pred = outputs.argmax(dim=1).detach().numpy()
Y_true = Yval.numpy()

print(classification_report(Y_true, Y_pred))