In [6]:
!pip install datasets
!pip install transformers
!pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-2.4.1-cp312-cp312-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision
  Downloading torchvision-0.19.1-cp312-cp312-manylinux1_x86_64.whl.metadata (6.0 kB)
Collecting torchaudio
  Downloading torchaudio-2.4.1-cp312-cp312-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting sympy (from torch)
  Downloading sympy-1.13.2-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 k

In [113]:
import torch
import math

num_layers=6
seq_len = 1024 #TODO: this should be input dependent
d_model = 512

class FFN(torch.nn.Module):
    def __init__(self):
        super().__init__()            
        
        self.d_ff = d_model*4
    
        self.W_1 = torch.nn.Linear(d_model, self.d_ff)
        self.b_1 = torch.nn.Parameter(torch.zeros(seq_len, self.d_ff))
        self.W_2 = torch.nn.Linear(self.d_ff, d_model)
        self.b_2 = torch.nn.Parameter(torch.zeros(seq_len, d_model))
        
    def forward(self, x):
        assert x.shape == (seq_len, d_model)        
        #TODO: max(0, ...)
        out1 = self.W_1(x) + self.b_1
        out2 = self.W_2(out1) + self.b_2

        return out2
        # print(out2.shape)     

class MHA(torch.nn.Module):
    def __init__(self, h=8, has_mask=False):  
        super().__init__()          
        
        self.has_mask = has_mask
        self.d_k = 8
        # self.d_k = d_model // h
        h_d_v = 64 
        #h_d_v = h*self.d_v
        
        self.d_v = self.d_k
        #self.scale = 1/math.sqrt(self.d_k)
        self.scale = 1

        assert d_model == 512
        assert self.d_k == 8

 
        self.W_Q = torch.nn.Parameter(torch.zeros(size=(d_model, self.d_k)))
        self.W_K = torch.nn.Parameter(torch.zeros(size=(d_model, self.d_k)))
        self.W_V = torch.nn.Parameter(torch.zeros(size=(d_model, self.d_v)))
        self.W_O = torch.nn.Parameter(torch.zeros(size=(h_d_v, d_model)))   
                
    #TODO: shouldnt need separate x: x_k and x_v
    def attn(self, x_k, x_v):
        Q = x_k @ self.W_Q
        K = x_k @ self.W_K
        V = x_v @ self.W_V

        mask = torch.ones(seq_len, seq_len) 
        if self.has_mask:
            for i in range(seq_len):
                for j in range(seq_len):
                    if i == j:
                        mask[i:, :] = -float('inf')
        
        head = torch.softmax(input=self.scale*mask*(Q@K.T), dim=0) @ V
        return head

    def forward(self, x_k, x_v):
        assert x_k.shape == (seq_len, d_model)
        assert x_v.shape == (seq_len, d_model)
        
        heads = torch.cat([self.attn(x_k, x_v) for i in range(8)], dim=1)
        res = heads @ self.W_O
        return res   


class Encoder(torch.nn.Module):
    def __init__(self):
        super().__init__()          

    def forward(self, x):
        M = MHA(has_mask=False)
        mha = M(x_k=x, x_v=x)

        #sl1 = torch.nn.LayerNorm(x + mha)
        sl1 = x + mha        
        # print(sl1.shape)

        F = FFN()
        ffn = F(sl1)
        
        #sl2 = torch.nn.LayerNorm(sl1 + ffn)
        sl2 = sl1 + ffn
        # print(sl2.shape)
        
        return sl2

class Decoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, x):
        #TODO: add LayerNorm after each sl{i}
        M1 = MHA(has_mask=True)
        mmha = M1(x_k=x, x_v=x)

        sl1 = x + mmha

        M2 = MHA(has_mask=False)
        mha = M2(x_k=x, x_v=sl1)

        sl2 = sl1 + mha

        F = FFN(sl2)
        ffn = F(sl2)

        sl3 = sl2 + ffn

        return sl3

class EncoderDecoder(torch.nn.Module):
    def __init__(self, x):
        assert x.shape == (seq_len, d_model)
        super().__init__()        
        self.x = x
        self.encs = [None] * num_layers
        self.decs = [None] * num_layers

    def forward(self):
        for i in range(num_layers):
            if i == 0:
                self.encs[i] = Encoder()(self.x)
            else:
                self.encs[i] = Encoder()(self.encs[i-1])
                
            self.decs[i] = Decoder()(self.encs[i])

        return self.decs[num_layers-1]

class Transformer(torch.nn.Module):
    def __init__(self):
       super().__init__()                

    def forward(self, x):
       #TODO: add linear layer
       enc_dec = EncoderDecoder(x)()                  
       return torch.softmax(input=enc_dec, dim=0) #TODO: once again this is along seq_len
        

In [114]:
model = Transformer()
model(torch.Tensor(seq_len, d_model))

AttributeError: cannot assign parameters before Module.__init__() call

from datasets import load_dataset
from transformers import AutoTokenizer

#TODO: they used train split, wiht ~3.5M examples
ds = load_dataset("wmt/wmt14", "de-en", split='validation')
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [81]:
training_data = ds['translation']

In [97]:
[param for param in model.parameters()]

[]

In [92]:
num_epochs = 100
criterion = torch.nn.CrossEntropyLoss()

optim = torch.optim.Adam(params=model.parameters(), betas=(0.9,0.98), eps=10E-9)


for epoch in range(num_epochs):
    for sample in training_data[:3]:
        optim.zero_grad()
        
        real = sample['de']
        fake = sample['en']
        
        outputs = model(fake)

        loss = criterion(real, fake)


        loss.backward()        
        optim.step()

ValueError: optimizer got an empty parameter list