In [1]:
!pip install torch



In [2]:
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset
import matplotlib.pyplot as plt
import os

In [3]:
# Mount Google Drive inside the Colab VM; the checkpoint path built in the
# next cell points into this mount.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# Directory and file name from which the checkpoint path is assembled.
models_path = '/content/gdrive/My Drive/models/'
model_name = 'scratch-sentiment-analysis-1.pth'

# Join the two parts and echo the result so the resolved path is visible.
model_path = os.path.join(models_path, model_name)
print(model_path)

/content/gdrive/My Drive/models/scratch-sentiment-analysis-1.pth


In [5]:
class CausalSelfAttention(nn.Module):
  """Multi-head scaled dot-product attention restricted by a causal mask.

  Args:
    d_k: Size of each head's query/key/value vectors.
    d_model: Size of the incoming and outgoing embeddings.
    n_heads: Number of attention heads.
    max_len: Side length of the pre-built lower-triangular causal mask;
      sequences longer than this cannot be masked.
  """

  def __init__(self,d_k,d_model,n_heads,max_len):
    super().__init__()

    self.d_k = d_k
    self.n_heads = n_heads

    self.key = nn.Linear(d_model,d_k * n_heads)
    self.query = nn.Linear(d_model,d_k * n_heads)
    self.value = nn.Linear(d_model,d_k * n_heads)

    # Projects the concatenated heads back to the model dimension.
    self.fc = nn.Linear(d_k * n_heads,d_model)

    # Lower-triangular ones: position t may only look at positions <= t.
    # Stored as a buffer so it follows the module across devices.
    cm = torch.tril(torch.ones(max_len,max_len))
    self.register_buffer("causal_mask",cm.view(1,1,max_len,max_len))


  def forward(self,q,k,v,pad_mask=None):
    """Attend over the sequence and return the re-projected result.

    Args:
      q, k, v: Tensors of shape (N, T, d_model). k and v are reshaped with
        q's N and T, so all three must share the same sequence length.
      pad_mask: Optional (N, T) tensor; key positions whose entry equals 0
        are excluded from attention.

    Returns:
      Tensor of shape (N, T, d_model).
    """
    q = self.query(q) #N x T x (hd_k)
    v = self.value(v)
    k = self.key(k)

    N = q.shape[0]
    T = q.shape[1]

    #change shape of q,v,k
    #(N,T,h,d_k) --> (N,h,T,d_k)
    q = q.view(N, T,self.n_heads,self.d_k).transpose(1,2)
    v = v.view(N, T,self.n_heads,self.d_k).transpose(1,2)
    k = k.view(N, T,self.n_heads,self.d_k).transpose(1,2)

    #compute attention weights
    #(N,h,T,d_k) X (N,h,d_k,T) --> (N,h,T,T)
    attn_scores = q @ k.transpose(-2,-1) / math.sqrt(self.d_k)

    # Use the most negative finite value instead of -inf: masked entries
    # still receive exactly zero weight whenever a row has at least one
    # visible key, but a row whose keys are ALL masked (e.g. left padding)
    # now softmaxes to finite values instead of NaN.
    neg_fill = torch.finfo(attn_scores.dtype).min

    if pad_mask is not None:
      attn_scores = attn_scores.masked_fill(
          pad_mask[:,None,None,:] == 0, neg_fill)

    attn_scores = attn_scores.masked_fill(
        self.causal_mask[:,:,:T,:T] == 0, neg_fill)

    attn_weights = F.softmax(attn_scores,-1)

    #(N,h,T,T) X (N,h,T,d_k) --> (N,h,T,d_k)
    A = attn_weights @ v

    A = A.transpose(1,2) # (N,h,T,d_k) --> (N,T,h,d_k)

    #(N,T,h,d_k) --> (N,T,d_k * h)
    A = A.contiguous().view(N,T,self.d_k * self.n_heads)

    return self.fc(A)



In [6]:
class TransformerBlock(nn.Module):
  """Causal self-attention followed by a feed-forward net, each with a
  residual connection and a LayerNorm applied after the sum.

  Args:
    d_k: Per-head vector size passed to the attention layer.
    d_model: Embedding size of the block's input and output.
    n_heads: Number of attention heads.
    max_len: Maximum sequence length passed to the attention layer.
    dropout_prob: Dropout probability used inside the feed-forward net and
      on the block output.
  """

  def __init__(self,d_k,d_model,n_heads,max_len,dropout_prob=0.1):
    super().__init__()

    # The feed-forward net widens to four times the model size.
    hidden_size = 4 * d_model

    self.mha = CausalSelfAttention(d_k,d_model,n_heads,max_len)
    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.ann = nn.Sequential(
        nn.Linear(d_model,hidden_size),
        nn.GELU(),
        nn.Linear(hidden_size,d_model),
        nn.Dropout(dropout_prob),
    )
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self,x,pad_mask=None):
    """Apply the block to x; pad_mask is handed to the attention layer."""
    # Self-attention: the same tensor serves as query, key and value.
    attended = self.mha(x,x,x,pad_mask)
    x = self.ln1(x + attended)

    fed_forward = self.ann(x)
    x = self.ln2(x + fed_forward)

    return self.dropout(x)




In [7]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position encodings to the input, then dropout.

  Even feature indices hold sines and odd indices hold cosines of
  position * 10000^(-i / d_model), with i running over the even indices.

  Args:
    d_model: Feature size of the embeddings (even or odd).
    max_len: Number of positions for which encodings are precomputed.
    dropout_prob: Dropout probability applied after the addition.
  """

  def __init__(self,d_model,max_len=2048,dropout_prob=0.1):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout_prob)

    position = torch.arange(max_len).unsqueeze(1)
    exp_term = torch.arange(0,d_model,2)
    div_term = torch.exp(exp_term*(-math.log(10000.0)/d_model))
    # One column per even feature index: (max_len, ceil(d_model / 2)).
    angles = position * div_term
    pe = torch.zeros(1,max_len,d_model)
    pe[0,:,0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd index than even index, so
    # trim the angle table; for an even d_model the slice is a no-op.
    pe[0,:,1::2] = torch.cos(angles[:,:d_model // 2])
    # Buffer, not parameter: moves with the module but is never trained.
    self.register_buffer('pe',pe)

  def forward(self,x):
    """Add the encodings for the first x.size(1) positions to x.

    x is indexed as (batch, sequence, feature); its sequence length must
    not exceed max_len.
    """
    x = x + self.pe[:,:x.size(1),:]
    return self.dropout(x)


In [8]:
class Decoder(nn.Module):
  """Decoder-only transformer that emits vocabulary logits per position.

  Args:
    vocab_size: Number of token ids; sizes both the embedding table and
      the output layer.
    max_len: Maximum sequence length, passed to the positional encoding
      and to every transformer block.
    d_k: Per-head vector size inside each block's attention.
    d_model: Embedding size used throughout the network.
    n_heads: Attention heads per block.
    n_layers: Number of stacked transformer blocks.
    dropout_prob: Dropout probability shared by all sub-modules.
  """

  def __init__(self,vocab_size,max_len,d_k,d_model,n_heads,n_layers,dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size,d_model)
    self.pos_encoding = PositionalEncoding(d_model,max_len,dropout_prob)

    layers = []
    for _ in range(n_layers):
      layers.append(
          TransformerBlock(d_k,d_model,n_heads,max_len,dropout_prob))
    self.transformer_blocks = nn.Sequential(*layers)

    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model,vocab_size)

  def forward(self,x,pad_mask=None):
    """Map token ids x to logits; pad_mask is passed to every block."""
    hidden = self.pos_encoding(self.embedding(x))

    # Iterate over the blocks by hand so pad_mask can be passed to each one.
    for layer in self.transformer_blocks:
      hidden = layer(hidden,pad_mask)

    # One logit vector per input position (many outputs).
    logits = self.fc(self.ln(hidden))
    return logits




In [9]:
# Model instance exercised with random token ids in the following cells; a
# new model with tokenizer-derived sizes is built later for training.
model = Decoder(
    vocab_size=20_000,
    max_len=1024,
    d_k=16,
    d_model=644,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1,
)

In [10]:
# Use the first CUDA GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

In [11]:
device

device(type='cuda', index=0)

In [12]:
# Move the model to the selected device; the returned module is displayed.
model.to(device)

Decoder(
  (embedding): Embedding(20000, 644)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (mha): CausalSelfAttention(
        (key): Linear(in_features=644, out_features=64, bias=True)
        (query): Linear(in_features=644, out_features=64, bias=True)
        (value): Linear(in_features=644, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=644, bias=True)
      )
      (ln1): LayerNorm((644,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((644,), eps=1e-05, elementwise_affine=True)
      (ann): Sequential(
        (0): Linear(in_features=644, out_features=2576, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=2576, out_features=644, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (mha): CausalSelfAttent

In [13]:
# Random token ids in [0, 20_000) with shape (8, 512), placed on the device.
x = np.random.randint(low=0,high=20_000,size=(8,512))
x_t = torch.tensor(x,device=device)

In [14]:
# Forward pass without a padding mask (shape check only).
y = model(x_t)

In [15]:
y.shape

torch.Size([8, 512, 20000])

In [16]:
# All-ones (8, 512) mask with column 256 zeroed; the attention layers treat
# zero entries as key positions to ignore.
mask = np.ones(shape=(8,512))
mask[:,256] = 0
mask_t = torch.tensor(mask,device=device)

In [17]:
# Same forward pass, this time with the padding mask supplied.
y = model(x_t,mask_t)

In [18]:
y.shape

torch.Size([8, 512, 20000])

In [19]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.2-py3-none-any.whl (518 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Download

Train model

In [20]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [21]:
# Only the tokenizer of this pretrained checkpoint is loaded here; the model
# trained below is the Decoder defined in this notebook.
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [22]:
from datasets import load_dataset

In [24]:
def tokenize_fn(batch):
  """Tokenize the 'sentence' entries of a batch with truncation enabled.

  Uses the notebook-level `tokenizer` and returns its output unchanged.
  """
  sentences = batch['sentence']
  return tokenizer(sentences,truncation=True)

In [23]:
# SST-2 from the GLUE benchmark; the output below lists its train,
# validation and test splits.
raw_datasets = load_dataset('glue','sst2')
# Collate function later handed to the DataLoaders; built from the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [64]:
type(raw_datasets)

datasets.dataset_dict.DatasetDict

In [25]:
# Apply tokenize_fn to every split, batch-wise; this adds the input_ids and
# attention_mask columns shown in the next cell's output.
tokenized_datasets = raw_datasets.map(tokenize_fn,batched=True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [26]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [27]:
# Keep only input_ids and attention_mask; the labels are dropped because the
# model is trained on next-token prediction, not classification.
# NOTE(review): rebinding the same name makes this cell non-idempotent --
# re-running it will likely fail because the columns are already removed.
tokenized_datasets = tokenized_datasets.remove_columns(['sentence','idx','label'])

In [28]:
from torch.utils.data import DataLoader

# Shuffled batches of 32 training examples, assembled by data_collator.
train_loader = DataLoader(
    dataset=tokenized_datasets['train'],
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
)

In [36]:
# The model that is actually trained: vocabulary size and maximum sequence
# length are taken from the tokenizer instead of being hard-coded.
model = Decoder(
    vocab_size=tokenizer.vocab_size,
    # NOTE(review): max_model_input_sizes is looked up by checkpoint name;
    # confirm the attribute exists in the installed transformers version.
    max_len=tokenizer.max_model_input_sizes[checkpoint],
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1
)
model.to(device)

Decoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (mha): CausalSelfAttention(
      

In [37]:
# Targets equal to the pad token id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
# Adam with its default hyperparameters (no learning rate is passed).
optimizer = torch.optim.Adam(model.parameters())

In [38]:
from datetime import datetime

In [39]:
def train(model,criterion,optimizer,train_loader,epochs):
  """Train `model` on next-token prediction and return per-epoch mean losses.

  Depends on the notebook-level `device` and `tokenizer` globals.

  Args:
    model: Called as model(input_ids, attention_mask).
    criterion: Loss taking (outputs with dims 1 and 2 swapped, targets).
    optimizer: Optimizer stepped once per batch.
    train_loader: Iterable of dicts holding 'input_ids' and
      'attention_mask' tensors.
    epochs: Number of passes over train_loader.

  Returns:
    NumPy array of length `epochs` with the mean batch loss of each epoch.
  """
  train_losses = np.zeros(epochs)

  for epoch_idx in range(epochs):
    model.train()
    started_at = datetime.now()
    batch_losses = []

    for batch in train_loader:
      batch = {name: tensor.to(device) for name,tensor in batch.items()}
      input_ids = batch['input_ids']

      # The target at position t is the input token at t+1. The slot that
      # wraps around to the end is overwritten with the pad token id.
      targets = torch.roll(input_ids.clone().detach(),shifts=-1,dims=1)
      targets[:,-1] = tokenizer.pad_token_id

      optimizer.zero_grad()
      outputs = model(input_ids,batch['attention_mask'])
      # Swap dims 1 and 2 so the class dimension comes second for the loss.
      loss = criterion(outputs.transpose(2,1),targets)
      loss.backward()
      optimizer.step()
      batch_losses.append(loss.item())

    epoch_loss = np.mean(batch_losses)
    train_losses[epoch_idx] = epoch_loss
    elapsed = datetime.now() - started_at
    print(f'Epoch: {epoch_idx+1}/{epochs}, Train Loss: {epoch_loss:.4f}, Duration: {elapsed}')
  return train_losses

In [41]:
# Ten passes over the training set; each epoch took about a minute in the
# run logged below.
train_losses = train(model,criterion,optimizer,train_loader,epochs=10)

Epoch: 1/10, Train Loss: 5.0186, Duration: 0:01:09.253169
Epoch: 2/10, Train Loss: 4.6803, Duration: 0:01:14.876649
Epoch: 3/10, Train Loss: 4.4920, Duration: 0:00:59.880724
Epoch: 4/10, Train Loss: 4.3585, Duration: 0:01:09.230974
Epoch: 5/10, Train Loss: 4.2529, Duration: 0:01:10.886100
Epoch: 6/10, Train Loss: 4.1654, Duration: 0:01:07.111342
Epoch: 7/10, Train Loss: 4.0856, Duration: 0:01:07.693371
Epoch: 8/10, Train Loss: 4.0179, Duration: 0:01:12.054934
Epoch: 9/10, Train Loss: 3.9582, Duration: 0:01:03.307418
Epoch: 10/10, Train Loss: 3.9022, Duration: 0:01:05.579690


In [42]:
# Validation examples one at a time, in dataset order (no shuffling).
valid_loader = DataLoader(
    dataset=tokenized_datasets['validation'],
    batch_size=1,
    collate_fn=data_collator,
)

In [43]:
# Run only the first validation batch through the model to inspect its output.
model.eval()
# Inference only: no_grad stops autograd from recording a graph for the
# logits, which would otherwise be kept alive by `outputs`.
with torch.no_grad():
  for batch in valid_loader:
    batch = {k: v.to(device) for k,v in batch.items()}
    outputs = model(batch['input_ids'],batch['attention_mask'])
    break

In [44]:
outputs.shape

torch.Size([1, 12, 28996])

In [45]:
# Highest-scoring token id at every position.
predictions = outputs.argmax(dim=-1)

In [47]:
predictions

tensor([[  170,   112,   188,   170,  1363,  2523,  6276, 15021,  1273,   102,
           102,   102]], device='cuda:0')

In [48]:
# Tokenize a short prompt; return_tensors='pt' requests PyTorch tensors.
prompt = "it's"
tokenized_prompt = tokenizer(prompt,return_tensors='pt')

In [49]:
# Make sure dropout is off even if this cell is run out of order.
model.eval()
# The last token is sliced off (presumably the [SEP] appended by the
# tokenizer) so the final position predicts the prompt's continuation.
# no_grad: pure inference, so no autograd graph is needed.
with torch.no_grad():
  outputs = model(
      tokenized_prompt['input_ids'][:,:-1].to(device),
      tokenized_prompt['attention_mask'][:,:-1].to(device)
  )

In [50]:
# Most likely next token according to the logits at the last position.
prediction_ids = outputs[:,-1,:].argmax(dim=-1)

In [51]:
tokenizer.decode(prediction_ids[0])

'a'

In [52]:
# Longer prompt used as the seed for the greedy generation loop below.
prompt = "it's a"

In [53]:
tokenized_prompt = tokenizer(prompt,return_tensors='pt')
# Drop the final token (presumably the trailing [SEP]) so generation
# continues the prompt instead of starting after an end marker.
input_ids = tokenized_prompt['input_ids'][:,:-1].to(device)
# NOTE(review): `mask` rebinds the name used earlier for a NumPy array.
mask = tokenized_prompt['attention_mask'][:,:-1].to(device)

In [56]:
# Greedy decoding: append the highest-scoring next token until the separator
# token is produced or 20 tokens have been added.
# eval() keeps dropout off even if this cell is run right after training.
model.eval()
# no_grad: generation is pure inference, so skip building autograd graphs on
# each of the (up to 20) forward passes.
with torch.no_grad():
  for _ in range(20):
    outputs = model(input_ids,mask)
    prediction_id = torch.argmax(outputs[:,-1,:],axis=-1)
    input_ids = torch.hstack((input_ids,prediction_id.view(1,1)))
    # Every token in the grown sequence is real, so the mask is all ones.
    mask = torch.ones_like(input_ids)

    if prediction_id == tokenizer.sep_token_id:
      break

In [57]:
tokenizer.decode(input_ids[0])

"[CLS] it's a good movie that is a good one [SEP]"