<a href="https://colab.research.google.com/github/thanksbinh/minBert/blob/main/minBERT_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Thu Dec  5 01:26:49 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip3 install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [7]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 64 # should've reduce this and use GPU
block_size = 384
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embed = 512
n_heads = 8
n_layers = 6
dropout = 0.1
max_documents = 1801350
vocab_size = 28997
# ------------

In [8]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask=None):
        B, T, C = x.shape
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        # compute attention score "affinities"
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5 # (B, T, 16) @ (B, 16, T) --> (B, T, T)

        # Apply attention mask if provided
        if attention_mask is not None:
            # attention_mask: (B, T) -> (B, 1, T) for broadcasting
            mask = attention_mask[:, None, :]
            wei = wei.masked_fill(mask == 0, float('-inf'))

        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei) # Prevent some nodes from communicating
        out = wei @ v
        return out

In [9]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask=None):
        out = torch.cat([h(x, attention_mask) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

In [10]:
class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

In [11]:
class EncoderBlock(nn.Module):
    def __init__(self, n_embed, n_heads):
        super().__init__()
        head_size = n_embed // n_heads
        self.sa = MultiHeadAttention(n_heads, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x, attention_mask=None):
        # Residual connection, clean flow
        x = x + self.sa(self.ln1(x), attention_mask)
        x = x + self.ffwd(self.ln2(x))
        return x

In [12]:
# class DecoderBlock(nn.Module):
#     def __init__(self, n_embed, n_heads):
#         super().__init__()
#         head_size = n_embed // n_heads
#         self.sa = MultiHeadAttention(n_heads, head_size)
#         self.ffwd = FeedForward(n_embed)
#         self.ln1 = nn.LayerNorm(n_embed)
#         self.ln2 = nn.LayerNorm(n_embed)

#     def forward(self, x):
#         x = x + self.sa(self.ln1(x))
#         x = x + self.ffwd(self.ln2(x))
#         return x

In [13]:
class SequentialEncoder(nn.Sequential):
    def forward(self, *inputs):
        x, y = inputs
        for module in self._modules.values():
            x = module(x, y)
        return x

In [14]:
class Bert(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = SequentialEncoder(*[EncoderBlock(n_embed, n_heads=n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(n_embed) # final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size) # for MLM

    def forward(self, idx, attention_mask=None, token_type_ids=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        token_embed = self.token_embedding_table(idx) # (B,T,C)
        pos_embed = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = token_embed + pos_embed # (B,T,C)
        x = self.blocks(x, attention_mask) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        last_hidden_state = x
        return logits, last_hidden_state

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BertForQuestionAnswering(nn.Module):
    def __init__(self, bert_model):
        super(BertForQuestionAnswering, self).__init__()
        self.bert = bert_model  # Pretrained BERT model passed as an argument
        # hidden_size = n_embed  # Hidden size of BERT embeddings
        hidden_size = bert_model.config.hidden_size

        # Add a linear layer for predicting start and end positions
        self.qa_outputs = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, start_positions=None, end_positions=None):
        # Forward pass through the BERT model to get hidden states
        # logits, last_hidden_state = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # sequence_output = last_hidden_state  # [batch_size, sequence_length, hidden_size]

        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs.last_hidden_state  # [batch_size, sequence_length, hidden_size]

        # Apply the linear layer to get start and end logits
        logits = self.qa_outputs(sequence_output)  # [batch_size, sequence_length, 2]
        start_logits, end_logits = logits.split(1, dim=-1)  # Separate start and end logits
        start_logits = start_logits.squeeze(-1)  # [batch_size, sequence_length]
        end_logits = end_logits.squeeze(-1)      # [batch_size, sequence_length]

        # If start_positions and end_positions are provided, compute the loss
        if start_positions is not None and end_positions is not None:
            # Cross-entropy loss for start and end positions
            loss_fct = nn.CrossEntropyLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            return total_loss

        return start_logits, end_logits


In [16]:
import torch
from transformers import BertTokenizerFast

# Load pre-trained BERT model and tokenizer for question answering
model_name = "bert-base-cased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import pickle

In [None]:
bert_model = pickle.load(open('bert_model_10000.sav', 'rb'))
m = bert_model.to(device)

In [17]:
from transformers import BertTokenizerFast, BertModel

model_name = "bert-base-cased"
bert_model = BertModel.from_pretrained(model_name).to(device)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [19]:
m = bert_model

In [39]:
qa_model = BertForQuestionAnswering(m)
qa_model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [21]:
from datasets import load_dataset

# Load SQuAD v1.1 dataset
squad = load_dataset("squad")

# Tokenize the inputs for the model
def preprocess_data(batch):
    # Tokenize the batch of questions and contexts
    inputs = tokenizer(
        batch["question"],
        batch["context"],
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors="pt"
    )
    offset_mapping = inputs.pop("offset_mapping")
    sequence_ids = inputs.sequence_ids()

    # Prepare start and end positions for each example in the batch
    start_positions = []
    end_positions = []

    for i in range(len(batch["answers"])):
        answers = batch["answers"][i]

        # Find start and end character positions
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # Map character positions to token positions
        token_start_index = sequence_ids.index(1)
        token_end_index = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # Set token positions
        start_position, end_position = 0, 0
        for j, (offset_start, offset_end) in enumerate(offset_mapping[i]):
            if offset_start <= start_char < offset_end:
                start_position = j
            if offset_start < end_char <= offset_end:
                end_position = j

        start_positions.append(start_position)
        end_positions.append(end_position)

    # Add start and end positions to inputs
    inputs["start_positions"] = torch.tensor(start_positions)
    inputs["end_positions"] = torch.tensor(end_positions)

    return inputs

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [22]:
# Apply preprocessing to the train and validation sets
train_data = squad["train"].map(preprocess_data, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

In [44]:
val_data = squad["validation"].map(preprocess_data, batched=True, remove_columns=squad["validation"].column_names)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [23]:
from torch.utils.data import DataLoader, default_collate
from transformers import AdamW

def collate_fn(batch):
    # Stack input_ids, attention_mask, start_positions, and end_positions
    input_ids = torch.stack([torch.tensor(item["input_ids"]) for item in batch])
    attention_mask = torch.stack([torch.tensor(item["attention_mask"]) for item in batch])
    start_positions = torch.stack([torch.tensor(item["start_positions"]) for item in batch])
    end_positions = torch.stack([torch.tensor(item["end_positions"]) for item in batch])

    # Return a dictionary with tensors
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "start_positions": start_positions,
        "end_positions": end_positions
    }

In [24]:
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [45]:
val_loader = DataLoader(val_data, batch_size=8, collate_fn=collate_fn)

In [40]:
# Define optimizer
optimizer = AdamW(qa_model.parameters(), lr=3e-5)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    qa_model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].squeeze().to(device)
        attention_mask = batch["attention_mask"].squeeze().to(device)
        start_positions = batch["start_positions"].squeeze().to(device)
        end_positions = batch["end_positions"].squeeze().to(device)

        optimizer.zero_grad()

        # Forward pass
        loss = qa_model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        # loss = outputs.total_loss
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")

Epoch 1, Loss: 1.3058


KeyboardInterrupt: 

In [None]:
pickle.dump(qa_model, open('qa_model.sav', 'wb'))

In [None]:
import pickle

qa_model = pickle.load(open('qa_model.sav', 'rb'))
qa_model.to(device)

In [41]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [42]:
from transformers import default_data_collator
from tqdm import tqdm
import numpy as np
from evaluate import load

# Evaluation function
def evaluate_model(model, val_loader):
    qa_model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch["input_ids"].squeeze().to(device)
            attention_mask = batch["attention_mask"].squeeze().to(device)

            # Get model outputs
            outputs = model(input_ids, attention_mask=attention_mask)
            start_logits, end_logits = outputs

            # Predict start and end positions
            start_positions = torch.argmax(start_logits, dim=1)
            end_positions = torch.argmax(end_logits, dim=1)

            # Convert predictions and references to text
            for i in range(len(input_ids)):
                input_id = input_ids[i].tolist()
                start_pos = start_positions[i].item()
                end_pos = end_positions[i].item() + 1  # Inclusive

                # Decode predicted answer
                prediction = tokenizer.decode(input_id[start_pos:end_pos])
                predictions.append(prediction)

                # Reference answer
                answer =tokenizer.decode(input_id[batch["start_positions"][i]:batch["end_positions"][i] + 1])
                references.append(answer)

    # Compute metrics
    squad_metric = load("squad")
    predictions = [{'prediction_text': pred, 'id': str(i)} for i, pred in enumerate(predictions)]
    references = [{'answers': {'answer_start': [ref.index(char) for char in ref], 'text': [ref]}, 'id': str(i)} for i, ref in enumerate(references)]
    results = squad_metric.compute(predictions=predictions, references=references)
    return results


In [46]:
# Run evaluation
scores = evaluate_model(qa_model, val_loader)
print(scores)

100%|██████████| 1322/1322 [03:51<00:00,  5.71it/s]


Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

{'exact_match': 63.34910122989593, 'f1': 77.73453451963216}
