# Task 1 - Training BERT from scratch

In [1]:
import math
import re
from   random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os

In [2]:
from torch.utils.data import Dataset, DataLoader

import math
import random
import re
from datasets import load_dataset
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Set GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# torch.cuda.get_device_name(0)

cpu


In [4]:
import torchtext
print(torchtext.__version__)

0.6.0


In [5]:
from torchtext import data
from torchtext import datasets

In [6]:
import os
import glob


### Dataset: 

Plaintext Wikipedia (full English) - Kaggle Datasets

In [7]:
wiki_path = "English_Wikipedia/fullEnglish"  

# This finds ALL files inside subfolders
files = glob.glob(os.path.join(wiki_path, "*", "*"))

print("Total files found:", len(files))
# LIMIT to first 500 files
files = files[:500]

print("Files that will be loaded:", len(files))


Total files found: 11578
Files that will be loaded: 500


In [8]:
texts = []

for file in files:
    with open(file, "r", encoding="utf-8") as f:
        texts.append(f.read())

print("Loaded files:", len(texts))


Loaded files: 500


In [9]:
import re

clean_text = ""

for text in texts:
    text = re.sub(r"<doc.*?>", "", text)
    text = re.sub(r"</doc>", "", text)
    clean_text += text + " "


In [10]:
clean_text = clean_text.lower()
clean_text = re.sub(r"\s+", " ", clean_text)

sentences = re.split(r"[.!?]", clean_text)
sentences = [s.strip() for s in sentences if len(s.strip()) > 20]

# Limit for training
sentences = sentences[:100000]

print("Total sentences:", len(sentences))


Total sentences: 100000


In [11]:
from collections import Counter

word_counter = Counter()

for sentence in sentences:
    word_counter.update(sentence.split())

# Special tokens
vocab = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]

# Keep words appearing at least 5 times
vocab += [word for word, freq in word_counter.items() if freq >= 5]

word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

vocab_size = len(vocab)

print("Vocabulary size:", vocab_size)


Vocabulary size: 28366


In [12]:
#Tokenization

MAX_LEN = 32

def tokenize(sentence):
    tokens = sentence.split()[:MAX_LEN-2]
    
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    
    ids = [word2idx.get(token, word2idx["[UNK]"]) for token in tokens]
    
    # Padding
    while len(ids) < MAX_LEN:
        ids.append(word2idx["[PAD]"])
    
    return ids


In [13]:
# Create MLM + NSP Dataset

import random

def create_dataset(sentences):
    dataset = []
    
    for i in range(len(sentences)-1):
        
        # 50% correct next sentence
        if random.random() > 0.5:
            sent_a = sentences[i]
            sent_b = sentences[i+1]
            nsp_label = 1
        else:
            sent_a = sentences[i]
            sent_b = random.choice(sentences)
            nsp_label = 0
        
        combined = sent_a + " " + sent_b
        input_ids = tokenize(combined)
        
        mlm_labels = [-100] * MAX_LEN
        
        # 15% masking
        for j in range(1, MAX_LEN-1):
            if input_ids[j] != word2idx["[PAD]"] and random.random() < 0.15:
                mlm_labels[j] = input_ids[j]
                input_ids[j] = word2idx["[MASK]"]
        
        dataset.append((input_ids, mlm_labels, nsp_label))
    
    return dataset

dataset = create_dataset(sentences[:50000])
print("Training samples:", len(dataset))


Training samples: 49999


In [14]:
# Create Dataloader

from torch.utils.data import Dataset

class BERTDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        input_ids, mlm_labels, nsp_label = self.data[idx]
        
        return (
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(mlm_labels, dtype=torch.long),
            torch.tensor(nsp_label, dtype=torch.long)
        )


In [15]:
bert_dataset = BERTDataset(dataset)

loader = DataLoader(bert_dataset, batch_size=32, shuffle=True)


In [16]:
import torch
import torch.nn as nn

class SimpleBERT(nn.Module):
    def __init__(
        self,
        vocab_size,
        hidden_size=128,
        max_len=128,
        num_layers=2,
        num_heads=4,
        dropout=0.1
    ):
        super(SimpleBERT, self).__init__()

        self.hidden_size = hidden_size
        self.max_len = max_len

        # Token embedding
        self.token_embedding = nn.Embedding(vocab_size, hidden_size)

        # Positional embedding
        self.position_embedding = nn.Embedding(max_len, hidden_size)

        # Transformer encoder layer
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dropout=dropout,
            batch_first=True   # VERY IMPORTANT
        )

        self.encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers
        )

        # MLM head
        self.mlm_head = nn.Linear(hidden_size, vocab_size)

        # NSP head
        self.nsp_head = nn.Linear(hidden_size, 2)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids):

        batch_size, seq_len = input_ids.size()

        # Create position ids
        position_ids = torch.arange(seq_len, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_len)

        # Embeddings
        token_embeddings = self.token_embedding(input_ids)
        position_embeddings = self.position_embedding(position_ids)

        x = token_embeddings + position_embeddings
        x = self.dropout(x)

        # Transformer encoder
        x = self.encoder(x)

        # MLM output
        mlm_logits = self.mlm_head(x)

        # NSP uses [CLS] token (first token)
        cls_token = x[:, 0, :]
        nsp_logits = self.nsp_head(cls_token)

        return mlm_logits, nsp_logits


In [17]:
vocab_size = 28366
model = SimpleBERT(vocab_size)

print("Model initialized successfully!")



Model initialized successfully!


In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print("Using device:", device)


Using device: cpu


In [19]:
criterion_mlm = nn.CrossEntropyLoss()
criterion_nsp = nn.CrossEntropyLoss()


In [20]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


In [21]:
import random
from torch.utils.data import Dataset, DataLoader

class BERTDataset(Dataset):
    def __init__(self, sentences, word2idx, max_len=32):
        self.sentences = sentences
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences) - 1

    def __getitem__(self, idx):

        sentence_a = self.sentences[idx]
        sentence_b = self.sentences[idx + 1]

        # 50% chance random sentence for NSP
        if random.random() > 0.5:
            sentence_b = random.choice(self.sentences)
            nsp_label = 0  # Not next
        else:
            nsp_label = 1  # Is next

        tokens = ["[CLS]"] + sentence_a.split() + ["[SEP]"] + sentence_b.split() + ["[SEP]"]

        # Convert to ids
        input_ids = [self.word2idx.get(w, 0) for w in tokens]

        # Padding
        if len(input_ids) < self.max_len:
            input_ids += [self.word2idx["[PAD]"]] * (self.max_len - len(input_ids))
        else:
            input_ids = input_ids[:self.max_len]

        input_ids = torch.tensor(input_ids)

        # MLM labels (copy)
        mlm_labels = input_ids.clone()

        # Mask 15% tokens
        for i in range(1, len(input_ids) - 1):
            if random.random() < 0.15:
                input_ids[i] = self.word2idx["[MASK]"]

        return input_ids, mlm_labels, torch.tensor(nsp_label)


In [24]:
#Training

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

mlm_loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
nsp_loss_fn = nn.CrossEntropyLoss()

EPOCHS = 3
model.train()

for epoch in range(EPOCHS):
    
    total_loss = 0
    
    for input_ids, mlm_labels, nsp_labels in loader:
        
        mlm_out, nsp_out = model(input_ids)
        
        mlm_loss = mlm_loss_fn(
            mlm_out.view(-1, vocab_size),
            mlm_labels.view(-1)
        )
        
        nsp_loss = nsp_loss_fn(nsp_out, nsp_labels)
        
        loss = mlm_loss + nsp_loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1} Loss: {total_loss/len(loader)}")


Epoch 1 Loss: 8.073345177919531
Epoch 2 Loss: 7.620009910732374
Epoch 3 Loss: 7.577342767404274


In [25]:
torch.save(model.state_dict(), "bert_model.pth")


In [26]:
# save vocab

import pickle

with open("vocab.pkl", "wb") as f:
    pickle.dump(word2idx, f)


In [None]:
#verify vocab counts

with open("vocab.pkl", "rb") as f:
    test_vocab = pickle.load(f)

print("Saved vocab size:", len(test_vocab))


Saved vocab size: 28366


# TASK 2 - Sentence Embedding with Sentence BERT

### Dataset:

SLI dataset from The Standford Natural Language Processing Group 

In [1]:
import torch
import torch.nn as nn
import pickle

In [2]:
import torch
import torch.nn as nn

class SimpleBERT(nn.Module):
    def __init__(
        self,
        vocab_size,
        hidden_size=128,
        max_len=128,
        num_layers=2,
        num_heads=4,
        dropout=0.1
    ):
        super(SimpleBERT, self).__init__()

        self.hidden_size = hidden_size
        self.max_len = max_len

        # Token embedding
        self.token_embedding = nn.Embedding(vocab_size, hidden_size)

        # Positional embedding
        self.position_embedding = nn.Embedding(max_len, hidden_size)

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dropout=dropout,
            batch_first=True
        )

        self.encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers
        )

        # Heads (Task 1)
        self.mlm_head = nn.Linear(hidden_size, vocab_size)
        self.nsp_head = nn.Linear(hidden_size, 2)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, return_hidden=False):

        batch_size, seq_len = input_ids.size()

        # Create position ids
        position_ids = torch.arange(seq_len, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_len)

        # Embeddings
        token_embeddings = self.token_embedding(input_ids)
        position_embeddings = self.position_embedding(position_ids)

        x = token_embeddings + position_embeddings
        x = self.dropout(x)

        # Encoder
        x = self.encoder(x)

        # If Task 2 wants hidden states
        if return_hidden:
            return x

        # Task 1 outputs
        mlm_logits = self.mlm_head(x)
        cls_token = x[:, 0, :]
        nsp_logits = self.nsp_head(cls_token)

        return mlm_logits, nsp_logits


In [3]:
# Load Vocab and Rebuild BERT

import torch
import pickle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load vocabulary
with open("vocab.pkl", "rb") as f:
    word2idx = pickle.load(f)

vocab_size = len(word2idx)
print("Vocab size:", vocab_size)

# Rebuild BERT with SAME architecture as Task 1
bert_model = SimpleBERT(
    vocab_size=vocab_size,
    hidden_size=128,
    max_len=128,
    num_layers=2,
    num_heads=4
).to(device)

# Load trained weights
bert_model.load_state_dict(torch.load("bert_model.pth", map_location=device))
bert_model.train()   # We will fine-tune


Vocab size: 28366


SimpleBERT(
  (token_embedding): Embedding(28366, 128)
  (position_embedding): Embedding(128, 128)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (mlm_head): Linear(in_features=128, out_features=28366, bias=True)
  (nsp_head): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [4]:
# Modify SimpleBERT to return hidden states

def forward(self, input_ids, return_hidden=False):

    batch_size, seq_len = input_ids.size()

    position_ids = torch.arange(seq_len, device=input_ids.device)
    position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_len)

    token_embeddings = self.token_embedding(input_ids)
    position_embeddings = self.position_embedding(position_ids)

    x = token_embeddings + position_embeddings
    x = self.dropout(x)

    x = self.encoder(x)

    if return_hidden:
        return x   # Return encoder output only

    mlm_logits = self.mlm_head(x)
    cls_token = x[:, 0, :]
    nsp_logits = self.nsp_head(cls_token)

    return mlm_logits, nsp_logits


In [5]:
# Build SentenceBERT model (SoftmaxLoss Version)

import torch.nn as nn

class SentenceBERT(nn.Module):
    def __init__(self, bert_model):
        super(SentenceBERT, self).__init__()
        self.bert = bert_model
        hidden_size = bert_model.hidden_size
        
        # (u, v, |u-v|) → 3 * hidden_size
        self.classifier = nn.Linear(hidden_size * 3, 3)

    def mean_pool(self, token_embeddings):
        return torch.mean(token_embeddings, dim=1)

    def forward(self, input_ids1, input_ids2):

        out1 = self.bert(input_ids1, return_hidden=True)
        out2 = self.bert(input_ids2, return_hidden=True)

        emb1 = self.mean_pool(out1)
        emb2 = self.mean_pool(out2)

        diff = torch.abs(emb1 - emb2)

        features = torch.cat([emb1, emb2, diff], dim=1)

        logits = self.classifier(features)
        return logits


In [6]:
snli_train_path = r"D:\CV\snli_1.0\snli_1.0\snli_1.0_train.jsonl"


In [7]:
import json

def load_snli_jsonl(path):
    data = []
    
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            item = json.loads(line)
            
            # Remove invalid labels
            if item["gold_label"] == "-":
                continue
            
            data.append({
                "premise": item["sentence1"],
                "hypothesis": item["sentence2"],
                "label": label_to_id(item["gold_label"])
            })
    
    return data


In [8]:
# Convert string labels into numbers

def label_to_id(label):
    mapping = {
        "entailment": 0,
        "contradiction": 1,
        "neutral": 2
    }
    return mapping[label]


In [9]:
train_data = load_snli_jsonl(snli_train_path)

print("Training examples:", len(train_data))
print(train_data[0])


Training examples: 549367
{'premise': 'A person on a horse jumps over a broken down airplane.', 'hypothesis': 'A person is training his horse for a competition.', 'label': 2}


In [10]:
import random
random.seed(42)

# Reduce to 5000 first
train_data = random.sample(train_data, 5000)

# Split 80% train / 20% validation
split_index = int(0.8 * len(train_data))

train_split = train_data[:split_index]
dev_split = train_data[split_index:]

print("Train size:", len(train_split))
print("Dev size:", len(dev_split))


Train size: 4000
Dev size: 1000


In [11]:
from torch.utils.data import Dataset, DataLoader
import torch

class SNLIDataset(Dataset):
    def __init__(self, data, word2idx):
        self.data = data
        self.word2idx = word2idx
        
    def __len__(self):
        return len(self.data)
    
    def tokenize(self, sentence, max_len=128):
        tokens = sentence.lower().split()

        ids = []
        for token in tokens:
            if token in self.word2idx:
                ids.append(self.word2idx[token])
            else:
                ids.append(0)   # unknown words → 0 safely

        if len(ids) < max_len:
            ids += [0] * (max_len - len(ids))   # padding → 0
        else:
            ids = ids[:max_len]

        return torch.tensor(ids, dtype=torch.long)
    
    def __getitem__(self, idx):
        item = self.data[idx]

        s1 = self.tokenize(item["premise"])
        s2 = self.tokenize(item["hypothesis"])
        label = torch.tensor(item["label"], dtype=torch.long)

        return s1, s2, label

In [12]:
# Create Dataloader

train_dataset = SNLIDataset(train_split, word2idx)
dev_dataset = SNLIDataset(dev_split, word2idx)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False)


In [13]:
# Tokenize Function

def tokenize(self, sentence, max_len=128):
    tokens = sentence.lower().split()

    ids = []
    for token in tokens:
        if token in self.word2idx:
            ids.append(self.word2idx[token])
        else:
            ids.append(0)  # force unknown words to 0 safely

    if len(ids) < max_len:
        ids += [0] * (max_len - len(ids))
    else:
        ids = ids[:max_len]

    return torch.tensor(ids, dtype=torch.long)




In [14]:
# Initialize Model 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

sbert_model = SentenceBERT(bert_model).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(sbert_model.parameters(), lr=2e-5)


In [15]:
#Training

num_epochs = 3

for epoch in range(num_epochs):
    sbert_model.train()
    total_loss = 0

    for step, (s1, s2, labels) in enumerate(train_loader):

        s1 = s1.to(device)
        s2 = s2.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        logits = sbert_model(s1, s2)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if step % 100 == 0:
            print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Finished, Avg Loss: {avg_loss:.4f}")


Epoch 1, Step 0, Loss: 1.2426
Epoch 1, Step 100, Loss: 1.0985
Epoch 1 Finished, Avg Loss: 1.1021
Epoch 2, Step 0, Loss: 1.0668
Epoch 2, Step 100, Loss: 1.1094
Epoch 2 Finished, Avg Loss: 1.0996
Epoch 3, Step 0, Loss: 1.1063
Epoch 3, Step 100, Loss: 1.0919
Epoch 3 Finished, Avg Loss: 1.0991


In [16]:
import torch

# Save model
torch.save(sbert_model.state_dict(), "sbert_model.pth")

print("Model saved successfully!")


Model saved successfully!


In [17]:
import pickle

with open("vocab.pkl", "wb") as f:
    pickle.dump(word2idx, f)

print("Vocabulary saved successfully!")


Vocabulary saved successfully!


# Task 3 - Evaluation and Analysis

In [18]:
# Evaluation metrices

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np

def evaluate(model, dataloader, device):
    model.eval()
    
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for s1, s2, labels in dataloader:
            s1 = s1.to(device)
            s2 = s2.to(device)
            labels = labels.to(device)

            logits = model(s1, s2)
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Convert to numpy
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted'
    )
    cm = confusion_matrix(all_labels, all_preds)

    return accuracy, precision, recall, f1, cm


In [19]:
accuracy, precision, recall, f1, cm = evaluate(sbert_model, dev_loader, device)

print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)
print("F1-score :", f1)
print("Confusion Matrix:\n", cm)


Accuracy : 0.396
Precision: 0.26448491782816724
Recall   : 0.396
F1-score : 0.3170995508982036
Confusion Matrix:
 [[190   0 139]
 [184   0 149]
 [132   0 206]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [20]:
import pandas as pd

results = {
    "Metric": ["Accuracy", "Precision", "Recall", "F1-score"],
    "Score": [accuracy, precision, recall, f1]
}

df = pd.DataFrame(results)
print(df)


      Metric     Score
0   Accuracy  0.396000
1  Precision  0.264485
2     Recall  0.396000
3   F1-score  0.317100


### Limitations and Challenges

There are several challenges that had been faced while implementing these models. 

Firstly, there has been limitation in obtaining the two datasets (English Wikipedia and SNLI) directly from URL as most of the servers have been blocked for direct use. To overcome this, various datasets from reliable resources (in this case - Kaggle) are browsed and downloaded manually into the machine and loaded into the notebook. 

Moreover, it took more than days to train the model without errors as tokenization needs further adjustments in merging the models with the first and second tasks. Another thing that has been learnt from this assignment is that data to be trained should be divided effectively from the original dataset as it makes the training time faster and efficient. 