# **Masked Word Prediction**

**Dataset decription.**
The dataset consists of training and test files in plain text format (.txt). Both utilize a limited vocabulary of approximately 20 words. While the sentences don't necessarily follow proper English grammar, they share similar linguistic patterns and structure. The training set contains 10,000 complete sentences. The test set includes 30,000 sentences, each with exactly one word replaced by a `<mask>` token. The task is to develop a model that accurately predicts these masked words.

**Overview of the notebook**
Here is an overview of the steps needed for this task:
1. *Data Loading.* 
2. *Data Preparation.* 
3. *Model Architecture.* 
4. *Training Implementation.* 
5. *Prediction.* 

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import random

from collections import Counter # to build vocab easily

### 1. Load the data

In [None]:
# Load text file
def load_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read().lower()
    return text

train_data = load_text("train_data.txt")
test_data = load_text("test_data.txt")

In [None]:
# Load text file
def load_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read().lower()
    return text

train_data = load_text("train_data.txt")
test_data = load_text("test_data.txt")

# some examples
c = 0
print('----EXAMPLE TRAINING----')
for sentence in train_data.split('.'):
    print(sentence.strip() + '.')
    c += 1
    if c > 4:
        break

c = 0
print('\n----EXAMPLE TEST----')
for sentence in test_data.split('.'):
    print(sentence.strip() + '.')
    c += 1
    if c > 4:
        break

----EXAMPLE TRAINING SENTENCES----
a cat chases on a small horse.
this dog in the big park in the big park near the big bird chases near the small park.
the big dog chases this park in the big park.
the small dog near the big bird likes near the big dog.
the car near this big park in the park near the small dog in the small park on the small dog is big.

----EXAMPLE TEST SENTENCES----
this big car is <mask>.
the car in a car is big <mask> this <mask> is big.
this big bird sees near this big bird near the big <mask> in a bird.
a big bird in a <mask> cat chases a bird.
the big <mask> likes this car.


## 2. Tokenizer

In [None]:
# Tokenization and Vocabulary - assuming everything is in the vocab from train 
# no option for pad or unknown token
def build_vocab(text, min_freq=1):
    tokens = text.split()
    counter = Counter(tokens)
    vocab = {word: idx for idx, (word, count) in enumerate(counter.items(), start=2) if count >= min_freq}
    vocab["<mask>"] = 0
    vocab["<unk>"] = 1
    return vocab

vocab = build_vocab(train_data)
print(f'vocabulary size is {len(vocab)}')
vocab

vocabulary size is 22


{'a': 2,
 'cat': 3,
 'chases': 4,
 'on': 5,
 'small': 6,
 'horse': 7,
 '.': 8,
 'this': 9,
 'dog': 10,
 'in': 11,
 'the': 12,
 'big': 13,
 'park': 14,
 'near': 15,
 'bird': 16,
 'likes': 17,
 'car': 18,
 'is': 19,
 'house': 20,
 'sees': 21,
 '<mask>': 0,
 '<unk>': 1}

In [5]:
# tokenizer
class Tokenizer:
    def __init__(self, vocab):
        self.vocab = vocab
        self.inv_vocab = {idx: word for word, idx in vocab.items()}

    def encode(self, text):
        text = text.lower().replace('.', ' .') # to ensure <mask>. can be spit
        special_tokens = {"<mask>"}  # Ensure special tokens are not split
        tokens = text.split()  # Use simple split() to preserve "<mask>" as a single token
        return [self.vocab.get(token, vocab["<unk>"]) for token in tokens]

    def decode(self, tokens):
        return ' '.join([self.inv_vocab.get(token, "<unk>") for token in tokens])

tokenizer = Tokenizer(vocab)

example_sentence = 'The big dog sees a small cat. This house is great.'

encoded = tokenizer.encode(example_sentence)
encoded

[12, 13, 10, 21, 2, 6, 3, 8, 9, 20, 19, 1, 8]

In [6]:
tokenizer.decode(encoded)

'the big dog sees a small cat . this house is <unk> .'

## 3. `DataLoader` for training

In [None]:
# Masked Dataset Class 
import random
import torch
from torch.utils.data import Dataset

class MaskedDataset(Dataset):
    def __init__(self, text, vocab, seq_length=20, mask_prob=0.15):
        self.vocab = vocab
        self.tokens = tokenizer.encode(text)
        self.seq_length = seq_length
        self.mask_prob = mask_prob
        self.mask_token_id = self.vocab["<mask>"]  
        self.vocab_size = len(vocab)

    def __len__(self):
        return len(self.tokens) - self.seq_length

    def __getitem__(self, idx):
        # Get a sequence of length seq_length
        tokens = self.tokens[idx: idx + self.seq_length]
        masked_tokens, mask, actual_tokens = self.mask_tokens(tokens)
        return (torch.tensor(masked_tokens, dtype=torch.long),
                torch.tensor(mask, dtype=torch.long),
                torch.tensor(actual_tokens, dtype=torch.long))

    def mask_tokens(self, tokens):
        """
        Applies the BERT-style 80/10/10 masking to 'tokens':
          - With probability mask_prob, each token is chosen to be masked:
            * 80% of the time -> replace with <mask>
            * 10% of the time -> replace with a random token
            * 10% of the time -> keep the original token
        """
        masked_tokens = tokens.copy()
        actual_tokens = tokens.copy()
        mask = [0] * len(tokens)

        for i in range(len(tokens)):
            # Decide if we will mask this token
            if random.random() < self.mask_prob:
                mask[i] = 1

                rand_num = random.random()
                if rand_num < 0.8:
                    # 80% replace with <mask>
                    masked_tokens[i] = self.mask_token_id
                elif rand_num < 0.9:
                    # 10% replace with random token
                    masked_tokens[i] = random.randint(0, self.vocab_size - 1)
                else:
                    # 10% keep the same token
                    pass

        return masked_tokens, mask, actual_tokens


In [8]:
example_data = "the big dog sees a cat. the house near the park is big. the horse chases a small bird."

dataset = MaskedDataset(example_data, vocab, seq_length=5, mask_prob=0.2)
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)
print(f'length of the dataset: {len(dataset)}, number of batches = {len(dataloader)}')

for masked_tokens, mask, actual_tokens in dataloader:
    print(masked_tokens.shape, mask.shape, actual_tokens.shape)
    print(f'1st sample in batch : {tokenizer.decode(actual_tokens[0].tolist())}')
    print(f'2nd sample in batch : {tokenizer.decode(actual_tokens[1].tolist())}')
    break

length of the dataset: 17, number of batches = 9
torch.Size([2, 5]) torch.Size([2, 5]) torch.Size([2, 5])
1st sample in batch : the big dog sees a
2nd sample in batch : big dog sees a cat


Split into a train and validation set and set up the dataloaders.

In [9]:
seq_length = 10
dataset = MaskedDataset(train_data, vocab, seq_length=seq_length)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
len(dataset), len(train_loader), len(val_loader)

(107713, 6059, 674)

## 4. Training function

In [None]:
# train model
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device, num_epochs=5):
    model.to(device)
    global_step = 0 

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        correct_train = 0
        total_train = 0

        for masked_tokens, mask, actual_tokens in train_loader:
            masked_tokens, mask, actual_tokens = (
                masked_tokens.to(device),
                mask.to(device),
                actual_tokens.to(device)
            )

            optimizer.zero_grad()
            output = model(masked_tokens)
            loss = criterion(output.view(-1, output.size(-1)), actual_tokens.view(-1))
            loss.backward()
            optimizer.step()

            # Step the scheduler *every batch* rather than every epoch
            scheduler.step()

            global_step += 1

            total_train_loss += loss.item()

            # Calculate accuracy on masked tokens
            predicted_tokens = output.argmax(dim=-1)
            correct_train += ((predicted_tokens == actual_tokens) & (mask == 1)).sum().item()
            total_train += mask.sum().item()

        train_accuracy = (correct_train / total_train) * 100 if total_train > 0 else 0
        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_val_loss = 0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for masked_tokens, mask, actual_tokens in val_loader:
                masked_tokens, mask, actual_tokens = (
                    masked_tokens.to(device),
                    mask.to(device),
                    actual_tokens.to(device)
                )
                output = model(masked_tokens)
                loss = criterion(output.view(-1, output.size(-1)), actual_tokens.view(-1))
                total_val_loss += loss.item()

                predicted_tokens = output.argmax(dim=-1)
                correct_val += ((predicted_tokens == actual_tokens) & (mask == 1)).sum().item()
                total_val += mask.sum().item()

        val_accuracy = (correct_val / total_val) * 100 if total_val > 0 else 0
        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}, "
              f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, "
              f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")


## 5. Models and Training

In [11]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=500):
        super().__init__()

        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)  # Shape: (1, max_len, embed_dim)
        self.register_buffer('pe', pe)

    def forward(self, x):
      return x + self.pe[:, :x.size(1), :]

In [None]:
class WordPredictorTransformer(nn.Module):
    def __init__(self, vocab_size, hidden_dim, num_layers, num_heads):
        self.setting = {'name': 'WordPredictorTransformer',
                        'vocab_size': vocab_size,
                        'hidden_dim' : hidden_dim,
                        'num_layers' : num_layers,
                        'num_heads' : num_heads}
        super(WordPredictorTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.position = PositionalEncoding(hidden_dim)
        encoder = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dropout=0.1, dim_feedforward=4 * hidden_dim, batch_first=True) 
        self.transformer = nn.TransformerEncoder(
            encoder,
            num_layers=num_layers
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x) # first the embedding layer
        x = self.position(x)
        attention_out = self.transformer(x)
        out = self.fc(attention_out)
        return out

In [14]:
from transformers import get_linear_schedule_with_warmup

In [15]:
vocab_size = len(vocab)
# initiate model
model_trans = WordPredictorTransformer(vocab_size, hidden_dim=512, num_layers=8, num_heads=8)
# initiate loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model_trans.parameters(), lr=0.0001)
num_epochs = 20
num_training_steps = num_epochs * len(train_loader)
num_warmup_steps = int(0.1 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)
# set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'active device is {device}')
# train
train_model(model_trans, train_loader, val_loader, criterion, optimizer, scheduler, device, num_epochs=20)

active device is cpu
Epoch 1, Train Loss: 0.3252, Train Acc: 33.59%, Val Loss: 0.1313, Val Acc: 47.57%
Epoch 2, Train Loss: 0.1315, Train Acc: 48.06%, Val Loss: 0.1282, Val Acc: 48.46%
Epoch 3, Train Loss: 0.1284, Train Acc: 49.43%, Val Loss: 0.1204, Val Acc: 50.53%
Epoch 4, Train Loss: 0.1252, Train Acc: 50.10%, Val Loss: 0.1192, Val Acc: 52.06%
Epoch 5, Train Loss: 0.1225, Train Acc: 50.89%, Val Loss: 0.1153, Val Acc: 51.55%
Epoch 6, Train Loss: 0.1217, Train Acc: 51.24%, Val Loss: 0.1177, Val Acc: 52.48%
Epoch 7, Train Loss: 0.1198, Train Acc: 51.70%, Val Loss: 0.1166, Val Acc: 52.77%
Epoch 8, Train Loss: 0.1195, Train Acc: 51.74%, Val Loss: 0.1146, Val Acc: 52.98%
Epoch 9, Train Loss: 0.1178, Train Acc: 52.35%, Val Loss: 0.1172, Val Acc: 52.14%
Epoch 10, Train Loss: 0.1173, Train Acc: 52.41%, Val Loss: 0.1170, Val Acc: 52.85%
Epoch 11, Train Loss: 0.1160, Train Acc: 52.65%, Val Loss: 0.1135, Val Acc: 52.96%
Epoch 12, Train Loss: 0.1158, Train Acc: 52.83%, Val Loss: 0.1154, Val Acc:

In [None]:
torch.save(model_trans.state_dict(), "/content/model_transformer.pth")

In [None]:
# initialize from the correct class with correct parameters
model_trans = WordPredictorTransformer(vocab_size, hidden_dim=32, num_layers=2, num_heads=4)
model_trans.load_state_dict(torch.load("/content/model_transformer.pth", weights_only=True))

Saved `setting` attribute to the model.

In [None]:
def save_model(model, path='/content/model.pth'):
    torch.save({
        'state_dict': model.state_dict(),
        'setting': model.setting  # Save variable parameters
    }, path)

save_model(model_trans, '/content/model_transformer.pth')

In [None]:
checkpoint = torch.load('/content/model_transformer.pth')
checkpoint['setting']

Rebuild the model

In [None]:
model_new = WordPredictorTransformer(vocab_size, hidden_dim=32, num_layers=2, num_heads=4)
model_new.load_state_dict(checkpoint['state_dict'])

## Predicting on the test data

To predict masked words in the test data, segment it into sequences matching training sequence length, allowing for overlapping sections to ensure each mask is predicted once. 

In [None]:
test_data = load_text("/content/test_data.txt")
sum([token==vocab["<mask>"] for token in tokenizer.encode(test_data)])

In [None]:
from torch.nn.functional import softmax
def predict_words(model, text, vocab, seq_length, device):
    model.to(device)
    model.eval()

    tokenized = tokenizer.encode(text)
    mask_token_id = vocab["<mask>"]
    target_size = sum([token==vocab["<mask>"] for token in tokenized])
    mask_positions = [i for i, token in enumerate(tokenized) if token == mask_token_id]
    predicted_words = ['']*target_size
    visited_positions = set()

    # Process text in sliding windows to maintain context
    step_size = seq_length // 2  # Overlapping step to retain context
    for start in range(0, len(tokenized), step_size):
        end = min(start + seq_length, len(tokenized))
        chunk = tokenized[start:end]

        # Check if there are masks in this chunk
        local_mask_positions = [i for i in mask_positions if start <= i < end]
        if not local_mask_positions:
            continue  # Skip if no mask in this chunk

        # Convert to tensor and move to device
        input_tensor = torch.tensor([chunk]).to(device)

        # Get model predictions
        with torch.no_grad():
            outputs = model(input_tensor)
            logits = outputs[0]  # Shape: (seq_length, vocab_size)

        # Convert logits to probabilities
        probs = softmax(logits, dim=-1)

        # Predict words for mask positions
        for pos in local_mask_positions:
            global_mask_index = mask_positions.index(pos)
            relative_pos = pos - start  # Convert global index to local chunk index
            predicted_token_id = torch.argmax(probs[relative_pos]).item()
            predicted_token = tokenizer.decode([predicted_token_id])
            if global_mask_index not in visited_positions: # ensures if prediciton already made, then skip
                predicted_words[global_mask_index] = predicted_token
                visited_positions.add(global_mask_index)

    return predicted_words

In [None]:
predicted = predict_words(model_new, text=test_data, vocab=vocab, seq_length=seq_length, device=device)
len(predicted)

In [None]:
print(f'first 5 predictions: {predicted[:5]}')
print(f'last  5 predictions: {predicted[-5:]}')

In [None]:
def save_csv(predicted_words, path='/content/', filename='predictions'):
    import pandas as pd
    assert len(predicted_words) == 30000 

    df = pd.DataFrame({
        'id': range(30000),
        'prediction': predicted_words
    })
    df.to_csv(path+filename+'.csv', index=False)

In [None]:
save_csv(predicted, filename='basic_notebook') 