In [1]:
# Masked Language Model With PyTorch

# 1. Build a tiny dataset of sentences and create a vocabulary
# 2. Tokenize the sentences into indices
# 3. Implement a masking function to randomly replace some tokens with a special <mask> token
# 4. Define a simple PyTorch model with an embedding layer and a linear layer that predicts the original token at each masked position
# 5. Set up a training loop where the model learns to reconstruct the masked tokens

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random

In [18]:
# Step 1: Build a tiny sentence corpus, then derive a vocabulary (word -> id)
# and its inverse (id -> word) from it.
sentences = [
    "the quick brown fox jumps over the lazy dog",
    "i love machine learning",
    "language models are fascinating"
]

# Gather every distinct whitespace-separated word in the corpus.
words = set()
for s in sentences:
    words |= set(s.split())

# Ids follow alphabetical order, so the mapping is the same on every run.
vocab = dict(zip(sorted(words), range(len(words))))

# Reserve the next free id for the special <mask> token (no-op if the corpus
# already contains it).
mask_token = "<mask>"
vocab.setdefault(mask_token, len(vocab))

# Invert only after <mask> is registered so both mappings cover the same ids.
inv_vocab = {idx: token for token, idx in vocab.items()}

vocab_size = len(vocab)

# Step 2: Tokenize the sentences into indices
def tokenize(s):
    """Convert a sentence into the list of vocabulary ids of its words.

    The sentence is split on whitespace and each word is looked up in the
    module-level ``vocab``; a word missing from ``vocab`` raises ``KeyError``.
    """
    ids = []
    for token in s.split():
        ids.append(vocab[token])
    return ids