#### Step 0: Prepare Environment - Import Libraries and select device

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

import datasets, math, re, random, time
from collections import Counter
from tqdm import tqdm

In [None]:
# Minimum required torch version for MPS support is "1.12+"; display the installed version to check.
torch.__version__

In [None]:
# Universal device selection: use a GPU backend if one is available, else the CPU.
import torch

def get_device():
    """Select the torch device to run on.

    Preference order is CUDA, then MPS, then CPU. For the GPU backends the
    backend's allocator cache is emptied before the device is returned.

    Returns:
        torch.device: ``cuda``, ``mps`` or ``cpu``.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Clear CUDA cache to free up memory
        return torch.device("cuda")      # NVIDIA GPU

    # `torch.backends.mps` is looked up defensively so that a torch build
    # without the MPS backend falls through to CPU instead of raising.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        # `torch.mps.empty_cache` is not present in every torch release that
        # supports MPS, so only call it when it exists.
        mps_module = getattr(torch, "mps", None)
        if mps_module is not None and hasattr(mps_module, "empty_cache"):
            mps_module.empty_cache()  # Clear MPS cache to avoid memory issues
        return torch.device("mps")       # Apple Silicon GPU

    # There is no `torch.empty_cache()`; calling it raised AttributeError on
    # every CPU-only machine, so the CPU branch simply returns the device.
    return torch.device("cpu")

device = get_device()

print(f"Using device: {device}")

Using device: mps


#### Step 1: Load Data

In [None]:
# Hugging Face Hub login token
# Reads HF_TOKEN from the environment of the running kernel process. This cell does not
# load a .env file itself, so the variable must already be set in that environment;
# HF_TOKEN is None when it is not set.
import os
HF_TOKEN = os.environ.get("HF_TOKEN")


In [None]:
# Hugging Face Hub login token
# Put HF_TOKEN=<your token> in your .env file; load_dotenv() is called first so the
# value is available through the process environment when it is read below.
import os

from dotenv import load_dotenv

load_dotenv()

# None when HF_TOKEN is not defined anywhere.
HF_TOKEN = os.getenv("HF_TOKEN")

Download the corpus the first time, then save it so that it can be loaded from local disk on later runs.

This is an example corpus downloaded from Hugging Face. Load a different dataset as required.

In [None]:
import os
from datasets import Dataset, load_dataset

# The data folder is not uploaded to GitHub.
_DATA_PATH = "../data/wikitext-103"

# One local Parquet file per split. They are written after the first download and
# read back on later runs. The files are Parquet (written with `to_parquet`, read
# with `from_parquet`), so they carry a .parquet extension rather than .arrow.
_SPLIT_FILES = {
    split: os.path.join(_DATA_PATH, f"wikitext-103-{split}.parquet")
    for split in ("train", "validation", "test")
}
_DATA_FILENAME = _SPLIT_FILES["train"]  # kept so existing references keep working
os.makedirs(_DATA_PATH, exist_ok=True)

# Use the local copies only when every split is present; checking the train file
# alone would crash on a missing validation/test file.
if all(os.path.exists(path) for path in _SPLIT_FILES.values()):
    # Load from local Parquet files
    _splits = {split: Dataset.from_parquet(path) for split, path in _SPLIT_FILES.items()}
    print("Loaded datasets from local Parquet files.")
else:
    # Download, then save to the local folder so the next run takes the branch above
    _splits = {}
    for split, path in _SPLIT_FILES.items():
        _splits[split] = load_dataset("wikitext", "wikitext-103-raw-v1", split=split, cache_dir=_DATA_PATH)
        # Write to a temporary name first and rename afterwards, so an interrupted
        # write never leaves a truncated file that later looks like a valid cache.
        _tmp_path = path + ".tmp"
        _splits[split].to_parquet(_tmp_path)
        os.replace(_tmp_path, path)

dataset_train = _splits["train"]
dataset_valid = _splits["validation"]
dataset_test = _splits["test"]

print(f"Dataset size: {len(dataset_train)}")
print(f"Validation set size: {len(dataset_valid)}")
print(f"Test set size: {len(dataset_test)}")

In [None]:
# Preview the first 5 training entries, truncating each text
# to its first 80 characters to keep the output short.
[entry[:80] for entry in dataset_train[:5]["text"]]

#### Step 2: Data preprocessing


In [None]:
# Skip blank entries, lower-case the rest and delete the punctuation
# characters . , ! ? - (other symbols are left in place).
texts = [
    re.sub("[.,!?\\-]", '', line.lower())
    for line in filter(str.strip, dataset_train["text"])
]

#### Step 3. Tokenization

Depending on the dataset size, use either spaCy or NLTK.

spaCy is very slow on large datasets, so it is better to default to NLTK for educational purposes. If the dataset is small (below 100_000), use spaCy.

In [None]:
import spacy

# Load the small English pipeline with the listed components disabled and add the
# "sentencizer" component, which supplies the sentence boundaries used below.
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "lemmatizer"])
nlp.add_pipe("sentencizer")

# nlp.pipe processes the texts in batches of 1000 for efficiency; every document
# contributes one inner list holding the text of each of its sentences.
# NOTE(review): the preprocessing step strips '.', '!' and '?' from `texts` before this
# cell runs, so the sentencizer presumably finds few boundaries — confirm this is intended.
sentences = [
    [span.text for span in parsed.sents]
    for parsed in nlp.pipe(texts, batch_size=1000)
]

OR

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Project-local folder for the NLTK tokenizer data.
_DOWNLOAD_DIR = "../models/nltk_data"
os.makedirs(_DOWNLOAD_DIR, exist_ok=True)

# Register the folder on NLTK's search path exactly once. An unconditional append
# adds another duplicate entry every time this cell is re-run.
if _DOWNLOAD_DIR not in nltk.data.path:
    nltk.data.path.append(_DOWNLOAD_DIR)

nltk.download('punkt', download_dir=_DOWNLOAD_DIR)
nltk.download('punkt_tab', download_dir=_DOWNLOAD_DIR)

# Same cleaning as the preprocessing step (lower-case, drop blank entries, delete
# . , ! ? -), repeated here so this cell does not depend on the spaCy cell.
texts = [re.sub("[.,!?\\-]", '', t.lower()) for t in dataset_train["text"] if t.strip()]

# One list of word tokens per cleaned text.
tokenized_texts = [word_tokenize(text) for text in texts]

# Example: print the first 5 tokenized samples
for tokens in tokenized_texts[:5]:
    print(tokens)

#### Step 4: Numericalization

Next, we create a function (torchtext calls these vocabs) that turns the tokens into integers. Here we build our own Vocab class because torchtext.vocab is not supported on Python 3.13+.

#### Step 5: Prepare data loaders


#### Step 6: Design Model

#### Step 7: Design Encoder

#### Step 8: Design Attention Mechanisms

#### Step 9: Design decoder to pass attention

#### Step 10: Model Training

We use a simplified version of the weight initialization scheme used in the paper. Here, we will initialize all biases to zero and all weights from $\mathcal{N}(0, 0.01^2)$, i.e. a normal distribution with mean 0 and standard deviation 0.01.

In [4]:
def initialize_weights(m):
    """Re-initialize every parameter of ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution with mean 0 and standard deviation 0.01; every other
    parameter (e.g. biases) is set to zero.

    Args:
        m: object exposing ``named_parameters()``, such as an ``nn.Module``.
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            # Anything that is not a weight is zeroed.
            nn.init.zeros_(tensor.data)
            continue
        nn.init.normal_(tensor.data, mean=0.0, std=0.01)

#### Step 11. Model Evaluation

#### Step 12. Inference

#### Step 13: Save all models