In [1]:
import sys, os
import json
import torch

# Put the current working directory on sys.path (once) so the `src` package
# imported below can be resolved when the notebook is run from the project root.
project_root = os.path.abspath(".")
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.config import config
from src.training.train import train_model

In [5]:
def build_config(overrides=None):
    """
    Return a copy of the base ``config`` with optional overrides applied.

    Parameters
    ----------
    overrides : optional
        Values passed to the copy's ``update``; skipped when falsy
        (``None`` or empty).

    Returns
    -------
    The result of ``config.copy()``, updated with ``overrides`` when given.
    The module-level ``config`` object itself is not updated.
    """
    merged = config.copy()
    if not overrides:
        return merged
    merged.update(overrides)
    return merged

def show_config(cfg):
    """
    Print each configuration entry on its own line as ``key: value``.

    Parameters
    ----------
    cfg
        Object exposing ``items()`` that yields ``(key, value)`` pairs.
    """
    for name, value in cfg.items():
        print(f"{name}: {value}")

In [None]:
import json
import matplotlib.pyplot as plt

def plot_metrics(tokenizer_name: str, model_type: str, results_dir: str = "results"):
    """
    Plot per-epoch training and validation loss for one tokenizer/model run.

    The metrics are read from
    ``{results_dir}/metrics_{tokenizer_name}_{model_type}.json``. The file must
    hold a sequence of records, each indexed by ``"epoch"``, ``"train_loss"``
    and ``"val_loss"``. Both loss curves are drawn on a single figure, which is
    then shown.

    Parameters
    ----------
    tokenizer_name : str
        The tokenizer identifier (e.g., "word", "bpe", "unigram", etc.).
    model_type : str
        The model type (e.g., "transformer", "lstm").
    results_dir : str, default="results"
        Directory where metrics JSON files are stored.

    Raises
    ------
    FileNotFoundError
        If the metrics file for this tokenizer/model pair does not exist.
    KeyError
        If a record lacks one of the expected keys.
    """
    # The file name is derived purely from the two identifiers.
    with open(f"{results_dir}/metrics_{tokenizer_name}_{model_type}.json", "r") as handle:
        history = json.load(handle)

    # Pull out every series before any figure is created.
    epochs = [record["epoch"] for record in history]
    train_curve = [record["train_loss"] for record in history]
    val_curve = [record["val_loss"] for record in history]
    curves = (
        ("Train Loss", train_curve),
        ("Validation Loss", val_curve),
    )

    plt.figure(figsize=(7, 5))
    for label, losses in curves:
        plt.plot(epochs, losses, marker='o', label=label)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title(f"Training vs Validation Loss ({tokenizer_name}-{model_type})")
    plt.grid(True)
    plt.legend()
    plt.show()

In [None]:
def run_training(tokenizer_name, model_type, cfg):
    """
    Print the configuration, then run ``train_model`` for one combination.

    Parameters
    ----------
    tokenizer_name
        Tokenizer identifier forwarded to ``train_model``.
    model_type
        Model identifier forwarded to ``train_model``.
    cfg
        Configuration printed via ``show_config`` and forwarded to
        ``train_model`` as its ``cfg_overrides`` argument.

    Returns
    -------
    None
        Whatever ``train_model`` returns is discarded.
    """
    print("Starting training with config:")
    show_config(cfg)

    # Per the original note, train_model prints metrics and saves files itself,
    # so nothing is collected here.
    training_args = {
        "tokenizer_name": tokenizer_name,
        "model_type": model_type,
        "cfg_overrides": cfg,
    }
    train_model(**training_args)

word transformer

In [None]:
# Experiment: word tokenizer + transformer model.
tokenizer_name = "word"          # or bpe, unigram, byte
model_type = "transformer"       # or lstm

# Values applied on top of the base config for this run.
cfg_overrides = {
    "dataset_name": "wikitext-2-raw-v1",
    "context_length": 128,
    "batch_size": 32,
    "num_epochs": 5,
    "learning_rate": 1e-3,
    "model_dim": 128,
    "num_layers": 2,
    "num_heads": 2,
    "dropout": 0.3,
    "device": "cuda",
}

cfg = build_config(cfg_overrides)
show_config(cfg)
run_training(tokenizer_name, model_type, cfg)
# Plot using the same identifiers that were passed to run_training, rather than
# repeating them as literals; plot_metrics locates the metrics file by these names.
plot_metrics(tokenizer_name, model_type)

word lstm

In [None]:
# Experiment: word tokenizer + LSTM model.
tokenizer_name = "word"          # or bpe, unigram, byte
model_type = "lstm"              # or transformer

# Values applied on top of the base config for this run.
cfg_overrides = {
    "dataset_name": "wikitext-2-raw-v1",
    "context_length": 128,
    "batch_size": 32,
    "num_epochs": 5,
    "learning_rate": 1e-4,
    "model_dim": 128,
    "num_layers": 2,
    "num_heads": 2,
    "dropout": 0.3,
    "device": "cuda",
}

cfg = build_config(cfg_overrides)
show_config(cfg)
run_training(tokenizer_name, model_type, cfg)
# Plot this run (word/lstm). The call previously hard-coded "word"/"transformer",
# which displayed a different experiment's metrics file.
plot_metrics(tokenizer_name, model_type)

bpe transformer

In [None]:
# Experiment: BPE tokenizer + transformer model.
tokenizer_name = "bpe"           # or word, unigram, byte
model_type = "transformer"       # or lstm

# Values applied on top of the base config for this run.
cfg_overrides = {
    "dataset_name": "wikitext-2-raw-v1",
    "context_length": 128,
    "batch_size": 32,
    "num_epochs": 5,
    "learning_rate": 8e-4,
    "model_dim": 128,
    "num_layers": 2,
    "num_heads": 2,
    "dropout": 0.3,
    "device": "cuda",
}

cfg = build_config(cfg_overrides)
show_config(cfg)
run_training(tokenizer_name, model_type, cfg)
# Plot this run (bpe/transformer). The call previously hard-coded
# "word"/"transformer", which displayed a different experiment's metrics file.
plot_metrics(tokenizer_name, model_type)

bpe lstm

In [None]:
# Experiment: BPE tokenizer + LSTM model.
tokenizer_name = "bpe"           # or word, unigram, byte
model_type = "lstm"              # or transformer

# Values applied on top of the base config for this run.
cfg_overrides = {
    "dataset_name": "wikitext-2-raw-v1",
    "context_length": 128,
    "batch_size": 32,
    "num_epochs": 5,
    "learning_rate": 1e-4,
    "model_dim": 128,
    "num_layers": 2,
    "num_heads": 2,
    "dropout": 0.3,
    "device": "cuda",
}

cfg = build_config(cfg_overrides)
show_config(cfg)
run_training(tokenizer_name, model_type, cfg)
# Plot this run (bpe/lstm). The call previously hard-coded "word"/"transformer",
# which displayed a different experiment's metrics file.
plot_metrics(tokenizer_name, model_type)

unigram transformer

In [None]:
# Experiment: unigram tokenizer + transformer model.
tokenizer_name = "unigram"       # or word, bpe, byte
model_type = "transformer"       # or lstm

# Values applied on top of the base config for this run.
cfg_overrides = {
    "dataset_name": "wikitext-2-raw-v1",
    "context_length": 128,
    "batch_size": 32,
    "num_epochs": 5,
    "learning_rate": 1e-4,
    "model_dim": 128,
    "num_layers": 2,
    "num_heads": 2,
    "dropout": 0.3,
    "device": "cuda",
}

cfg = build_config(cfg_overrides)
show_config(cfg)
run_training(tokenizer_name, model_type, cfg)
# Plot this run (unigram/transformer). The call previously hard-coded
# "word"/"transformer", which displayed a different experiment's metrics file.
plot_metrics(tokenizer_name, model_type)

unigram lstm

In [None]:
# Experiment: unigram tokenizer + LSTM model.
tokenizer_name = "unigram"       # or word, bpe, byte
model_type = "lstm"              # or transformer

# Values applied on top of the base config for this run.
cfg_overrides = {
    "dataset_name": "wikitext-2-raw-v1",
    "context_length": 128,
    "batch_size": 32,
    "num_epochs": 5,
    "learning_rate": 1e-4,
    "model_dim": 128,
    "num_layers": 2,
    "num_heads": 2,
    "dropout": 0.3,
    "device": "cuda",
}

cfg = build_config(cfg_overrides)
show_config(cfg)
run_training(tokenizer_name, model_type, cfg)
# Plot this run (unigram/lstm). The call previously hard-coded
# "word"/"transformer", which displayed a different experiment's metrics file.
plot_metrics(tokenizer_name, model_type)

byte transformer

In [None]:
# Experiment: byte tokenizer + transformer model.
tokenizer_name = "byte"          # or word, bpe, unigram
model_type = "transformer"       # or lstm

# Values applied on top of the base config for this run.
cfg_overrides = {
    "dataset_name": "wikitext-2-raw-v1",
    "context_length": 128,
    "batch_size": 32,
    "num_epochs": 5,
    "learning_rate": 1e-4,
    "model_dim": 128,
    "num_layers": 2,
    "num_heads": 2,
    "dropout": 0.3,
    "device": "cuda",
}

cfg = build_config(cfg_overrides)
show_config(cfg)
run_training(tokenizer_name, model_type, cfg)
# Plot this run (byte/transformer). The call previously hard-coded
# "word"/"transformer", which displayed a different experiment's metrics file.
plot_metrics(tokenizer_name, model_type)

byte lstm

In [None]:
# Experiment: byte tokenizer + LSTM model.
tokenizer_name = "byte"          # or word, bpe, unigram
model_type = "lstm"              # or transformer

# Values applied on top of the base config for this run.
cfg_overrides = {
    "dataset_name": "wikitext-2-raw-v1",
    "context_length": 128,
    "batch_size": 32,
    "num_epochs": 5,
    "learning_rate": 1e-4,
    "model_dim": 128,
    "num_layers": 2,
    "num_heads": 2,
    "dropout": 0.3,
    "device": "cuda",
}

cfg = build_config(cfg_overrides)
show_config(cfg)
run_training(tokenizer_name, model_type, cfg)
# Plot this run (byte/lstm). The call previously hard-coded "word"/"transformer",
# which displayed a different experiment's metrics file.
plot_metrics(tokenizer_name, model_type)