**Loading the Dataset**

In [None]:
# load the names dataset from file
from pathlib import Path

# the dataset directory is addressed relative to the current working directory:
# up two levels, then into "data"
data_dir = Path.cwd().joinpath("..", "..", "data")


def load_names(path: Path) -> list[str]:
    """Read a text file and return its lines as a list of strings.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so the same file yields the same names on every machine.

    Args:
        path: location of the text file to read.

    Returns:
        One string per line of the file, with line terminators removed.
        An empty file yields an empty list.
    """
    return path.read_text(encoding="utf-8").splitlines()


# read names.txt from the data directory; `words` holds one string per line of the file
words = load_names(data_dir / "names.txt")

**A Bigram Model**

In [None]:
from makemore.ngram_stat import StatisticalNGram

# n=2 makes this a bigram model; train it on the complete list of names
model = StatisticalNGram(n=2)
model.train(words)

# loss over the very same words the model was trained on (no held-out data here);
# the bare name on the last line makes the cell display the value
loss = model.loss(words)
loss

In [None]:
# NOTE(review): presumably generates 8 new names from the bigram model trained above —
# confirm against StatisticalNGram.sample
model.sample(8)

**A Trigram Model**

In [None]:
# n=3 makes this a trigram model; `model` is rebound, replacing the bigram model above
model = StatisticalNGram(n=3)
model.train(words)

# loss over the very same words the model was trained on (no held-out data here);
# the bare name on the last line makes the cell display the value
loss = model.loss(words)
loss

In [None]:
# NOTE(review): presumably generates 8 new names from the trigram model trained above —
# confirm against StatisticalNGram.sample
model.sample(8)

**Hyperparameter Tuning**

In [None]:
import random


def train_test_split(
    data: list[str], test_size: float = 0.2, random_state: int = 0
) -> tuple[list[str], list[str]]:
    """Randomly partition `data` into a train list and a test list.

    Every element is assigned to the test list independently with probability
    `test_size`, so the resulting test share is approximate rather than exact
    (and either list may be empty for very small inputs). The relative order
    of the elements is preserved inside both lists.

    Args:
        data: the elements to partition.
        test_size: probability of an element landing in the test list;
            must lie strictly between 0.0 and 1.0.
        random_state: seed for the random draws; the same seed and the same
            input always produce the same partition.

    Returns:
        A `(train, test)` tuple of lists.

    Raises:
        ValueError: if `test_size` is not strictly between 0.0 and 1.0.
    """
    if not 0.0 < test_size < 1.0:
        raise ValueError("test size must be on (0.0, 1.0)")

    # a private generator produces the same draws as seeding the module-level
    # one, but leaves the process-wide random state untouched for other code
    rng = random.Random(random_state)

    train, test = [], []
    for element in data:
        bucket = test if rng.random() < test_size else train
        bucket.append(element)

    return train, test

In [None]:
from typing import Any
import itertools


def search_grid(hp: dict[str, list[Any]]) -> list[dict[str, Any]]:
    """Expand a hyperparameter specification into every possible combination.

    Each key of `hp` maps to the candidate values for that hyperparameter. The
    result contains one dict per element of the Cartesian product of those
    candidate lists, with the first key varying slowest.
    """
    names = list(hp)
    value_combinations = itertools.product(*(hp[name] for name in names))
    return [dict(zip(names, combination)) for combination in value_combinations]

In [None]:
def kfold_split(
    data: list[str], k: int = 3, shuffle: bool = False, random_state: int = 0
) -> list[list[str]]:
    """Split `data` into `k` consecutive folds.

    The first `k - 1` folds each hold `len(data) // k` elements; the final
    fold additionally receives the remainder, so it can be up to `k - 1`
    elements larger. When `len(data) < k` the leading folds are empty and the
    final fold holds everything.

    Args:
        data: the elements to split.
        k: number of folds; must be at least 2.
        shuffle: when true, split a random permutation of `data` instead of
            `data` in its original order. `data` itself is never modified.
        random_state: seed for the permutation; only relevant when `shuffle`
            is true.

    Returns:
        A list of `k` lists that together contain every element of `data`.

    Raises:
        ValueError: if `k` is smaller than 2.
    """
    if k < 2:
        raise ValueError("k must be at least 2")

    if shuffle:
        # a private generator yields the same permutation as seeding the
        # module-level one, without altering the process-wide random state
        items = random.Random(random_state).sample(data, len(data))
    else:
        items = data

    fold_size = len(items) // k
    folds = [items[i * fold_size : (i + 1) * fold_size] for i in range(k - 1)]
    # the last fold runs to the end so that it absorbs the remainder
    folds.append(items[(k - 1) * fold_size :])
    return folds

In [None]:
from typing import Generator


def kfold_split_cv(
    data: list[str], k: int = 3, shuffle: bool = False, random_state: int = 0
) -> Generator[tuple[list[str], list[str]], None, None]:
    """Yield the `k` (train, validation) pairs of a k-fold cross-validation.

    The data is split into `k` folds with `kfold_split`. Each fold takes one
    turn as the validation set while the elements of all remaining folds,
    concatenated in fold order, form the matching training set.
    """
    folds = kfold_split(data, k, shuffle, random_state)
    for held_out in range(k):
        training: list[str] = []
        for index, fold in enumerate(folds):
            if index != held_out:
                training.extend(fold)
        yield training, folds[held_out]

In [None]:
from makemore.ngram_stat import StatisticalNGram


def tune_hyperparameters(
    ngram_size: int,
    data: list[str],
    hyperparameters: dict[str, list[Any]],
    cv_nfolds: int = 5,
    random_state: int = 0,
) -> tuple[dict[str, Any], float]:
    """Grid-search hyperparameters with k-fold cross-validation.

    Every combination produced by `search_grid(hyperparameters)` is scored by
    its mean validation loss over `cv_nfolds` folds of `data`; the combination
    with the lowest mean loss wins. All combinations are scored on the same
    folds.

    Args:
        ngram_size: first positional argument passed to `StatisticalNGram`.
        data: the words used for cross-validation.
        hyperparameters: maps each keyword argument of `StatisticalNGram` to
            the candidate values to try for it.
        cv_nfolds: number of cross-validation folds.
        random_state: seed for shuffling the data before it is split.

    Returns:
        `(best_hp, best_loss)`: the winning combination and its mean
        validation loss. If no combination achieves a mean loss below
        infinity (or the grid is empty), this is `({}, inf)`.

    Raises:
        ValueError: propagated from `kfold_split` when `cv_nfolds` is below 2.
    """
    # the folds depend only on data, cv_nfolds and random_state, so they are
    # built once and shared by every candidate instead of being rebuilt each time
    folds = list(
        kfold_split_cv(data, k=cv_nfolds, shuffle=True, random_state=random_state)
    )

    # the best loss we've encountered
    best_loss = float("inf")
    # the best hyperparameter combination we have seen so far
    best_hp: dict[str, Any] = {}

    for hp in search_grid(hyperparameters):
        aggregate_loss = 0.0
        for train_fold, validation_fold in folds:
            # a fresh model per fold guarantees that nothing fitted on an
            # earlier fold (which contained this validation fold) carries over;
            # whether StatisticalNGram.train resets prior state is not visible here
            model = StatisticalNGram(ngram_size, **hp)
            # train the model on k - 1 folds
            model.train(train_fold)
            # compute loss on the remaining fold; accumulate loss
            aggregate_loss += model.loss(validation_fold)

        # compute the mean loss across all cv iterations
        mean_loss = aggregate_loss / cv_nfolds
        if mean_loss < best_loss:
            best_loss = mean_loss
            best_hp = hp

    return best_hp, best_loss

**Putting It All Together**

In [None]:
# create our global train / test split once, with the function's defaults
# (test_size=0.2, random_state=0); hyperparameter tuning and fitting below use
# only `train`, while `test` is kept for the final evaluation
train, test = train_test_split(words)

In [None]:
# search space for tuning: each key is forwarded to StatisticalNGram as a keyword
# argument, and each listed value is tried as a candidate for it
hyperparameters = {"smoothing": [0, 1, 3, 5, 10, 20]}

In [None]:
# n-gram size 2: bigram model
ngram_size = 2

# compute the best hyperparameters via cross-validated grid search on the training split only
best_hp, best_cv_loss = tune_hyperparameters(ngram_size, train, hyperparameters)

# build a new model with the winning hyperparameters and train it on the whole training split
model = StatisticalNGram(ngram_size, **best_hp)
model.train(train)

# loss on the held-out test split, which was not passed to tuning or training
test_loss = model.loss(test)

# the `=` specifier prints each variable's name next to its value
print(f"{best_hp=}")
print(f"{best_cv_loss=}")
print(f"{test_loss=}")

In [None]:
# n-gram size 3: trigram model; this cell rebinds ngram_size, best_hp, best_cv_loss,
# model and test_loss
ngram_size = 3

# compute the best hyperparameters via cross-validated grid search on the training split only
best_hp, best_cv_loss = tune_hyperparameters(ngram_size, train, hyperparameters)

# build a new model with the winning hyperparameters and train it on the whole training split
model = StatisticalNGram(ngram_size, **best_hp)
model.train(train)

# loss on the held-out test split, which was not passed to tuning or training
test_loss = model.loss(test)

# the `=` specifier prints each variable's name next to its value
print(f"{best_hp=}")
print(f"{best_cv_loss=}")
print(f"{test_loss=}")