In [None]:
from datasets import load_dataset

# Sampling settings for the title corpus.
seed = 1337
num_titles = 10000
val_frac = 0.1

# Load the train split (cache directory: ./data), then shuffle it with a
# fixed seed so the same rows are selected on every run.
ds = load_dataset("julien040/hacker-news-posts", split="train", cache_dir="./data")
ds = ds.shuffle(seed=seed)

# Keep the first `num_titles` shuffled rows, trimming surrounding whitespace
# from each title.
titles = [post["title"].strip() for post in ds.take(num_titles)]

# Number of titles covered by the (1 - val_frac) share of `num_titles`.
# NOTE(review): presumably the train/validation split index -- confirm at the use site.
n = int(num_titles * (1 - val_frac))

In [3]:
from omegaconf import OmegaConf
from dataclasses import dataclass
@dataclass
class Hyperparameters:
    """Typed container for the run-level settings taken from the config file.

    The declaration order of the fields is significant: it is the positional
    order of the generated ``__init__``.
    """

    # Run and data settings.
    seed: int
    epochs: int
    val_frac: float
    num_titles: int
    vocab_size: int
    context_length: int  # reused as block_size for the attention and GPT configs

    # Output file and model selection.
    log_file: str
    model_architecture: str  # key used to pick an entry of model_configs

    # Optimisation settings.
    batch_size: int
    lr: float
    weight_decay: float
    scheduler: str  # none, linear, cosine
    optimizer: str
    evals_per_epoch: float


from model.gpt import GPT, GPTConfig

@dataclass
class AttnConfig:
    """Settings bundle for the attention layer; passed to ``GPTConfig`` as ``attn_config``."""

    d_model: int     # taken from the selected model config
    n_head: int      # taken from the selected attention config
    block_size: int  # set to the context length
    dropout: float   # taken from the selected model config

# Load the experiment configuration. The YAML tree is kept under its own
# name so that `cfg` below refers to exactly one kind of object (the
# GPTConfig) instead of being rebound from the OmegaConf tree.
yaml_cfg = OmegaConf.load("config/hyperparams.yaml")

# Convert the selected sections to plain Python containers, resolving any
# interpolations. The model section is chosen by the `model_architecture`
# hyperparameter, the attention section by the model's `attention_layer`.
hparams = OmegaConf.to_container(yaml_cfg.hyperparams, resolve=True)
modelparams = OmegaConf.to_container(yaml_cfg.model_configs[hparams['model_architecture']], resolve=True)
attnparams = OmegaConf.to_container(yaml_cfg.attn_configs[modelparams['attention_layer']], resolve=True)

# Typed view of the run-level hyperparameters; raises TypeError if the YAML
# section has missing or unexpected keys.
args = Hyperparameters(**hparams)

# Attention settings: width and dropout come from the model section, head
# count from the attention section, block size from the context length.
attn = AttnConfig(
    d_model=modelparams['d_model'],
    n_head=attnparams['n_head'],
    block_size=args.context_length,
    dropout=modelparams['dropout']
)

# Final model configuration.
# NOTE(review): every key of `modelparams` is forwarded as a keyword, so the
# model section must not define `vocab_size`, `block_size`, `attn_config` or
# `activation_function` (duplicate keyword -> TypeError), and GPTConfig has
# to accept the section's other keys -- confirm against model/gpt.py.
cfg = GPTConfig(
    vocab_size=args.vocab_size,
    block_size=args.context_length,
    attn_config=attn,
    activation_function='gelu',
    **modelparams
)