In [1]:
from dataclasses import dataclass

import torch
import simple_parsing
from datasets import load_dataset, Dataset
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM
from transformers import AutoTokenizer


@dataclass
class Config:
    model_id: str = "HuggingFaceM4/tiny-random-LlamaForCausalLM"
    dataset_id: str = "roneneldan/TinyStories"
    batch_size: int = 4
    eval_steps: int = 200
    save_steps: int = 200
    warmup_steps: int = 100
    num_train_epochs: int = 3
    sample_size: int = 1000
    block_size: int = 32

In [42]:
def get_dataset(dataset_id, tokenizer, sample=1000, test_pct=0.2, block_size=128):
    dataset = load_dataset(dataset_id)["train"]
    
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)
    
    dataset = (dataset.select(range(sample))
                      .map(tokenize_function, remove_columns="text", batched=True)
                      .map(lambda x: {"input_ids": x['input_ids'][:block_size], 
                                      "labels": x['input_ids'][:block_size]})
                      .train_test_split(test_pct))


    return dataset["train"], dataset["test"]


In [43]:
config = Config()

model = AutoModelForCausalLM.from_pretrained(config.model_id)
tokenizer = AutoTokenizer.from_pretrained(config.model_id) 
tokenizer.pad_token = tokenizer.eos_token
train_dataset, eval_dataset = get_dataset(config.dataset_id, tokenizer, sample=config.sample_size)


Repo card metadata block was not found. Setting CardData to empty.
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1573.64 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1403.41 examples/s]


In [44]:
type(train_dataset[0]["input_ids"]), train_dataset[0].keys(), len(train_dataset[0]["input_ids"]), len(train_dataset[0]["labels"])

(list, dict_keys(['input_ids', 'attention_mask', 'labels']), 128, 128)

In [50]:
training_args = TrainingArguments(
    output_dir="./random_tiny_llama", #The output directory
    num_train_epochs=config.num_train_epochs, # number of training epochs
    per_device_train_batch_size=config.batch_size, # batch size for training
    per_device_eval_batch_size=config.batch_size*2,  # batch size for evaluation
    eval_steps = config.eval_steps, # Number of update steps between two evaluations.
    save_steps=config.save_steps, # after # steps model is saved
    warmup_steps=config.warmup_steps,# number of warmup steps for learning rate scheduler
    logging_strategy="steps",
    logging_steps=1,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer.train()

Step,Training Loss
1,9.5449
2,9.5449
3,9.5449
4,9.5448
5,9.5448
6,9.5447
7,9.5446
8,9.5445
9,9.5444
10,9.5443


TrainOutput(global_step=600, training_loss=9.246492959658305, metrics={'train_runtime': 7.5266, 'train_samples_per_second': 318.871, 'train_steps_per_second': 79.718, 'total_flos': 958965350400.0, 'train_loss': 9.246492959658305, 'epoch': 3.0})

In [49]:
dl = trainer.get_train_dataloader()
len(dl)



200