# train.ipynb
This notebook contains the training code for a small language model using the TRL (Transformer Reinforcement Learning) library.

We'll train a decoder-only (GPT-2 style) model with a custom, smaller configuration, using the DebertaV2 (SentencePiece) tokenizer provided in the `data` folder.


## Installation
First, we need to install the required dependencies for training:


In [None]:
# %pip install transformers trl bitsandbytes tf-keras
%pip install tf-keras

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\tbhro\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


: 

In [None]:
import os

# Force CPU usage. The variable is exported before torch/transformers are imported so
# that no library gets a chance to probe CUDA while the GPUs are still visible.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import torch
from datasets import load_dataset
from transformers import AutoConfig, AutoModelForCausalLM, DebertaV2Tokenizer, set_seed
from trl import SFTTrainer, SFTConfig

# Belt and braces: make every later availability check report that no GPU exists.
torch.cuda.is_available = lambda: False

def preprocess_logits_for_metrics(logits, labels):
    """Reduce model logits to predicted token IDs for evaluation.

    Only the argmax over the last (vocabulary) dimension is returned, so the
    vocabulary-sized scores do not have to be carried into metric computation.

    Args:
        logits: Tensor whose last dimension indexes the vocabulary, or a tuple whose
            first element is that tensor (models may bundle extra outputs, such as
            cached key/values, together with the logits).
        labels: Unused; accepted because the evaluation hook is called with it.

    Returns:
        Tensor of predicted token IDs with the last dimension removed.
    """
    if isinstance(logits, tuple):
        # Extra model outputs may come along with the logits; the logits are first.
        logits = logits[0]
    return torch.argmax(logits, dim=-1)  # Highest-scoring token at each position


def compute_metrics(eval_pred):
    """Calculate next-token accuracy from predicted token IDs and labels.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of equally shaped integer arrays.
            ``predictions`` are the token IDs returned by
            ``preprocess_logits_for_metrics``; ``labels`` use -100 for positions
            that must be ignored.

    Returns:
        ``{"acc": accuracy}`` where accuracy is the fraction of non-ignored label
        positions whose token was predicted correctly. ``0.0`` is returned when no
        position is left to score, instead of dividing by zero.
    """
    predictions, labels = eval_pred

    # A causal LM's output at position i is its guess for token i + 1, so drop the
    # last prediction and the first label to line the two sequences up. Comparing
    # them unshifted would score the model on reproducing the current token.
    predictions = predictions[..., :-1].reshape(-1)
    labels = labels[..., 1:].reshape(-1)

    # Only consider real tokens (labels == -100 marks ignored/padded positions)
    mask = labels != -100
    total = int(mask.sum())
    if total == 0:
        return {"acc": 0.0}

    correct = int((predictions[mask] == labels[mask]).sum())
    return {"acc": correct / total}

In [None]:
# GPT-2 is used only as an architecture template; the model below is built from the
# configuration, so no pretrained weights are loaded.
model_name = "openai-community/gpt2"

# Load the custom tokenizer trained on BabyLM dataset
tokenizer = DebertaV2Tokenizer('data/tokenizer.model')

# Create custom configuration based on GPT-2 but with smaller dimensions.
# GPT-2's own field names are used here: `intermediate_size` is not a GPT-2 config
# field, so assigning it would leave the feed-forward width at its default (4 * n_embd).
config = AutoConfig.from_pretrained(model_name)
config.n_embd = 384  # Hidden size; same dimensionality as the best performing BabyLM model.
config.n_inner = 1280  # Feed-forward intermediate size
config.vocab_size = len(tokenizer)  # Full vocabulary, including any added special tokens

# The template config carries GPT-2's own special-token ids; point them at the ids of
# the custom tokenizer so they stay inside the new vocabulary.
config.bos_token_id = tokenizer.bos_token_id
config.eos_token_id = tokenizer.eos_token_id
config.pad_token_id = tokenizer.pad_token_id

# Seed before the weights are randomly initialised; seeding only when the trainer is
# created would leave the initial weights different on every run.
set_seed(0)

# Initialize model with custom configuration
model = AutoModelForCausalLM.from_config(config)

# Print model size for reference.
# NOTE(review): the earlier "around 31M parameters" figure was noted while the
# feed-forward width setting was not taking effect - re-check the expected count.
num_params = sum(p.numel() for p in model.parameters())
print(f"Number of model parameters: {num_params}")

# Load training and validation datasets (one example per line of each text file)
dataset = load_dataset('text', data_files = {'train': 'data/train.txt', 'validation': 'data/dev.txt'})
print(dataset)

In [None]:
# Fix the random seed so repeated runs of this cell behave the same
set_seed(0)

# Every training hyperparameter lives in a single SFTConfig object
training_args = SFTConfig(
    # --- Data processing ---
    remove_unused_columns=True,
    label_names=["labels"],
    dataset_num_proc=12,  # Worker processes used while preprocessing the dataset
    packing=True,  # Concatenate several short texts into one training example
    eval_packing=True,  # Same packing for the validation split
    max_length=64,  # Maximum sequence length in tokens
    dataset_text_field="text",

    # --- Training strategy ---
    eval_strategy="steps",
    per_device_train_batch_size=64,  # Sequences per optimizer step
    gradient_accumulation_steps=1,  # Batches summed before each update; raises the effective batch size without extra memory
    warmup_ratio=0.05,  # First 5% of steps ramp the learning rate from 0 to 2e-4
    num_train_epochs=10,
    learning_rate=2e-4,

    # --- Optimization and precision (CPU-only settings) ---
    use_cpu=True,  # Keep training on the CPU
    fp16=False,  # Half precision stays off for this CPU run
    bf16=False,  # bfloat16 stays off as well
    optim="adamw_torch",  # Plain PyTorch AdamW rather than an 8-bit optimizer
    weight_decay=0.01,
    lr_scheduler_type="cosine",  # Cosine decay after warmup
    max_grad_norm=1,  # Clip gradients to this norm

    # --- Logging, evaluation and checkpoints ---
    logging_steps=10,
    eval_steps=100,
    save_steps=100,
    eval_accumulation_steps=1,  # Hand evaluation outputs off after every batch instead of accumulating them
    include_for_metrics=[],  # compute_metrics receives only predictions and labels

    # --- Reproducibility and output ---
    seed=0,
    # output_dir = "",  # Uncomment to specify output directory
    report_to="none",  # No wandb/tensorboard reporting
)

# Wire the model, tokenizer, data and metric hooks into the trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,  # Tokenizer used to turn the text field into tokens
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()