In [1]:
! pip install transformers accelerate datasets tqdm



In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, get_scheduler
import os
import torch
from torch.optim import AdamW
from transformers import DataCollatorForLanguageModeling
import time

In [3]:
use_embedding = True
use_custom_attn_mask = True
num_epochs = 1

In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
block_size = 128

In [5]:
# https://huggingface.co/learn/nlp-course/en/chapter7/6?fw=pt
from torch.nn import CrossEntropyLoss
import torch

def causal_lm_loss(inputs, logits, alpha=1.0):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # Calculate average
    loss = loss_per_sample.mean()
    return loss

In [6]:
from datasets import load_dataset

def tokenize_func(examples):
    return tokenizer(examples["text"])
    
ds = load_dataset("roneneldan/TinyStories")
tokenized_ds = ds.map(tokenize_func, batched=True, num_proc=4, remove_columns=["text"])
tokenized_ds.set_format("torch")

Repo card metadata block was not found. Setting CardData to empty.


In [7]:
from torch.utils.data.dataloader import DataLoader

train_dataloader = DataLoader(tokenized_ds["train"].select(list(range(128))), batch_size=64, shuffle=True)
eval_dataloader = DataLoader(tokenized_ds["validation"].select(list(range(64))), batch_size=64)

In [8]:
weight_decay = 0.1

def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [9]:
model = AutoModelForCausalLM.from_pretrained('roneneldan/TinyStories-8M')

model.to(device)

  return self.fget.__get__(instance, owner)()


GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 256)
    (wpe): Embedding(2048, 256)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-7): 8 x GPTNeoBlock(
        (ln_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=256, out_features=256, bias=False)
            (v_proj): Linear(in_features=256, out_features=256, bias=False)
            (q_proj): Linear(in_features=256, out_features=256, bias=False)
            (out_proj): Linear(in_features=256, out_features=256, bias=True)
          )
        )
        (ln_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=256, out_features=1024, bias=True)
          (c_proj): Linear(in_feat

In [10]:
model.config

GPTNeoConfig {
  "_name_or_path": "roneneldan/TinyStories-8M",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      4
    ]
  ],
  "bos_token_id": 50256,
  "classifier_dropout": 0.1,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 16,
  "num_layers": 8,
  "resid_dropout": 0,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.42.3",
 

In [11]:
from utils import BeaconEmbedding, generate_beacon_attention_mask_2d

if use_embedding:
    beacon_embedding = BeaconEmbedding(embedding=model.get_input_embeddings(), vocab_size=model.config.vocab_size, n_embed=model.config.hidden_size, window_length=4)
    model.set_input_embeddings(beacon_embedding)

attention_mask = generate_beacon_attention_mask_2d(256, device=device)


optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = num_epochs * len(train_dataloader)

  nn.init.normal(self.b_embed)
  nn.init.normal(self.nb_embed)


In [12]:
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])

        losses.append(accelerator.gather(outputs.loss))
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()

In [13]:
from accelerate import Accelerator

accelerator = Accelerator(fp16=True, log_with="wandb")
accelerator.init_trackers(
    project_name="beacon_attention",
    config={"use_embedding": use_embedding, "use_custom_attn_mask": use_custom_attn_mask}
)

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

TypeError: Accelerator.__init__() got an unexpected keyword argument 'fp16'

In [None]:
model_name = f"{'beacon_embed' if use_embedding else 'no_beacon_embed'}_{'beacon_attn_mask' if use_custom_attn_mask else 'regular_attn_mask'}_model"
output_dir = f"./models/{model_name}"
if not(os.path.exists(output_dir)):
    os.makedirs(output_dir)

In [None]:
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
from tqdm import tqdm

gradient_accumulation_steps = 8
eval_steps = 5000

model.train()

completed_steps = 0
step_start_time = time.perf_counter()

for epoch in range(num_epochs):
    for step, batch in tqdm(enumerate(train_dataloader, start=1), total=num_training_steps):
        batch_attn_mask = batch["attention_mask"]
        if use_custom_attn_mask:
            batch_attn_mask = batch_attn_mask & attention_mask[:T, :T]

        logits = model(input_ids=batch["input_ids"], attention_mask=batch_attn_mask).logits
        loss = causal_lm_loss(batch["input_ids"], logits, keytoken_ids)

        if step % 100 == 0:
            step_end_time = time.perf_counter()
            accelerator.log({
                "samples": step * samples_per_step,
                "steps": completed_steps,
                "loss/train": loss.item() * gradient_accumulation_steps,
                "loss/step_time": step_end_time - step_start_time
            })
            step_start_time = step_end_time

        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)

        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.log(
                {"loss/eval": eval_loss, "perplexity": perplexity}
            )
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
        
        end_time = time.perf_counter()

In [None]:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)