<a href="https://colab.research.google.com/github/zzehli/ml-notebooks/blob/main/train_tinyllm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a causal language model from scratch (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [100]:
from IPython.display import HTML, display

def set_css(*_event_args):
  """Inject a CSS rule so that long lines in cell output wrap.

  Registered below as a ``pre_run_cell`` callback, so the style block is
  re-emitted before every cell runs.

  Args:
    *_event_args: Ignored. Accepts whatever the event system passes
      (newer IPython releases hand ``pre_run_cell`` callbacks an
      execution-info object; older ones call them with no arguments).
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [101]:
!pip install datasets transformers evaluate
# !pip install accelerate evaluate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


In [102]:
# !pip install --upgrade torch torchvision torchaudio

You will need to set up git; replace the email and name in the following cell with your own.

In [103]:
# Global git identity for this runtime.
# Replace both values with your own before running.
!git config --global user.email "jaeli_ottawa@outlook.com"
!git config --global user.name "jaeli-collab"

In [104]:
from datasets import get_dataset_split_names
# List the splits available for TinyStories before loading any data.
get_dataset_split_names("roneneldan/TinyStories")

['train', 'validation']

In [105]:
from datasets import load_dataset, DatasetDict

# Dataset id kept in one place (it was previously repeated as a placeholder-less f-string).
DATASET_NAME = "roneneldan/TinyStories"

# Load only the first 5% of each split.
train_data = load_dataset(DATASET_NAME, split="train[:5%]")
validation_data = load_dataset(DATASET_NAME, split="validation[:5%]")

# The dataset's "validation" split is stored under the key "valid" from here on.
raw_datasets = DatasetDict(
    {
        "train": train_data,
        "valid": validation_data,
    }
)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 105986
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 1100
    })
})

In [106]:
# Print every field of the first training example, truncated to 200 characters.
first_example = raw_datasets["train"][0]
for field_name, field_value in first_example.items():
    print(f"{field_name.upper()}: {field_value[:200]}")

TEXT: One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on


In [107]:
from transformers import GPT2TokenizerFast

# Maximum number of tokens per sequence; passed as max_length when tokenizing below.
context_length = 512
# Load the tokenizer published under the GPT-Neo 125M repo id; no model weights are loaded here.
tokenizer =  GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-125M")
# tokenizer.bos_token_id = tokenizer.cls_token_id
# tokenizer.eos_token_id = tokenizer.sep_token_id
# The model's vocab_size is taken from this value further down.
print("Vocab size:", tokenizer.vocab_size)

Vocab size: 50257


In [108]:
# Sanity check: show the sub-word tokens produced for the first training story.
print(tokenizer.tokenize(raw_datasets["train"][0]["text"]))

['One', 'Ġday', ',', 'Ġa', 'Ġlittle', 'Ġgirl', 'Ġnamed', 'ĠLily', 'Ġfound', 'Ġa', 'Ġneedle', 'Ġin', 'Ġher', 'Ġroom', '.', 'ĠShe', 'Ġknew', 'Ġit', 'Ġwas', 'Ġdifficult', 'Ġto', 'Ġplay', 'Ġwith', 'Ġit', 'Ġbecause', 'Ġit', 'Ġwas', 'Ġsharp', '.', 'ĠLily', 'Ġwanted', 'Ġto', 'Ġshare', 'Ġthe', 'Ġneedle', 'Ġwith', 'Ġher', 'Ġmom', ',', 'Ġso', 'Ġshe', 'Ġcould', 'Ġsew', 'Ġa', 'Ġbutton', 'Ġon', 'Ġher', 'Ġshirt', '.', 'Ċ', 'Ċ', 'L', 'ily', 'Ġwent', 'Ġto', 'Ġher', 'Ġmom', 'Ġand', 'Ġsaid', ',', 'Ġ"', 'Mom', ',', 'ĠI', 'Ġfound', 'Ġthis', 'Ġneedle', '.', 'ĠCan', 'Ġyou', 'Ġshare', 'Ġit', 'Ġwith', 'Ġme', 'Ġand', 'Ġsew', 'Ġmy', 'Ġshirt', '?"', 'ĠHer', 'Ġmom', 'Ġsmiled', 'Ġand', 'Ġsaid', ',', 'Ġ"', 'Yes', ',', 'ĠLily', ',', 'Ġwe', 'Ġcan', 'Ġshare', 'Ġthe', 'Ġneedle', 'Ġand', 'Ġfix', 'Ġyour', 'Ġshirt', '."', 'Ċ', 'Ċ', 'Together', ',', 'Ġthey', 'Ġshared', 'Ġthe', 'Ġneedle', 'Ġand', 'Ġse', 'wed', 'Ġthe', 'Ġbutton', 'Ġon', 'ĠLily', "'s", 'Ġshirt', '.', 'ĠIt', 'Ġwas', 'Ġnot', 'Ġdifficult', 'Ġfor', 'Ġthem', 'Ġbec

In [109]:
# text = raw_datasets["train"][0]["text"]
# encoding = tokenizer(text, return_tensors="pt", add_special_tokens=False)

# View token IDs
# print("Input IDs:", encoding["input_ids"][0].tolist())

# Convert back to readable tokens
# tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
# print("Tokens:", tokens)
# print(text)

In [155]:
# Tokenize a single story with overflow enabled, so text longer than
# context_length comes back as several chunks (stride=3 requests a 3-token
# overlap between consecutive chunks).
sample_story = raw_datasets["train"][2]["text"]
outputs = tokenizer(
    sample_story,
    max_length=context_length,
    truncation=True,
    stride=3,
    return_overflowing_tokens=True,
    return_length=True,
)

chunk_ids_list = outputs["input_ids"]
print(f"Input IDs length: {len(chunk_ids_list)}")
print(f"Input chunk lengths: {outputs['length']}")

# Decode each chunk so the split points can be checked by eye.
for chunk_ids in chunk_ids_list:
    print('**')
    print(tokenizer.decode(chunk_ids))

Input IDs length: 1
Input chunk lengths: [152]
**
There are many variations of passages of Lorem Ipsum available, but the majority have suffered alteration in some form, by injected humour, or randomised words which don't look even slightly believable. If you are going to use a passage of Lorem Ipsum, you need to be sure there isn't anything embarrassing hidden in the middle of text. All the Lorem Ipsum generators on the Internet tend to repeat predefined chunks as necessary, making this the first true generator on the Internet. It uses a dictionary of over 200 Latin words, combined with a handful of model sentence structures, to generate Lorem Ipsum which looks reasonable. The generated Lorem Ipsum is therefore always free from repetition, injected humour, or non-characteristic words etc.


In [157]:
# Token count for the same story tokenized in the chunking cell above.
# `text` was only ever assigned in a commented-out cell, so the story is read
# from the dataset directly to keep this cell runnable on a fresh kernel.
len(tokenizer.encode(raw_datasets["train"][2]["text"]))

152

In [111]:
def tokenize(element):
    """Tokenize a batch of stories into chunks of at most ``context_length`` tokens.

    Args:
        element: Batch mapping whose "text" entry holds the raw stories.

    Returns:
        A dict with a single "input_ids" key holding one list of token ids
        per chunk. With overflow enabled a long story can produce more than
        one chunk, so the output may have more rows than the input batch.
    """
    encoded = tokenizer(
        element["text"],
        max_length=context_length,
        truncation=True,
        return_overflowing_tokens=True,
        return_length=True,
    )
    return {"input_ids": list(encoded["input_ids"])}

# Tokenize both splits in batches across 4 worker processes. The original
# columns are dropped because `tokenize` can return more rows than it
# receives, so they would no longer line up with the output.
source_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    num_proc=4,
    remove_columns=source_columns,
)
tokenized_datasets

Map (num_proc=4):   0%|          | 0/105986 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 109074
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 1114
    })
})

In [112]:
# Show the token ids stored for the first tokenized training row.
print(tokenized_datasets['train'][0])

{'input_ids': [3198, 1110, 11, 257, 1310, 2576, 3706, 20037, 1043, 257, 17598, 287, 607, 2119, 13, 1375, 2993, 340, 373, 2408, 284, 711, 351, 340, 780, 340, 373, 7786, 13, 20037, 2227, 284, 2648, 262, 17598, 351, 607, 1995, 11, 523, 673, 714, 34249, 257, 4936, 319, 607, 10147, 13, 198, 198, 43, 813, 1816, 284, 607, 1995, 290, 531, 11, 366, 29252, 11, 314, 1043, 428, 17598, 13, 1680, 345, 2648, 340, 351, 502, 290, 34249, 616, 10147, 1701, 2332, 1995, 13541, 290, 531, 11, 366, 5297, 11, 20037, 11, 356, 460, 2648, 262, 17598, 290, 4259, 534, 10147, 526, 198, 198, 41631, 11, 484, 4888, 262, 17598, 290, 384, 19103, 262, 4936, 319, 20037, 338, 10147, 13, 632, 373, 407, 2408, 329, 606, 780, 484, 547, 7373, 290, 5742, 1123, 584, 13, 2293, 484, 5201, 11, 20037, 26280, 607, 1995, 329, 7373, 262, 17598, 290, 18682, 607, 10147, 13, 1119, 1111, 2936, 3772, 780, 484, 550, 4888, 290, 3111, 1978, 13]}


In [113]:
from transformers import GPTNeoConfig, GPTNeoForCausalLM
# Deliberately tiny architecture: 2 layers, hidden size 32, embedding table sized to the tokenizer vocabulary.
configuration = GPTNeoConfig(
    # NOTE(review): presumably the ['global', 'local'] pattern repeated once, i.e. one
    # attention type per layer, so it must stay in step with num_layers - confirm against the GPTNeoConfig docs.
    attention_types = [[['global', 'local'], 1]],
    num_layers=2,
    hidden_size=32,
    vocab_size=tokenizer.vocab_size
    )

# configuration.attention_layers = 8
# Built from the config alone (no from_pretrained call), so no checkpoint weights are loaded.
model = GPTNeoForCausalLM(configuration)

# model = GPTNeoForCausalLM.from_config(config)
# Total element count across all parameter tensors, reported in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT size: {model_size/1000**2:.1f}M parameters")
# sum(p.numel() for p in model.parameters() if p.requires_grad)

GPT size: 1.7M parameters


In [114]:
# Inspect which special tokens the tokenizer defines; the next cell sets pad_token from eos_token.
print(tokenizer.pad_token)
print(tokenizer.eos_token)

None
<|endoftext|>


In [115]:
from transformers import DataCollatorForLanguageModeling
# The tokenizer has no pad token (printed as None above), so reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects causal (next-token) language modelling rather than masked LM.
# NOTE(review): with pad == eos the collator presumably excludes every EOS position from the loss too - confirm.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [116]:
# Collate the first five tokenized rows into one batch and print the shape of
# every tensor. Each is (batch_size, padded_length); the recorded run shows a
# padded length of 212 rather than context_length.
first_five = [tokenized_datasets["train"][idx] for idx in range(5)]
out = data_collator(first_five)
for name, tensor in out.items():
    print(f"{name} shape: {tensor.shape}")

input_ids shape: torch.Size([5, 212])
attention_mask shape: torch.Size([5, 212])
labels shape: torch.Size([5, 212])


In [117]:
# Index the row first: `train_data['text']` asks for the whole text column
# just to read a single story.
train_data[0]['text']

'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'

In [83]:
# Walk through the collated batch: show each padded sequence's length, its
# sub-word tokens, and the decoded text.
for segment_ids in out["input_ids"]:
    print(f'length of this seg is {len(segment_ids)}')
    print(tokenizer.convert_ids_to_tokens(segment_ids))
    # The former sep="/n" argument was a typo for "\n" and had no effect with a single argument.
    print(tokenizer.decode(segment_ids))

length of this seg is 212
['One', 'Ġday', ',', 'Ġa', 'Ġlittle', 'Ġgirl', 'Ġnamed', 'ĠLily', 'Ġfound', 'Ġa', 'Ġneedle', 'Ġin', 'Ġher', 'Ġroom', '.', 'ĠShe', 'Ġknew', 'Ġit', 'Ġwas', 'Ġdifficult', 'Ġto', 'Ġplay', 'Ġwith', 'Ġit', 'Ġbecause', 'Ġit', 'Ġwas', 'Ġsharp', '.', 'ĠLily', 'Ġwanted', 'Ġto', 'Ġshare', 'Ġthe', 'Ġneedle', 'Ġwith', 'Ġher', 'Ġmom', ',', 'Ġso', 'Ġshe', 'Ġcould', 'Ġsew', 'Ġa', 'Ġbutton', 'Ġon', 'Ġher', 'Ġshirt', '.', 'Ċ', 'Ċ', 'L', 'ily', 'Ġwent', 'Ġto', 'Ġher', 'Ġmom', 'Ġand', 'Ġsaid', ',', 'Ġ"', 'Mom', ',', 'ĠI', 'Ġfound', 'Ġthis', 'Ġneedle', '.', 'ĠCan', 'Ġyou', 'Ġshare', 'Ġit', 'Ġwith', 'Ġme', 'Ġand', 'Ġsew', 'Ġmy', 'Ġshirt', '?"', 'ĠHer', 'Ġmom', 'Ġsmiled', 'Ġand', 'Ġsaid', ',', 'Ġ"', 'Yes', ',', 'ĠLily', ',', 'Ġwe', 'Ġcan', 'Ġshare', 'Ġthe', 'Ġneedle', 'Ġand', 'Ġfix', 'Ġyour', 'Ġshirt', '."', 'Ċ', 'Ċ', 'Together', ',', 'Ġthey', 'Ġshared', 'Ġthe', 'Ġneedle', 'Ġand', 'Ġse', 'wed', 'Ġthe', 'Ġbutton', 'Ġon', 'ĠLily', "'s", 'Ġshirt', '.', 'ĠIt', 'Ġwas', 'Ġnot', 'Ġdifficul

In [20]:
from huggingface_hub import notebook_login

# Renders a login widget; the resulting credentials are needed by trainer.push_to_hub() later.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [36]:
from transformers import Trainer, TrainingArguments

# Every tuning option below is commented out, so apart from output_dir the run
# uses the TrainingArguments defaults (the recorded TrainOutput shows 3 epochs).
args = TrainingArguments(
    # Checkpoint directory; the recorded push went to a Hub repo named 'gpt-sc'.
    output_dir="gpt-sc",
    # per_device_train_batch_size=32,
    # per_device_eval_batch_size=32,
    # eval_strategy="steps",
    # eval_steps=1_000,
    # logging_steps=1_000,
    # gradient_accumulation_steps=8,
    # num_train_epochs=1,
    # weight_decay=0.1,
    # warmup_steps=0,
    # lr_scheduler_type="linear",
    # learning_rate=5e-4,
    # save_steps=1_000,
    # fp16=True,
    # push_to_hub=True,
)

# Wire the model, collator and tokenized splits together.
# NOTE(review): `processing_class` looks like the newer name for the tokenizer argument - confirm the installed transformers version accepts it.
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

In [37]:
# Run the training loop with the arguments configured above; the recorded run took about 610 s.
trainer.train()

Step,Training Loss
500,5.3375
1000,5.0657
1500,4.8754
2000,4.6878
2500,4.4841
3000,4.3135
3500,4.1455
4000,4.0173
4500,3.8992
5000,3.8395


TrainOutput(global_step=39747, training_loss=3.2213878775073628, metrics={'train_runtime': 610.2874, 'train_samples_per_second': 520.997, 'train_steps_per_second': 65.128, 'total_flos': 1543291822080.0, 'train_loss': 3.2213878775073628, 'epoch': 3.0})

In [38]:
# Upload the trained model and training artefacts to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/6.80M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1744483280.f4bd0660c1c1.8391.1:   0%|          | 0.00/22.3k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Jae-star/gpt-sc/commit/a88ac7069c47207b11f948afd0a5ed2264b8eb6b', commit_message='End of training', commit_description='', oid='a88ac7069c47207b11f948afd0a5ed2264b8eb6b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Jae-star/gpt-sc', endpoint='https://huggingface.co', repo_type='model', repo_id='Jae-star/gpt-sc'), pr_revision=None, pr_num=None)

In [39]:
# NOTE(review): `evaluate` is already listed in the install cell at the top of the notebook, so this looks redundant on a fresh run.
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [42]:
# Run inference over the validation split; the result is decoded in the last cell.
# NOTE(review): this presumably keeps the logits for every token position in memory - confirm it fits for larger splits.
predictions = trainer.predict(tokenized_datasets["valid"])

In [44]:
import math

# Perplexity is the exponential of the evaluation loss.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")


Perplexity: 16.87


In [None]:
# torch was never imported anywhere in the notebook, so import it here.
import torch

# Most likely token at each position: argmax over the last (vocabulary) axis.
# as_tensor avoids copying the (large) logits array where possible.
# NOTE(review): positions padded during prediction may decode to junk tokens - confirm before computing text metrics.
preds = predictions.predictions
pred_ids = torch.argmax(torch.as_tensor(preds), dim=-1)

# `labels` was previously undefined; take the reference ids from the same
# prediction output. -100 marks positions ignored by the loss and is not a
# valid token id, so swap it for the pad id before decoding.
label_ids = torch.as_tensor(predictions.label_ids)
labels = label_ids.masked_fill(label_ids == -100, tokenizer.pad_token_id)

# Decode predictions and labels
decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)