# Data downloading

In [1]:
!pip3 install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=ee50b2a17a7ad73d3deb0ea8e6d1cda569780b86954f377e5cc9dc615b625cb9
  Stored in directory: /home/ubuntu/.cache/pip/wheels/40/b3/0f/a40dbd1c6861731779f62cc4babcb234387e11d697df70ee97
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [1]:
import os
import wget

In [3]:
# Remote location of the TinyStories archive and the local path it is stored under.
url = 'https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz'
archive_path = './TinyStories_all_data.tar.gz'

# Fetch the archive only when no local copy is present yet.
archive_present = os.path.exists(archive_path)
if not archive_present:
    wget.download(url, archive_path)

In [8]:
!tar -xf TinyStories_all_data.tar.gz

In [9]:
!mkdir TinyStories_all_data

mkdir: cannot create directory ‘TinyStories_all_data’: File exists


In [13]:
# Move every entry in the working directory whose name starts with "data"
# into TinyStories_all_data (presumably the files unpacked from the archive — confirm).
! mv data* TinyStories_all_data

# Data preprocessing

In [1]:
import os
import json

# Number of JSON files (in sorted filename order) that go into the subset.
# The previous loop condition `files > 10` let 11 files through; that count is kept.
MAX_FILES = 11

selected_files = sorted(os.listdir('TinyStories_all_data'))[:MAX_FILES]

# Open with "w" instead of "a": in append mode every re-run of this cell
# added another full copy of the stories to the end of the subset file.
with open("tiny_stories_subset.txt", "w") as text_file:
    for json_file in selected_files:
        with open(f'TinyStories_all_data/{json_file}') as j:
            data = json.load(j)
        # Write one story per line.
        # NOTE(review): newlines inside a story are removed without inserting a
        # space, so the text on both sides of a line break is joined directly —
        # confirm this is intended.
        for item in data:
            text_file.write(item['story'].replace('\n', ''))
            text_file.write('\n')

# Number of files actually processed (same value the old counter ended with).
files = len(selected_files)

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
from dataset import TextDataset

## Dataset creation

In [4]:
# Both splits are built from the same text file with identical tokenizer settings;
# only the `train` flag differs between them.
shared_dataset_kwargs = dict(
    data_file='tiny_stories_subset.txt',
    vocab_size=4000,
    max_length=256,
    sp_model_prefix='bpe',
)
train_set = TextDataset(train=True, **shared_dataset_kwargs)
val_set = TextDataset(train=False, **shared_dataset_kwargs)

100%|██████████| 1100000/1100000 [00:01<00:00, 671648.92it/s]


texts read
sp encode finished


100%|██████████| 1100000/1100000 [00:01<00:00, 681076.94it/s]


texts read
sp encode finished


In [5]:
# Show the number of examples in the (train, validation) splits.
tuple(len(split) for split in (train_set, val_set))

(1045000, 55000)

## Prepared dataset saving/loading

In [6]:
# Persist both prepared splits so later sessions can skip dataset creation.
# (A disabled variant of this cell wrote to './train_set.pt' and './val_set.pt'.)
split_targets = (
    (train_set, './train_set_half.pt'),
    (val_set, './val_set_half.pt'),
)
for split_obj, split_path in split_targets:
    torch.save(split_obj, split_path)


In [3]:
# Reload the prepared splits from disk.
# (A disabled variant of this cell read './train_set.pt' and './val_set.pt'.)
#
# The files contain whole dataset objects written with torch.save, not plain
# tensors, so `weights_only=False` is passed explicitly: newer PyTorch releases
# default to weights-only loading, which rejects arbitrary pickled objects.
# NOTE: full unpickling can execute arbitrary code — only load files that were
# produced by this notebook.
train_set = torch.load('./train_set_half.pt', weights_only=False)
val_set = torch.load('./val_set_half.pt', weights_only=False)


In [4]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=0)

# Model creation

In [5]:
from transformer_model import TransformerDecoder
# Build the decoder with the configuration used for the training run, then move it to `device`.
model = TransformerDecoder(
    embed_dim=512,
    num_heads=8,
    dataset=train_set,
    feedforward_dim=2048,
    num_layers=8,
)
model = model.to(device)

# Smaller alternative configuration (disabled):
# model = TransformerDecoder(embed_dim=128, num_heads=2, dataset=train_set, feedforward_dim=256, num_layers=1).to(device)

In [6]:
# Basic smoke tests for the freshly created model.

# Forward pass: for several batch sizes the output must have shape
# (batch, max_length, vocab_size). Only the shape is inspected, so autograd is
# switched off to avoid building a graph for these throwaway passes.
with torch.no_grad():
    for bs in [1, 4, 16]:
        # Random token ids, created directly on the target device.
        indices = torch.randint(high=train_set.vocab_size, size=(bs, train_set.max_length), device=device)
        logits = model(indices)
        assert logits.shape == (bs, train_set.max_length, train_set.vocab_size)

# Generation: the result must be a string that begins with the given prefix,
# for an empty and a non-empty prefix and a randomly drawn temperature.
# NOTE(review): both prefixes are lowercase; the generation example later in
# this notebook returns its prefix lowercased, so a prefix with capital letters
# would presumably fail the startswith check — confirm.
for prefix in ['', 'who am i']:
    generated = model.inference(prefix, temp=np.random.uniform(0.1, 10))
    assert isinstance(generated, str)
    assert generated.startswith(prefix)

# Training

In [7]:
from torch import nn
from torch.utils.data import DataLoader
from train import train

In [8]:
# Adam over all model parameters with a fixed learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [9]:
# Training batches are reshuffled on every pass; validation batches keep a fixed order.
train_dataloader = DataLoader(
    train_set,
    batch_size=64,
    shuffle=True,
)
val_dataloader = DataLoader(
    val_set,
    batch_size=128,
    shuffle=False,
)

In [10]:
import wandb
# Log in to Weights & Biases for this session
# (presumably used by `train` for experiment logging — confirm in train.py).
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myuliazhelt[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
# Total number of scalar parameters in the model.
per_tensor_counts = [weights.numel() for weights in model.parameters()]
sum(per_tensor_counts)

29310880

In [18]:
# Launch training: no scheduler is passed, ten epochs are requested.
# grad_accumulate_period=6 presumably accumulates gradients over 6 batches per
# optimizer step — confirm in train.py.
train(
    model=model,
    optimizer=optimizer,
    scheduler=None,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    num_epochs=10,
    grad_accumulate_period=6,
)

Training 2/10:  32%|███▏      | 5232/16329 [29:38<1:03:01,  2.93it/s]

In [14]:
# Store only the model's state dict (not the whole model object) as a checkpoint.
checkpoint_state = model.state_dict()
torch.save(checkpoint_state, "model_ep=1.pt")

# Generation examples


In [15]:
# Generate a continuation for a fixed prompt.
# A fixed temperature replaces the earlier `np.random.uniform(0.1, 10)` draw,
# which gave the example a different, arbitrarily chosen temperature on each run.
# assumes `temp` is the sampling temperature applied to the logits — confirm in transformer_model
model.inference(prefix="The man worked as a", temp=1.0)

'the man worked as a father happyz check je realized meant deer display y whe everything buyfrzy have soft floor scatter tom hang frog shapes animalsiouscycle named sandcastle showing duckyicopter olive umbreb cour honest lions popularent spo bumpared why fixedses pan automob berding staff pe times puddle hopped separate scale their originut magaz eyekeeper cookie legsg rhinoze persistid sparkly greenves doesn sat seem fuel betsy crab things firstround clever explor palaceearedourslucy rockround hona hopped anywhereorm toysamp blouse the," berries tears heartassbled castle had mummycked max take museum cre glad mo folderately tutor lemonade jim banan needed fingersll matt advice toys rob ashamedcle nest curious take helped missingliesshion climbing monke goatfr diamondopl evenwel joe univerina to telephone food barberieked per hung noise te sound idea grocer jane were adventure stage favor delicatew diamond linema apologized miner fallenround bookshelf reliebowicopterjohn towards barn 

# Compare to GPT2-XL

In [16]:
!pip install transformers

Successfully installed fsspec-2023.12.0 huggingface-hub-0.19.4 regex-2023.10.3 safetensors-0.4.1 tokenizers-0.15.0 transformers-4.35.2


In [17]:
from transformers import pipeline, set_seed

# Reference generations from GPT2-XL for the same prompt as above.
# The seed is set after the pipeline is built and before sampling; the order is kept as is.
gpt2_xl = pipeline(task='text-generation', model='gpt2-xl')
set_seed(42)

prompt = "The man worked as a"
sampling_options = {"max_length": 10, "num_return_sequences": 5}
gpt2_xl(prompt, **sampling_options)

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The man worked as a supervisor for the company.'},
 {'generated_text': 'The man worked as a contractor and had no prior'},
 {'generated_text': 'The man worked as a construction material supplier to the'},
 {'generated_text': 'The man worked as a lab technician but quit to'},
 {'generated_text': 'The man worked as a delivery man at his local'}]