# Create a dataset for efficient gpt (pre-)training.
- Load raw text datasets.
  - https://huggingface.co/datasets/roneneldan/TinyStories
- Make a vocaburary from the train dataset.
- Transform token to id.
- Make N x K(context_size) array with token id

In [1]:
import ftfy
import spacy
import torch
import numpy as np
from tqdm import tqdm
from pathlib import Path
from utils import load_dict, save_dict
from random import randint
from spacy.symbols import ORTH
from datasets import load_dataset
from torchtext.vocab import build_vocab_from_iterator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from importlib.metadata import version

pkgs = ["ftfy", 
        "spacy", 
        "torchtext", 
        "torch",
        "datasets",
       ]

for p in pkgs:
    print(f"{p} version: {version(p)}")

ftfy version: 6.2.3
spacy version: 3.7.5
torchtext version: 0.18.0
torch version: 2.3.0
datasets version: 2.20.0


#### Hyperparameters

In [3]:
special_tokens = [
    '<unk>',  # default unknown token
    '<sot>',  # start of text token
    '<eot>',  # end of text token
]

minimum_text_length = 300 # minimum number of characters in a text
vocab_size = 30000
min_freq = 2
context_size = 128 # context window size for training.

#### Download raw texts from huggingface

In [4]:
dataset = load_dataset('roneneldan/TinyStories', None)

In [5]:
print("Number of datas")
for k, v in dataset.items():
    print(f"{k}: {len(v):,}")

Number of datas
train: 2,119,719
validation: 21,990


#### Check a data sample

In [6]:
print(dataset['train'][0])
print(dataset['train'][-1])

{'text': 'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'}
{'text': 'Once upon a time, there was an adorable little cat named Kitty. Kitty loved to polish her toy car with a soft cloth. One sunny day, she decided to take her shiny car to the park.\n\nAt the park, she met a friendly dog named'}


In [7]:
# preprocess the raw text
texts = {}
for split, items in dataset.items():
    texts[split] = []
    for item in tqdm(items):
        # remove too short texts.
        if len(item['text']) < minimum_text_length:
            continue
            
        # remove newline characters and fix texts with ftfy Lib.
        text = ftfy.fix_text(item['text'].replace('\n', '').lower())
        texts[split].append(text)

100%|█████████████████████████████████████████████████████████████████████████████████████████| 2119719/2119719 [02:26<00:00, 14453.80it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 21990/21990 [00:01<00:00, 13871.05it/s]


#### Load a tokenizer from spaCy

In [8]:
nlp = spacy.load('en_core_web_sm')
for token in special_tokens:
    nlp.tokenizer.add_special_case(token, [{ORTH: token}])
tokenizer = nlp.tokenizer

In [9]:
def yield_tokens_from_text_lst(text_lst, tokenizer):
    for text in tqdm(text_lst):
        yield [token.text for token in tokenizer(text)]

#### Build a vocab with the train dataset

In [10]:
vocab = build_vocab_from_iterator(yield_tokens_from_text_lst(texts['train'], tokenizer),
                                 specials=special_tokens,
                                 min_freq=min_freq,
                                 max_tokens=vocab_size)
vocab.set_default_index(0)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 2117893/2117893 [04:55<00:00, 7158.96it/s]


#### Check the vocab

In [11]:
tokens = [t.text for t in tokenizer(texts['train'][0].lower())]
indices = vocab.lookup_indices(tokens)

In [12]:
print(vocab.lookup_tokens(range(10)))
print(tokens[:10])
print(indices[:10])
print(vocab.lookup_tokens(indices[:10]))

['<unk>', '<sot>', '<eot>', '.', 'the', 'and', ',', 'to', 'a', 'was']
['one', 'day', ',', 'a', 'little', 'girl', 'named', 'lily', 'found', 'a']
[43, 23, 6, 8, 36, 51, 72, 24, 107, 8]
['one', 'day', ',', 'a', 'little', 'girl', 'named', 'lily', 'found', 'a']


#### save the vocab

In [13]:
torch.save(vocab, './data-store/TinyStories/vocab_size-30000/vocab.pth')

#### Make Token id list for train and validate a model.

In [14]:
token_ids = {}
for split, text_lst in texts.items():
    token_ids[split] = []
    for text in tqdm(text_lst):
        tokens = [t.text for t in tokenizer(text)]
        ids = vocab.lookup_indices(tokens)
        token_ids[split].append(ids)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 2117893/2117893 [04:46<00:00, 7389.18it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 21970/21970 [00:02<00:00, 7768.62it/s]


#### Check a token id list

In [15]:
print(token_ids['train'][1])
print(' '.join(vocab.lookup_tokens(token_ids['train'][1])))

[46, 50, 8, 37, 6, 35, 9, 8, 36, 159, 72, 1952, 3, 1952, 75, 7, 64, 269, 5, 47, 18, 4, 242, 3, 1952, 9, 8, 891, 159, 178, 10, 137, 27, 138, 2576, 3, 138, 2576, 96, 1952, 40, 5, 5490, 23, 6, 1952, 9, 1629, 18, 4, 102, 78, 10, 42, 8, 41, 147, 3, 4, 147, 27, 182, 524, 21, 45, 2127, 3, 1952, 115, 145, 4, 524, 836, 5, 49, 7, 47, 20, 55, 3, 1952, 687, 403, 4, 147, 5, 300, 4, 524, 836, 28, 53, 3, 10, 209, 5, 5024, 19, 0, 174, 20, 4, 2127, 524, 65, 23, 3, 78, 13, 9, 37, 7, 64, 112, 6, 1952, 153, 10, 458, 144, 2576, 3, 10, 63, 7, 4, 2576, 328, 5, 116, 144, 891, 2576, 3, 190, 6, 1952, 9, 378, 7, 64, 269, 5, 47, 113, 4, 252, 23, 3, 5, 1952, 227, 450, 292, 188, 3]
once upon a time , there was a little car named beep . beep loved to go fast and play in the sun . beep was a healthy car because he always had good fuel . good fuel made beep happy and strong.one day , beep was driving in the park when he saw a big tree . the tree had many leaves that were falling . beep liked how the leaves fall and wa

#### Save a token ids

In [16]:
save_dict('./data-store/TinyStories/vocab_size-30000/token_ids.pkl', token_ids)

### Make N x K(context size) array for training.

In [17]:
import numpy as np

#### Make a contiguous list

In [18]:
# make a contiguous list
contiguous_ids = {}
for split, token_id_lst in token_ids.items():
    contiguous_ids[split] = []
    for tokens in tqdm(token_id_lst):
        # add sot and eot tokens
        tokens = [1] + tokens + [2]
        
        contiguous_ids[split] += tokens

100%|████████████████████████████████████████████████████████████████████████████████████████| 2117893/2117893 [00:03<00:00, 558273.64it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 21970/21970 [00:00<00:00, 460102.45it/s]


In [19]:
for k, v in contiguous_ids.items():
    print(f"{k}: {len(v):,}")

train: 435,441,807
validation: 4,385,624


#### Reshape a contiguous list to NxK array

In [20]:
context_array = {}
for split, v in contiguous_ids.items():
    arr = np.asarray(v, np.uint16)
    length = arr.shape[0]
    N = length // context_size
    
    context_array[split] = arr[:N * context_size].reshape(N, context_size)

In [21]:
for k, v in context_array.items():
    print(f"{k}: {v.shape}")

train: (3401889, 128)
validation: (34262, 128)


In [22]:
np.save('./data-store/TinyStories/vocab_size-30000/train_context_arr.npy', context_array['train'])
np.save('./data-store/TinyStories/vocab_size-30000/valid_context_arr.npy', context_array['validation'])