# 1. Large Datasets and Where to Find Them

In [9]:
from transformers import pipeline
from datasets import load_dataset

from tqdm import tqdm

In [2]:
generation_gpt = pipeline("text-generation", model="openai-gpt")

Some weights of OpenAIGPTLMHeadModel were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from transformers import AutoTokenizer

In [4]:
python_code = r"""def say_hello():
      print("Hello, World!")
# Print it
say_hello()
"""

In [5]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [6]:
tokenizer(python_code).tokens()

['def',
 'Ġsay',
 '_',
 'hello',
 '():',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġprint',
 '("',
 'Hello',
 ',',
 'ĠWorld',
 '!"',
 ')',
 'Ċ',
 '#',
 'ĠPrint',
 'Ġit',
 'Ċ',
 'say',
 '_',
 'hello',
 '()',
 'Ċ']

# 2. Building a Tokenizer

In [1]:
# Two sample characters: a plain ASCII letter and the euro sign.
a = "a"
e = "€"

In [3]:
byte = ord(a.encode("utf-8"))

In [4]:
byte

97

In [5]:
# Iterating over a bytes object already yields integers, so the UTF-8 byte
# values can be collected directly without an ord(chr(...)) round-trip.
byte = list(e.encode("utf-8"))

In [6]:
byte

[226, 130, 172]

In [7]:
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

In [10]:
byte_to_unicode_map = bytes_to_unicode()

In [12]:
unicode_to_byte_map = dict((v, k) for k, v in byte_to_unicode_map.items())

In [7]:
#unicode_to_byte_map

In [14]:
base_vocab = list(unicode_to_byte_map.keys())

In [21]:
#base_vocab

In [30]:
tokens = sorted(tokenizer.vocab.items(), key=lambda x: x[1])

In [31]:
# Decode the first ten vocabulary entries (sorted by id) back to plain text.
# convert_tokens_to_string expects a list of tokens, so each token is wrapped
# in a list; the result is a list of strings rather than one-element sets.
[tokenizer.convert_tokens_to_string([t]) for t, _ in tokens[:10]]

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*']

In [33]:
#tokens

### Training a New Tokenizer

In [6]:
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

# bytes_to_unicode() supplies a byte -> unicode-character mapping; invert it
# and use the unicode characters as the base alphabet for the new tokenizer.
byte_to_unicode_map = bytes_to_unicode()
unicode_to_byte_map = {char: byte for byte, char in byte_to_unicode_map.items()}
base_vocab = [*unicode_to_byte_map]

In [7]:
from tqdm.auto import tqdm
from datasets import load_dataset

In [8]:
length = 100000

In [11]:
dataset_name = "transformersbook/codeparrot-train"

In [10]:
dataset = load_dataset(dataset_name, split="train", streaming=True)

NameError: name 'dataset_name' is not defined

In [11]:
iter_dataset = iter(dataset)

In [12]:
def batch_iterator(batch_size=10):
    """Yield lists of text strings pulled from the streaming dataset.

    Reads examples from the module-level ``iter_dataset`` iterator and yields
    their ``'content'`` fields in lists of ``batch_size`` items. One list is
    produced per step of ``range(0, length, batch_size)``, wrapped in a tqdm
    progress bar.

    Args:
        batch_size: Number of examples to place in each yielded list.

    Yields:
        list[str]: The ``'content'`` values of the next examples. If the
        stream runs out, the last list may be shorter and iteration then
        stops instead of raising.
    """
    from itertools import islice

    for _ in tqdm(range(0, length, batch_size)):
        # islice ends quietly when the stream is exhausted. Calling next()
        # directly would leak StopIteration into this generator, which Python
        # converts into a RuntimeError (PEP 479).
        batch = [example['content'] for example in islice(iter_dataset, batch_size)]
        if not batch:
            return
        yield batch

In [13]:
batch_iterator()

<generator object batch_iterator at 0x7f0075b79350>

In [None]:
new_tokenizer = tokenizer.train_new_from_iterator(
    batch_iterator(),
    vocab_size=12500,
    initial_alphabet=base_vocab
)

  0%|          | 0/10000 [00:00<?, ?it/s]

### Training a Model from Scratch

In [1]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

In [2]:
model_ckpt = "gpt2-xl"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [4]:
# Reuse model_ckpt so the config is always loaded from the same checkpoint as
# the tokenizer; the vocabulary size is taken from the tokenizer.
config = AutoConfig.from_pretrained(model_ckpt, vocab_size=len(tokenizer))

In [5]:
config

GPT2Config {
  "_name_or_path": "gpt2-xl",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1600,
  "n_head": 25,
  "n_inner": null,
  "n_layer": 48,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.20.1",
  "use_cache": true,
  "vocab_size": 50257
}

In [6]:
model = AutoModelForCausalLM.from_config(config)

In [7]:
examples, total_characters, total_tokens = 500, 0, 0

In [12]:
dataset = load_dataset(
    "transformersbook/codeparrot-train",
    split="train",
    streaming=True
)

Using custom data configuration transformersbook--codeparrot-train-39fd2cee2b2cb397


In [13]:
# Start from zero so that re-running this cell does not double-count.
total_characters, total_tokens = 0, 0
for _, example in tqdm(zip(range(examples), iter(dataset)), total=examples):
    content = example["content"]
    total_characters += len(content)
    # Count the ids directly; the token strings themselves are not needed.
    total_tokens += len(tokenizer(content)["input_ids"])

  0%|          | 1/500 [00:00<02:20,  3.54it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1310 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 500/500 [00:06<00:00, 77.02it/s] 


In [14]:
total_characters

5568371

In [15]:
total_tokens

2527740

In [17]:
import torch
from torch.utils.data import IterableDataset

In [None]:
class ConstantLengthDataset(IterableDataset):
    """Iterable dataset that yields fixed-length tensors of token ids.

    Texts are read from ``dataset`` into a character buffer, tokenized in one
    call, joined with the tokenizer's EOS id as separator, and cut into
    chunks of exactly ``seq_length`` ids. A trailing chunk shorter than
    ``seq_length`` is dropped. Iteration makes a single pass over ``dataset``.

    Args:
        tokenizer: Callable returning a mapping with an ``'input_ids'`` list
            per input text; must expose ``eos_token_id``.
        dataset: Iterable of mappings with a ``'content'`` text field.
        seq_length: Number of token ids in every yielded tensor.
        num_of_sequences: Number of sequences' worth of text to buffer before
            tokenizing.
        chars_per_token: Estimated characters per token, used to size the
            character buffer.
    """

    def __init__(
        self, tokenizer, dataset, seq_length=1024,
        num_of_sequences=1024, chars_per_token=3.6
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        # Character budget of one buffer fill.
        self.input_characters = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True

        while more_examples:
            buffer, buffer_len = [], 0

            while True:
                if buffer_len >= self.input_characters:
                    m = f"Buffer full: {buffer_len}>={self.input_characters:.0f}"
                    print(m)
                    break
                try:
                    buffer.append(next(iterator)["content"])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    # Dataset exhausted: flush what is buffered, then stop.
                    more_examples = False
                    break

            if not buffer:
                break

            all_token_ids = []
            for ids in self.tokenizer(buffer, truncation=False)["input_ids"]:
                all_token_ids.extend(ids + [self.concat_token_id])

            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                # Only full-length chunks are emitted.
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

In [18]:
class ConstantLengthDataset(IterableDataset):
    """Iterable dataset that yields fixed-length tensors of token ids.

    Texts are read from ``dataset`` into a character buffer until it holds at
    least ``seq_length * chars_per_token * num_of_sequences`` characters. The
    buffer is tokenized in one call, the per-text ids are joined with the
    tokenizer's EOS id as separator, and the result is cut into chunks of
    exactly ``seq_length`` ids. A trailing chunk shorter than ``seq_length``
    is dropped. Progress messages are printed while the buffer fills.

    Args:
        tokenizer: Callable returning a mapping with an ``'input_ids'`` list
            per input text; must expose ``eos_token_id``.
        dataset: Iterable of mappings with a ``'content'`` text field.
        seq_length: Number of token ids in every yielded tensor.
        num_of_sequences: Number of sequences' worth of text to buffer before
            tokenizing.
        chars_per_token: Estimated characters per token, used to size the
            character buffer.
        infinite: If true (the default, matching the previous behaviour), the
            dataset is restarted whenever it is exhausted. If false, iteration
            ends after one pass, flushing any partially filled buffer.

    Yields:
        torch.Tensor: 1-D tensor holding ``seq_length`` token ids.
    """

    def __init__(self, tokenizer, dataset, seq_length=1024,
                 num_of_sequences=1024, chars_per_token=3.6, infinite=True):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        # Character budget of one buffer fill.
        self.input_characters = seq_length * chars_per_token * num_of_sequences
        self.infinite = infinite

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        # Characters read since the iterator was last (re)started. A pass that
        # contributes nothing means restarting could never fill the buffer.
        chars_this_pass = 0
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.input_characters:
                    m=f"Buffer full: {buffer_len}>={self.input_characters:.0f}"
                    print(m)
                    break
                try:
                    m=f"Fill buffer: {buffer_len}<{self.input_characters:.0f}"
                    print(m)
                    buffer.append(next(iterator)["content"])
                    buffer_len += len(buffer[-1])
                    chars_this_pass += len(buffer[-1])
                except StopIteration:
                    if self.infinite and chars_this_pass > 0:
                        iterator = iter(self.dataset)
                        chars_this_pass = 0
                    else:
                        # Finite mode, or a dataset with no text at all: stop
                        # instead of spinning forever.
                        more_examples = False
                        break

            if not buffer:
                break

            all_token_ids = []
            tokenized_inputs = self.tokenizer(buffer, truncation=False)
            for tokenized_input in tokenized_inputs['input_ids']:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])

            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                # Only full-length chunks are emitted.
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

In [20]:
shuffled_dataset = dataset.shuffle(buffer_size=100)