In [1]:
from datasets import load_dataset
# Load every split of CNN/DailyMail, pinning the dataset configuration to "3.0.0".
raw_dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

In [2]:
# Show the available splits, their columns and their row counts.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [3]:
# Peek at the first 200 characters of the first training article.
raw_dataset['train'][0]['article'][:200]

"LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on "

In [4]:
# Tabular preview of the train split. Note that this converts the *entire*
# split to a DataFrame just to display it; only the head and tail rows are
# rendered in the output.
raw_dataset['train'].to_pandas()

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a
...,...,...,...
287108,"The nine-year-old daughter of a black, unarmed...","Rumain Brisbon, 34, was killed after Phoenix p...",279a12d3ee37b8109cc192a9e88115a5a631fb06
287109,Legalising assisted suicide is a slippery slop...,"Theo Boer, a European assisted suicide watchdo...",b5bc9d404a9a5d890c9fc26550b67e6d8d83241f
287110,A group calling itself 'The Women of the 99 Pe...,Ohio congressman criticised for 'condoning the...,500862586f925e406f8b662934e1a71bbee32463
287111,Most men enjoy a good pint of lager or real al...,The Black Country Ale Tairsters have been to 1...,32a1f9e5c37a938c0c0bca1a1559247b9c4334b2


In [5]:
from datasets import DatasetDict
# Build a smaller working set (50,000 train / 5,000 validation rows).
#
# * Shuffle *before* selecting: `select(range(N))` keeps rows 0..N-1 in their
#   stored order, so selecting first and shuffling afterwards would only
#   reorder the first N rows instead of drawing a random sample.
# * A fixed seed makes the sample identical across kernel restarts.
# * "valid" is drawn from the dedicated `validation` split so that the `test`
#   split stays untouched for final evaluation.
sampled_dataset = DatasetDict(
  {
    "train": raw_dataset['train'].shuffle(seed=42).select(range(50000)),
    "valid": raw_dataset['validation'].shuffle(seed=42).select(range(5000)),
  }
)

### Tokenizer

In [6]:
from transformers import AutoTokenizer

# Start from the pretrained GPT-2 tokenizer. The name `tokenizer` is rebound
# further down when a new vocabulary is trained from it.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [7]:
def get_training_corpus(ds, batch_size=1000):
  """Lazily yield the 'article' column of `ds` in consecutive batches.

  Args:
    ds: Any object supporting `len()` and slice indexing where `ds[a:b]`
      returns a mapping that has an 'article' entry.
    batch_size: Number of rows per slice (the last batch may be shorter).
      Defaults to 1000.

  Returns:
    A generator producing `ds[start:start + batch_size]['article']` for each
    batch start. Being a generator, it can be iterated only once.

  Raises:
    ValueError: If `batch_size` is not positive.
  """
  # Fail early with a clear message; range() would otherwise raise on a zero
  # step, or silently yield nothing on a negative one.
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")
  return (
    ds[start:start + batch_size]['article']
    for start in range(0, len(ds), batch_size)
  )

# `get_training_corpus` returns a generator, so `training_corpus` is lazy and
# can be consumed only once; re-run this cell before re-running any cell that
# iterates over it. Note that it is built from the full train split of
# `raw_dataset`, not from `sampled_dataset`.
training_corpus = get_training_corpus(raw_dataset['train'])

In [8]:
%%time
tokenizer = tokenizer.train_new_from_iterator(training_corpus, vocab_size=50257)




CPU times: user 58min 25s, sys: 2min 51s, total: 1h 1min 16s
Wall time: 9min 25s


In [9]:
# Hard-coded copy of the opening of the first training article (previewed
# earlier), used to inspect how the retrained tokenizer splits text.
sample_text = "LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on"
tokenizer.tokenize(sample_text)

['LONDON',
 ',',
 'ĠEngland',
 'Ġ(',
 'Re',
 'uters',
 ')',
 'Ġ--',
 'ĠHarry',
 'ĠPotter',
 'Ġstar',
 'ĠDaniel',
 'ĠRadcliffe',
 'Ġgains',
 'Ġaccess',
 'Ġto',
 'Ġa',
 'Ġreported',
 'ĠÂ£',
 '20',
 'Ġmillion',
 'Ġ($',
 '41',
 '.',
 '1',
 'Ġmillion',
 ')',
 'Ġfortune',
 'Ġas',
 'Ġhe',
 'Ġturns',
 'Ġ18',
 'Ġon',
 'ĠMonday',
 ',',
 'Ġbut',
 'Ġhe',
 'Ġinsists',
 'Ġthe',
 'Ġmoney',
 'Ġwon',
 "'t",
 'Ġcast',
 'Ġa',
 'Ġspell',
 'Ġon']

In [10]:
# Full encoding of the sample: token ids and attention mask, plus the token
# count requested via `return_length=True`.
tokenizer(sample_text, return_length=True)

{'input_ids': [20039, 12, 1444, 527, 10685, 8799, 9, 630, 4047, 11999, 940, 3946, 24258, 13752, 2653, 280, 259, 1243, 818, 2377, 1252, 7382, 8814, 14, 17, 1252, 9, 9072, 359, 306, 7235, 1447, 316, 1519, 12, 486, 306, 7342, 262, 1424, 1536, 571, 4562, 259, 7294, 316], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'length': [46]}

In [None]:
context_length = 128
def tokenize(batch):
  """Tokenize a batch of articles into fixed-size chunks of token ids.

  Calls the module-level `tokenizer` on `batch['article']` with truncation to
  `context_length` tokens and `return_overflowing_tokens=True`, then keeps
  only the entries whose reported length equals `context_length`; shorter
  entries are discarded.

  Args:
    batch: Mapping with an 'article' entry holding the texts to tokenize.

  Returns:
    A dict `{"input_ids": [...]}` containing the retained id sequences. The
    number of sequences generally differs from the number of input articles.
  """
  encoded = tokenizer(
    batch['article'],
    truncation=True,
    max_length=context_length,
    return_length=True,
    return_overflowing_tokens=True,
  )

  # Keep full-length chunks only, so every retained sequence has the same size.
  full_chunks = [
    ids
    for ids, n_tokens in zip(encoded['input_ids'], encoded['length'])
    if n_tokens == context_length
  ]
  return {"input_ids": full_chunks}
