In [1]:
from datasets import load_dataset
raw_dataset = load_dataset("cnn_dailymail", "3.0.0")

In [2]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [3]:
raw_dataset['train'][0]['article'][:200]

"LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on "

In [4]:
raw_dataset['train'].to_pandas()

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a
...,...,...,...
287108,"The nine-year-old daughter of a black, unarmed...","Rumain Brisbon, 34, was killed after Phoenix p...",279a12d3ee37b8109cc192a9e88115a5a631fb06
287109,Legalising assisted suicide is a slippery slop...,"Theo Boer, a European assisted suicide watchdo...",b5bc9d404a9a5d890c9fc26550b67e6d8d83241f
287110,A group calling itself 'The Women of the 99 Pe...,Ohio congressman criticised for 'condoning the...,500862586f925e406f8b662934e1a71bbee32463
287111,Most men enjoy a good pint of lager or real al...,The Black Country Ale Tairsters have been to 1...,32a1f9e5c37a938c0c0bca1a1559247b9c4334b2


In [5]:
from datasets import DatasetDict
sampled_dataset = DatasetDict(
  {
    "train": raw_dataset['train'].select(range(50000)).shuffle(),
    "valid": raw_dataset['test'].select(range(5000)).shuffle(),
  }
)

### Tokenizer

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [7]:
def get_training_corpus(ds):
  return (
    ds[i:i+1000]['article'] for i in range(0, len(ds), 1000)
  )

training_corpus = get_training_corpus(raw_dataset['train'])

In [8]:
%%time
tokenizer = tokenizer.train_new_from_iterator(training_corpus, vocab_size=50257)




CPU times: user 18min 56s, sys: 41.3 s, total: 19min 37s
Wall time: 3min 18s


In [9]:
sample_text = "LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on"
tokenizer.tokenize(sample_text)

['LONDON',
 ',',
 'ĠEngland',
 'Ġ(',
 'Re',
 'uters',
 ')',
 'Ġ--',
 'ĠHarry',
 'ĠPotter',
 'Ġstar',
 'ĠDaniel',
 'ĠRadcliffe',
 'Ġgains',
 'Ġaccess',
 'Ġto',
 'Ġa',
 'Ġreported',
 'ĠÂ£',
 '20',
 'Ġmillion',
 'Ġ($',
 '41',
 '.',
 '1',
 'Ġmillion',
 ')',
 'Ġfortune',
 'Ġas',
 'Ġhe',
 'Ġturns',
 'Ġ18',
 'Ġon',
 'ĠMonday',
 ',',
 'Ġbut',
 'Ġhe',
 'Ġinsists',
 'Ġthe',
 'Ġmoney',
 'Ġwon',
 "'t",
 'Ġcast',
 'Ġa',
 'Ġspell',
 'Ġon']

In [10]:
tokenizer(sample_text, return_length=True)

{'input_ids': [20039, 12, 1444, 527, 10685, 8799, 9, 630, 4047, 11999, 940, 3946, 24258, 13752, 2653, 280, 259, 1243, 818, 2377, 1252, 7382, 8814, 14, 17, 1252, 9, 9072, 359, 306, 7235, 1447, 316, 1519, 12, 486, 306, 7342, 262, 1424, 1536, 571, 4562, 259, 7294, 316], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'length': [46]}

In [11]:
context_length = 128
def tokenize(batch):
  outputs = tokenizer(
    batch['article'],
    max_length=context_length,
    truncation=True,
    return_overflowing_tokens=True,
    return_length=True
  )

  input_batch = []
  for length, input_ids in zip(outputs['length'], outputs['input_ids']):
    if length == context_length:
      input_batch.append(input_ids)
  return {"input_ids": input_batch}


In [12]:
tokenized_datasets = sampled_dataset.map(tokenize, batched=True, remove_columns=raw_dataset['train'].column_names)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

### Load Model

In [13]:
from transformers import LlamaConfig

configuration = LlamaConfig()
configuration

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32000
}

In [14]:
tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.vocab_size, tokenizer.pad_token_id

(0, 0, 50257, None)

In [15]:
configuration = LlamaConfig(**{
    "bos_token_id": 0,
    "eos_token_id": 0,
    "hidden_act": "silu",
    "hidden_size": 512,
    "initializer_range": 0.02,
    "intermeidate_size": 1376,
    "max_position_embeddings": 128,
    "model_type": "llama",
    "num_attention_heads": 4,
    "num_hidden_layers": 4,
    "pad_token_id": 0,
    "rms_norm_eps": 1e-06,
    "tie_word_embeddings": False,
    "transformers_version": "4.39.3",
    "use_cache": True,
    "vocab_size": 50257,
})

In [16]:
from transformers import LlamaForCausalLM

model = LlamaForCausalLM(configuration)
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(50257, 512, padding_idx=0)
    (layers): ModuleList(
      (0-3): 4 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=11008, bias=False)
          (up_proj): Linear(in_features=512, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=512, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

In [17]:
import torch

# device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu')
# device
device = torch.device('cpu')
device

device(type='cpu')

In [18]:
prompt = "It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in "

inputs = tokenizer(prompt, return_tensors='pt')
inputs.to(device)

generate_ids = model.generate(inputs.input_ids, max_length=50)
generate_ids

tensor([[  868,   345,  1061,    26,   458,    14,    51,    14,  1497,  4149,
          1288,  2880,  8505,   280,  6284,   285,   316,  1714,   280,  1321,
          1681,  2692,   285,   221, 46405, 11166, 10083, 49483, 44649, 11964,
          7143, 10593, 15703, 30775, 30775, 30775, 30775, 16209, 20790,  3148,
         30775, 16209, 30168, 24420, 25303, 22550, 30775,  9349, 46204, 20334]])

In [19]:
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

"It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in tons ripped consistent subjective Transocean Bald pock accusing culmin Religious Religious Religious Religious namingivestock field Religious naming pillar enrollment Breast Perfect Religious flow cortex rene"

### Train Model

In [20]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator =  DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [21]:
out = data_collator([tokenized_datasets['train'][i] for i in range(3)])

for key in out:
    print(f"{key}: {out[key].shape}")

input_ids: torch.Size([3, 128])
attention_mask: torch.Size([3, 128])
labels: torch.Size([3, 128])


In [22]:
out['input_ids'][0][:20], out['attention_mask'][0][:20], out['labels'][0][:20]

(tensor([24642, 27463,   527,  1356,     9,   630,   777,  1372,  6265,  2106,
         10896,  2456,  1732,   306,   339,  3914,   262,  1496,  3575,   280]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 tensor([24642, 27463,   527,  1356,     9,   630,   777,  1372,  6265,  2106,
         10896,  2456,  1732,   306,   339,  3914,   262,  1496,  3575,   280]))

https://huggingface.co/docs/transformers/v4.19.2/en/performance#gradient-accumulation

In [36]:
from transformers import TrainingArguments

batch_size = 16
logging_steps = 1000
learning_rate = 5e-4
num_epochs = 1

args = TrainingArguments(
    output_dir='newsllama',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="steps",
    eval_steps=logging_steps,
    logging_steps=logging_steps,
    save_steps = logging_steps,
    gradient_accumulation_steps=8,
    num_train_epochs=num_epochs,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type='cosine',
    learning_rate=5e-4,
    # fp16=True,
    push_to_hub=False,
    use_cpu=True,
)

In [31]:
device

device(type='cpu')

In [37]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
    # place_model_on_device=True,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [38]:
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 