In [1]:
!pip install transformers[torch]
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
import torch
import numpy as np

from datasets import load_dataset
from transformers import BloomTokenizerFast, BloomForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [3]:
# Directory layout: every path hangs off one root folder
# (presumably a mounted Google Drive -- confirm the mount happens before this cell).
ROOT_PATH = '/content/drive/MyDrive/Colab Notebooks'
PROJ_PATH, DATA_PATH = (os.path.join(ROOT_PATH, leaf) for leaf in ('PyTorch', 'data'))
MODEL_ROOT_PATH = os.path.join(PROJ_PATH, 'model')

# Use the first CUDA device when one is visible, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"

In [4]:
# Load only the first 64 rows of the "train_asks" split of the ELI5 dataset.
eli5_loaded = load_dataset("eli5", split="train_asks[:64]")



In [5]:
# 80/20 train/test split with a fixed seed so the same rows land in each split on
# every kernel restart; flatten() turns nested fields into dotted top-level columns
# (e.g. "answers.text").
eli5 = eli5_loaded.train_test_split(test_size=0.2, seed=42).flatten()

In [6]:
# Inspect the flattened training split (column names and row count).
eli5["train"]

Dataset({
    features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'title_urls.url', 'selftext_urls.url', 'answers_urls.url'],
    num_rows: 51
})

In [7]:
# Fast tokenizer loaded from the bigscience/bloom-560m checkpoint.
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")

In [8]:
# Separator inserted between the prompt (title + selftext) and the completion (answers).
prompt_completion_split = "\n\n===\n\n"
eop_ids = tokenizer(prompt_completion_split)
print(eop_ids)
# The separator encodes to three ids (see the printed output); index 1 is the middle
# one, presumably the "===" piece between the two newline tokens -- TODO confirm.
eop_token_id = eop_ids["input_ids"][1]

{'input_ids': [603, 53048, 603], 'attention_mask': [1, 1, 1]}


In [9]:
def tokenize_data(examples):
    """Turn a batch of flattened ELI5 rows into padded causal-LM features.

    For every row the prompt is "<title> <selftext>" and the completion is all
    answer texts joined by single spaces; the two parts are glued together with
    the module-level ``prompt_completion_split`` separator.

    Args:
        examples: Batch mapping with the columns "title", "selftext" and
            "answers.text" (the latter holding a list of strings per row).

    Returns:
        The output of the module-level ``tokenizer`` (called with
        ``padding=True``) plus a "labels" entry that is a shallow copy of
        "input_ids".
    """
    texts = []
    for title, selftext, answers in zip(
        examples["title"], examples["selftext"], examples["answers.text"]
    ):
        prompt = " ".join([title, selftext])
        completion = " ".join(answers)
        texts.append(prompt + prompt_completion_split + completion)

    results = tokenizer(texts, padding=True)
    # Causal LM: the targets are the inputs themselves.
    results["labels"] = results["input_ids"].copy()
    return results

In [10]:
# Tokenize every split in batches and drop all raw columns, leaving only what
# tokenize_data returns.
eli5_processed = eli5.map(
    tokenize_data,
    batched=True,
    remove_columns=eli5["train"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
# Token count of the first training example; tokenize_data pads with padding=True,
# so this includes any padding added within the batch.
len(eli5_processed["train"][0]["input_ids"])

2028

In [12]:
# The dataset was already padded by tokenize_data using the tokenizer's own pad
# token. With mlm=False the collator rebuilds the labels from input_ids and masks
# the positions whose id equals tokenizer.pad_token_id. Unconditionally replacing
# the pad token with EOS here would make it mask EOS ids instead, leaving the real
# padding in the loss. Fall back to EOS only for a tokenizer with no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Causal-LM collator (mlm=False): no token masking, labels mirror the inputs.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [13]:
# Local directory where a copy of the base bloom-560m model is saved and reloaded from.
MODEL_PATH = os.path.join(MODEL_ROOT_PATH, 'bloom-560m')

In [14]:
# Load the base model from the local copy when one has been saved (save_pretrained
# writes a config.json into the directory). Otherwise download it from the Hub and
# save it, so a fresh "Restart & Run All" works without a manual bootstrap step.
if os.path.isfile(os.path.join(MODEL_PATH, "config.json")):
    model = BloomForCausalLM.from_pretrained(MODEL_PATH).to(device)
else:
    model = BloomForCausalLM.from_pretrained("bigscience/bloom-560m").to(device)
    model.save_pretrained(MODEL_PATH)

In [None]:
# One-time bootstrap: fetch the base checkpoint from the Hub and store it under
# MODEL_PATH. Skipped when a saved copy is already present, so running the whole
# notebook neither re-downloads the weights nor replaces the `model` that was
# loaded from MODEL_PATH.
if not os.path.isfile(os.path.join(MODEL_PATH, "config.json")):
    model = BloomForCausalLM.from_pretrained("bigscience/bloom-560m").to(device)
    model.save_pretrained(MODEL_PATH)

In [15]:
# Output directory for the fine-tuning run, next to the base model directory.
MODEL_FINETUNE_PATH = os.path.join(MODEL_ROOT_PATH, 'finetuned-bloom-560m')

training_args = TrainingArguments(
    output_dir=MODEL_FINETUNE_PATH,
    # Optimisation schedule.
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=32,
    # Run evaluation once at the end of every epoch.
    evaluation_strategy="epoch",
)

In [16]:
# Wire the model, hyper-parameters, collator and both tokenized splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=eli5_processed["train"],
    eval_dataset=eli5_processed["test"],
)

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

***** Running training *****
  Num examples = 51
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4
You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
