In [None]:
!pip install peft datasets

In [None]:
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig

In [None]:
# Checkpoint to finetune, and the sequence length (in tokens) that the
# preprocessing and generation cells below use as their limit.
base_model_name = "EleutherAI/pythia-1.4b"
model_max_length = 128

# Low-Rank Adaptation (LoRA) is an adapter method for parameter-efficient
# finetuning. For more details, look at the paper:
# https://arxiv.org/abs/2106.09685
peft_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # We apply LoRA to the attention matrices, as recommended by
    # the paper authors.
    target_modules=["query_key_value", "dense"],
)

args = TrainingArguments(
    output_dir="output",
    # This is the batch size of each forward pass on a single device.
    per_device_train_batch_size=16,
    # Gradient accumulation sums the gradients of n consecutive batches
    # and only applies a weight update after the n-th one. This means
    # that the effective batch size we are using is 16 * 2 = 32.
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    # The LR scheduler gradually reduces the learning rate from its
    # initial value to 0, which usually leads to better results.
    lr_scheduler_type="linear",
    # Training for 1 epoch would be ideal but for time reasons
    # we only do 400 steps.
    # Uncomment num_train_epochs and comment out max_steps to see
    # how the model performance changes with more data.
    max_steps=400,
    # num_train_epochs=1,
    # Log the training metrics at every step.
    logging_steps=1,
    # Enables mixed precision training. This performs the forward and
    # backward computations in 16-bit floating point, which leads to
    # faster training thanks to specialized hardware instructions.
    # For more information read the following paper:
    # https://arxiv.org/abs/1710.03740
    fp16=True,
)

In [None]:
# Register the BOS and PAD tokens that the preprocessing cells rely on.
# A token that is new to the vocabulary makes it grow, which is why the
# model embeddings are resized to len(tokenizer) right after loading.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name, bos_token="<|startoftext|>", pad_token="<|padding|>",
)
# `model_max_length` is the attribute the tokenizer consults; assigning to
# a plain `max_length` attribute has no effect on it. Nothing is truncated
# unless a call asks for it: records that are too long are dropped by the
# length filter further down.
tokenizer.model_max_length = model_max_length

tokenizer

In [None]:
# Load the base causal LM directly onto the GPU.
model = AutoModelForCausalLM.from_pretrained(base_model_name, device_map="cuda")
# Make the embedding matrix match the tokenizer's vocabulary size, which may
# have grown when the special tokens were registered on the tokenizer.
model.resize_token_embeddings(len(tokenizer))
# Wrap the model with the LoRA adapters described by `peft_config`.
# From here on `model` refers to the PEFT-wrapped model.
model = get_peft_model(model, peft_config)
# Report how many parameters will actually be trained.
model.print_trainable_parameters()
model

In [None]:
# The alpaca dataset is an instruction tuning dataset composed of the following features:
# * instruction: the instruction for the model
# * input: optional extra input (such as a text to summarize)
# * output: text to be generated by the model
# For more information look at: https://crfm.stanford.edu/2023/03/13/alpaca.html
# The returned object is indexed by split name in the next cell.
dataset = load_dataset("tatsu-lab/alpaca")
dataset

In [None]:
# Only the "train" split is used in this notebook. `train` is reassigned by
# each of the preprocessing cells below (tokenize -> filter -> pad).
train = dataset["train"]
train

In [None]:
def create_prompt(instruction, input=None):
  """Build the text prompt that precedes the model's answer.

  Args:
    instruction: the task description shown under "Instruction:".
    input: optional extra context (e.g. a text to summarize). When it is
      None or empty the "Input:" section is left out entirely.

  Returns:
    The prompt string, always ending with "Answer:\\n" so that the answer
    can be appended (training) or generated (inference) right after it.
  """
  # NOTE: `input` shadows the builtin; the name is kept so that existing
  # keyword callers keep working.
  if not input:
    return f"Instruction:\n{instruction}\nAnswer:\n"
  # The input goes on its own line below the "Input:" header, mirroring the
  # layout of the other sections, and is emitted exactly once.
  return f"Instruction:\n{instruction}\nInput:\n{input}\nAnswer:\n"

def tokenize_text(record):
  """Convert one dataset record into token ids plus training labels.

  The prompt (built from the stripped "instruction" and "input" fields) is
  concatenated with the stripped "output" field, tokenized, and wrapped
  with the tokenizer's BOS and EOS ids.

  Returns:
    A dict with "input_ids" and "labels". The labels are a plain copy of
    the ids; nothing is masked out at this stage.
  """
  prompt = create_prompt(record["instruction"].strip(), record["input"].strip())
  full_text = prompt + record["output"].strip()
  # Surround the tokenized text with BOS ... EOS.
  token_ids = [
      tokenizer.bos_token_id,
      *tokenizer(full_text)["input_ids"],
      tokenizer.eos_token_id,
  ]
  return {"input_ids": token_ids, "labels": list(token_ids)}

# Tokenize every record; this adds the "input_ids" and "labels" entries
# returned by tokenize_text to each record.
train = train.map(tokenize_text)
train

In [None]:
# Records longer than model_max_length could instead be truncated, keeping
# only their first tokens, but that would leave incomplete texts. Since we
# have many records, we simply discard the ones that are too long.
train = train.filter(lambda x: len(x["input_ids"]) <= model_max_length)
train

In [None]:
def pad_to_max_length(record):
  """Right-pad a tokenized record so it is exactly model_max_length long.

  "input_ids" is padded with the tokenizer's pad token id and "labels"
  with -100. The record is updated in place and also returned.
  """
  ids = record["input_ids"]
  n_pad = model_max_length - len(ids)
  padded_ids = ids + n_pad * [tokenizer.pad_token_id]
  record["input_ids"] = padded_ids
  # -100 tells the cross entropy loss to ignore these positions.
  record["labels"] = record["labels"] + n_pad * [-100]
  # Fails for records that were already longer than the limit.
  assert len(padded_ids) == model_max_length
  return record

# Pad every remaining record to the same fixed length (model_max_length).
train = train.map(pad_to_max_length)

In [None]:
# Sanity check: show the ids, the labels and the decoded tokens of the
# first preprocessed record.
for record in train.select(range(1)):
  print(record["input_ids"])
  print(record["labels"])
  # NOTE(review): batch_decode is given a flat list of ids here, so it
  # presumably decodes them one token at a time (one string per token);
  # confirm this is the intended display.
  print(tokenizer.batch_decode(record["input_ids"], skip_special_tokens=False))
  print()


In [None]:
@torch.no_grad()
def run_instruction(instruction, model):
  """Sample an answer to `instruction` from `model`.

  Args:
    instruction: the instruction text; it is wrapped with create_prompt
      (without extra input) before being tokenized.
    model: the model to generate with. Inputs are created on its device.

  Returns:
    The decoded generated text, without the prompt and without special
    tokens.
  """
  prompt = create_prompt(instruction)
  # Training sequences start with BOS (see tokenize_text), so do the same here.
  token_ids = [tokenizer.bos_token_id] + tokenizer(prompt)["input_ids"]
  # Build the batch of size 1 on whichever device the model lives on
  # instead of assuming CUDA.
  input_ids = torch.tensor(token_ids, device=model.device).unsqueeze(0)
  input_len = input_ids.shape[1]
  # The original author notes that an explicit mask is needed because of a
  # bug with PeftModel, and that a regular HF model does not require it.
  # There is no padding, so every position is attended to.
  attention_mask = torch.ones_like(input_ids)
  # Generate in eval mode so dropout (including the LoRA dropout) is off,
  # then restore the previous mode so training code is unaffected.
  was_training = model.training
  model.eval()
  try:
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=model_max_length,
        # We are sampling from the model's outputs, so try rerunning the
        # prompts to see the variation in the outputs.
        do_sample=True,
        # This removes a warning: during generation the pad_token_id is
        # replaced by eos so that generation also stops if the model
        # produces the padding token.
        pad_token_id=tokenizer.eos_token_id,
    )
  finally:
    model.train(was_training)
  # Remove the first tokens as they are the input
  output_no_prompt = output[:, input_len:]
  return tokenizer.batch_decode(output_no_prompt, skip_special_tokens=True)[0]

In [None]:
def print_instruction(instruction, model):
  """Print an instruction followed by the answer `model` generates for it.

  The header and the instruction are printed before generation starts, so
  they are visible while the answer is still being produced.
  """
  print("-" * 100, "Instruction:", instruction, sep="\n")
  answer = run_instruction(instruction, model)
  # The trailing empty string yields the blank separator line.
  print("Answer:", answer, "", sep="\n")

# Baseline generations from the (already LoRA-wrapped) model, before the
# training cell below has run.
for demo_instruction in (
    "What are important concepts in Deep Learning?",
    "What is a Large Language Model?",
    "What is the capital of Portugal?",
):
  print_instruction(demo_instruction, model)

In [None]:
# Finetune the LoRA-wrapped model on the padded records using the
# TrainingArguments defined at the top of the notebook. No data collator
# is passed, so the Trainer falls back to its default one.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train,
)
trainer.train()
# Release cached GPU memory that is no longer in use after training.
# NOTE(review): memory still referenced by `trainer` (e.g. optimizer
# state) is presumably not freed by this call; confirm if memory is tight.
torch.cuda.empty_cache()

In [None]:
# Load a fresh copy of the base model so that we can compare its outputs
# against the finetuned one.
pretrained_model = AutoModelForCausalLM.from_pretrained(base_model_name, device_map="cuda")
# Resize the embeddings the same way as for the finetuned model, so both
# models match the tokenizer's vocabulary size.
pretrained_model.resize_token_embeddings(len(tokenizer))

# The model held by the trainer after training (base weights + LoRA adapters).
finetuned_model = trainer.model

In [None]:
def compare_models(instruction):
  """Print the pretrained and the instruction-tuned answers side by side.

  Each answer is sampled with run_instruction, first from
  `pretrained_model` and then from `finetuned_model`, and printed under
  its own header.
  """
  print("-" * 100)
  print("Prompt:", instruction)
  contenders = (
      ("Pretrained Answer", pretrained_model),
      ("Instruction tuned Answer", finetuned_model),
  )
  for title, candidate in contenders:
    answer = run_instruction(instruction, candidate)
    print(title, "-" * 80)
    print(answer)
  print()

# Same prompts as the baseline run, now answered by both models.
for comparison_instruction in (
    "What are important concepts in Deep Learning?",
    "What is a Large Language Model?",
    "What is the capital of Portugal?",
):
  compare_models(comparison_instruction)