In [3]:
def prompt_no_input(row):
    """Build the instruction-only prompt for ``row``.

    ``row`` is a mapping that provides an ``instruction`` entry. The
    returned text ends with the ``### Response:`` header so the answer can
    be appended directly after it.
    """
    template = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response:\n"
    )
    return template.format_map(row)

In [4]:
def prompt_input(row):
    """Build the prompt for a ``row`` that carries extra input context.

    ``row`` is a mapping that provides ``instruction`` and ``input``
    entries. The returned text ends with the ``### Response:`` header so the
    answer can be appended directly after it.
    """
    template = (
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response:\n"
    )
    return template.format_map(row)

In [5]:
def create_prompt(row):
    """Pick the prompt template that matches ``row``.

    A record whose ``input`` entry is exactly the empty string gets the
    instruction-only template; every other record gets the template that
    also includes the input.
    """
    if row["input"] == "":
        return prompt_no_input(row)
    return prompt_input(row)

## Why are we doing all this?

Let's load back the artifact we uploaded

In [37]:
import json
from wandb import Api

# Look up the dataset artifact (pinned to version v4) through the wandb Api
# and download its files; `dataset_dir` is what download() returns and is
# used below as the directory holding the .jsonl files.
api = Api()
artifact = api.artifact('capecape/alpaca_ft/alpaca_gpt4_splitted:v4', type='dataset')
dataset_dir = artifact.download()

def load_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # default encoding, which differs between systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. an extra newline at the end of the file) carry no
        # record, so skip them instead of failing to parse them.
        return [json.loads(line) for line in file if line.strip()]
    
# Load the train and eval splits from the downloaded artifact directory.
train_dataset = load_jsonl(f"{dataset_dir}/alpaca_gpt4_train.jsonl")
eval_dataset = load_jsonl(f"{dataset_dir}/alpaca_gpt4_eval.jsonl")

[34m[1mwandb[0m:   2 of 2 files downloaded.  


We are doing all this because the dataset has to be tokenized in a very particular way if we want the model to learn to predict the output.

In [38]:
# Peek at the first raw training record.
train_dataset[0]

{'instruction': 'Develop a script that prints out the Fibonacci sequence.',
 'input': '',
 'output': 'Here is a Python script that prints out the Fibonacci sequence:\n\n```\n# number of elements in the sequence\nn = int(input("Enter the number of elements in the sequence: "))\n\n# initial two terms\na = 0\nb = 1\n\n# define a loop to generate the sequence\nif n <= 0:\n    print("Invalid input. The number of elements must be greater than 0.")\n\nelif n == 1:\n    print(a)\n\nelse:\n    print(a, b, end=" ")  # first two elements of the sequence\n    for i in range(3, n+1):\n        c = a + b\n        print(c, end=" ")\n        a = b\n        b = c\n```\n\nTo use, enter the number of elements you want in the sequence when prompted. The script will print out the sequence up to the specified number of elements.'}

In [39]:
def format_dataset(dataset):
    """Turn raw records into prompt / output / full-example dicts.

    No EOS token is added here yet.

    Args:
        dataset: Iterable of records accepted by ``create_prompt`` that also
            carry an ``output`` entry.

    Returns:
        A list of dicts with the keys ``prompt``, ``output`` and ``example``,
        where ``example`` is the prompt immediately followed by the output.
    """
    formatted = []
    for row in dataset:
        # Build the prompt once and reuse it for both fields instead of
        # formatting the same template twice per record.
        prompt = create_prompt(row)
        formatted.append({
            "prompt": prompt,
            "output": row["output"],
            "example": prompt + row["output"],
        })
    return formatted
# NOTE: these lines rebind the raw datasets to their formatted versions, so
# they cannot simply be run a second time: formatted rows only have the keys
# "prompt", "output" and "example", while create_prompt reads row["input"].
train_dataset = format_dataset(train_dataset)
eval_dataset = format_dataset(eval_dataset)

In [40]:
# Inspect the first formatted training record (prompt / output / example).
train_dataset[0]

{'prompt': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nDevelop a script that prints out the Fibonacci sequence.\n\n### Response:\n',
 'output': 'Here is a Python script that prints out the Fibonacci sequence:\n\n```\n# number of elements in the sequence\nn = int(input("Enter the number of elements in the sequence: "))\n\n# initial two terms\na = 0\nb = 1\n\n# define a loop to generate the sequence\nif n <= 0:\n    print("Invalid input. The number of elements must be greater than 0.")\n\nelif n == 1:\n    print(a)\n\nelse:\n    print(a, b, end=" ")  # first two elements of the sequence\n    for i in range(3, n+1):\n        c = a + b\n        print(c, end=" ")\n        a = b\n        b = c\n```\n\nTo use, enter the number of elements you want in the sequence when prompted. The script will print out the sequence up to the specified number of elements.',
 'example': 'Below is an instruction that describes a

In [41]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [42]:
# Load the tokenizer that belongs to the checkpoint named by `model_id`.
model_id = 'meta-llama/Llama-2-7b-hf'
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token.
# NOTE(review): presumably because this tokenizer has no pad token of its
# own -- confirm against the tokenizer config.
tokenizer.pad_token = tokenizer.eos_token

### Packing

We will pack multiple short examples into a longer chunk, so we can train more efficiently!

In [49]:
max_sequence_len = 1024


def pack(dataset, max_seq_len=max_sequence_len):
    """Concatenate tokenized examples and cut them into fixed-size chunks.

    The ``"example"`` text of every item is tokenized with the module-level
    ``tokenizer``; each tokenized example is followed by one EOS token id and
    appended to a single token stream. The stream is then cut into
    non-overlapping windows of ``max_seq_len + 1`` tokens, and a trailing
    window shorter than that is dropped.

    The total number of tokens in the stream is printed as a side effect.

    Returns:
        A list of dicts. ``input_ids`` holds the first ``max_seq_len`` tokens
        of a window and ``labels`` holds its last ``max_seq_len`` tokens.
    """
    texts = [sample["example"] for sample in dataset]

    stream = []
    for ids in tokenizer(texts)["input_ids"]:
        stream.extend(ids)
        stream.append(tokenizer.eos_token_id)

    print(f"Total number of tokens: {len(stream)}")

    # One extra token per window so inputs and labels can be offset by one.
    window = max_seq_len + 1
    packed = []
    # Only start positions that leave room for a full window are visited, so
    # the incomplete tail of the stream never produces a chunk.
    for start in range(0, len(stream) - window + 1, window):
        chunk = stream[start:start + window]
        # Labels are the inputs shifted by one token; as the original note
        # says, this shift is not needed if using the model.loss.
        packed.append({"input_ids": chunk[:-1], "labels": chunk[1:]})
    return packed

The main idea here is that the instruction/output samples are short, so let's concatenate a bunch of them together, separated by the `EOS` token. We can also pre-tokenize and pre-pack the dataset to make everything faster! With `max_seq_len = 1024`, the `pack` function above does exactly this; let's apply it to both splits:

In [50]:
# Pack both splits (pack() prints the total token count of each one), then
# show how many packed training sequences were produced.
train_ds_packed = pack(train_dataset)
eval_ds_packed = pack(eval_dataset)
len(train_ds_packed)

Total number of tokens: 11486035
Total number of tokens: 230341


11205

Doing so, we end up with a little more than 11k sequences of length 1024.

In [68]:
# Keep the first two hand-packed training sequences for comparison below.
one = train_ds_packed[0]
second = train_ds_packed[1]

## TRL

In [51]:
from trl.trainer.utils import ConstantLengthDataset

In [56]:
# Build TRL's ConstantLengthDataset over the same formatted training examples,
# reading the "example" text field and asking for 1024-token sequences.
# NOTE(review): shuffle=False is presumably set so the sequences come out in
# an order comparable with pack() above -- confirm against the TRL docs.
trl_train = ConstantLengthDataset(
    tokenizer, 
    train_dataset,
    dataset_text_field="example",
    seq_length=1024,
    shuffle=False,
)

In [73]:
# Iterator over the TRL dataset; the samples below are pulled from it.
it = iter(trl_train)

In [74]:
# First two sequences yielded by the TRL dataset.
one_trl = next(it)
second_trl = next(it)

First example

In [75]:
# Last 100 characters of the decoded first hand-packed sequence.
tokenizer.decode(one["input_ids"])[-100:]

'trategies for virtual teams.\n\n### Response:\n1. Regularly scheduled meetings: Scheduling regular meet'

In [76]:
# Last 100 characters of the decoded first TRL sequence.
tokenizer.decode(one_trl["input_ids"])[-100:]

'trategies for virtual teams.\n\n### Response:\n1. Regularly scheduled meetings: Scheduling regular meet'

Second

In [84]:
# First 10 token ids and total length of the second hand-packed sequence.
second["input_ids"][0:10], len(second["input_ids"])

([29892, 2845, 491, 4863, 470, 7314, 21362, 29892, 6511, 3815], 1024)

In [79]:
# First 100 characters of the decoded second hand-packed sequence.
tokenizer.decode(second["input_ids"])[0:100]

', either by video or voice conference, allows team members to discuss ongoing projects, receive upda'

In [85]:
# First 10 token ids and total length of the second TRL sequence.
second_trl["input_ids"][0:10], len(second_trl["input_ids"])

(tensor([  886, 29892,  2845,   491,  4863,   470,  7314, 21362, 29892,  6511]),
 1024)

In [80]:
# First 100 characters of the decoded second TRL sequence.
tokenizer.decode(second_trl["input_ids"])[0:100]

'ings, either by video or voice conference, allows team members to discuss ongoing projects, receive '

In [77]:
# Last 100 characters of the decoded second hand-packed sequence.
tokenizer.decode(second["input_ids"])[-100:]

"nks to the company's social media pages, a newsletter sign-up, and any important information such as"

In [78]:
# Last 100 characters of the decoded second TRL sequence.
tokenizer.decode(second_trl["input_ids"])[-100:]

" links to the company's social media pages, a newsletter sign-up, and any important information such"