In [1]:
!pip install transformers  --upgrade --quiet
!pip install datasets --upgrade --quiet

In [2]:
# Log in to the Hugging Face Hub from the CLI. The token value is a placeholder:
# substitute it at run time and never commit a real token into the notebook.
!huggingface-cli login --token <token>

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/jupyter/.cache/huggingface/token
Login successful


In [3]:
from datasets import load_dataset
from random import randrange

# Fetch the train split of Dolly 15k from the Hugging Face Hub
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

num_rows = len(dataset)
print(f"dataset size: {num_rows}")
# Show one randomly picked record as a quick sanity check
print(dataset[randrange(num_rows)])
# dataset size: 15011

Found cached dataset json (/home/jupyter/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-1a24287182230a5f/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


dataset size: 15011
{'instruction': "What's it like to live in the belly of a whale?", 'context': '', 'response': 'The giant whale that can swallow a human (Blue Whale) doesn\'t have teeth, so the experience was "painless." However, you would never understand how dark the darkness could be until you\'ve been in the belly of a whale. Also, talk about being seasick. Everything inside sloshes around and smells terribly as it\'s slowly being digested. This only adds insult to injury when it comes to feeling nauseous.', 'category': 'creative_writing'}


In [4]:
def format_dolly(sample):
    """Render one Dolly record as an instruction-tuning prompt.

    Args:
        sample: mapping with "instruction" and "response" keys and an optional
            "context" key.

    Returns:
        A string made of "### Instruction", an optional "### Context" and
        "### Answer" sections, separated by blank lines. The context section
        is omitted when the context is empty, ``None`` or absent.
    """
    instruction = f"### Instruction\n{sample['instruction']}"
    # .get + truthiness: an empty, None or missing context all mean "no context section"
    context_text = sample.get("context")
    context = f"### Context\n{context_text}" if context_text else None
    response = f"### Answer\n{sample['response']}"
    # join all the parts together, skipping the section that was left out
    prompt = "\n\n".join(part for part in (instruction, context, response) if part is not None)
    return prompt

In [5]:
from random import randrange

# Render one randomly selected record with the prompt template
sample_index = randrange(len(dataset))
print(format_dolly(dataset[sample_index]))

### Instruction
What channel did the series begin on?

### Context
The series began on Discovery Health Channel on November 10, 2009. Season 1 concluded on December 29, 2009, after 6 episodes. Season 2 ran from July 20, 2010, to October 19, 2010, with 8 episodes. Season 3 ran from September 1 to 29, 2011, with 6 episodes.

### Answer
The series began on Discover Health Channel on November 10, 2009.


In [6]:
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-13b-hf" # sharded weights
# `token=True` reuses the token stored by `huggingface-cli login`;
# it replaces the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=True)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [7]:
from random import randint
from itertools import chain
from functools import partial


# template dataset to add prompt to each sample
def template_dataset(sample):
    """Add a "text" field holding the formatted prompt followed by the tokenizer's EOS token."""
    prompt = format_dolly(sample)
    sample["text"] = "{}{}".format(prompt, tokenizer.eos_token)
    return sample


# apply prompt template per sample
# NOTE: this rebinds `dataset` and drops the original columns, so re-running the cell
# without reloading the dataset fails (format_dolly can no longer find 'instruction').
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
# print random sample
# randint includes both endpoints, so the last valid index is len(dataset) - 1
print(dataset[randint(0, len(dataset) - 1)]["text"])

# carry-over buffer: tokens left over after chunking one batch are prepended to the next batch
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

def chunk(sample, chunk_length=2048):
    """Pack a batch of tokenized examples into fixed-length blocks.

    The per-example lists in every column of ``sample`` are concatenated (after
    the tokens carried over in the module-level ``remainder``) and cut into
    consecutive blocks of exactly ``chunk_length`` items. Items that do not fill
    a complete block are stored back into ``remainder`` for the next call.

    Args:
        sample: batched mapping of column name -> list of per-example lists;
            must contain "input_ids".
        chunk_length: number of items per emitted block.

    Returns:
        dict with the same columns as ``sample`` plus "labels" (a copy of the
        "input_ids" blocks). Every column is an empty list when fewer than
        ``chunk_length`` items are available.

    Note:
        Relies on mutable global state, so batches must be processed
        sequentially in one process. Whatever is left in ``remainder`` after the
        last batch is not emitted by this function.
    """
    # define global remainder variable to save remainder from batches to use in next batch
    global remainder
    # Concatenate all texts and add remainder from previous batch
    # (.get tolerates columns that the carry-over buffer has not seen yet)
    concatenated_examples = {
        k: remainder.get(k, []) + list(chain.from_iterable(sample[k])) for k in sample.keys()
    }
    # get total number of tokens for batch
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # largest multiple of chunk_length that fits; 0 when the batch is too short,
    # in which case no block is emitted and everything is carried over
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split by chunks of max_len.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder to global variable for next batch
    remainder = {k: t[batch_chunk_length:] for k, t in concatenated_examples.items()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result


# Step 1: tokenize every prompt, keeping only the tokenizer outputs
tokenized_dataset = dataset.map(
    lambda sample: tokenizer(sample["text"]),
    batched=True,
    remove_columns=list(dataset.features),
)
# Step 2: pack the token streams into fixed-length blocks of 2048 tokens
lm_dataset = tokenized_dataset.map(partial(chunk, chunk_length=2048), batched=True)

# Report how many packed samples were produced
print(f"Total number of samples: {len(lm_dataset)}")

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-1a24287182230a5f/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-01769950d43b8f6e.arrow
Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-1a24287182230a5f/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-3b078dbcc4c7ee06.arrow
Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-1a24287182230a5f/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-1b17ed058cf85374.arrow


### Instruction
Give me a list of five  healthy snacks for kids

### Answer
Peanut butter, cheese, milk, fruit and popcorn</s>
Total number of samples: 1581


In [12]:
# Write the packed dataset to local disk so it can be copied to GCS
lm_dataset.save_to_disk("/home/jupyter/gcs/data")

Saving the dataset (0/1 shards):   0%|          | 0/1581 [00:00<?, ? examples/s]

In [15]:
# Recursively copy the saved dataset directory into the GCS bucket
!gsutil cp -r /home/jupyter/gcs/data gs://llama-bucket-syd

Copying file:///home/jupyter/gcs/data/data-00000-of-00001.arrow [Content-Type=application/octet-stream]...
Copying file:///home/jupyter/gcs/data/state.json [Content-Type=application/json]...
Copying file:///home/jupyter/gcs/data/dataset_info.json [Content-Type=application/json]...
- [3 files][ 43.3 MiB/ 43.3 MiB]                                                
Operation completed over 3 objects/43.3 MiB.                                     


In [16]:
# List the bucket's top-level contents to confirm the upload
!gsutil ls gs://llama-bucket-syd

gs://llama-bucket-syd/data/
gs://llama-bucket-syd/gcs/
