In [None]:
!pip install -qqqU sagemaker wandb datasets transformers

In [None]:
WANDB_PROJECT = "aws_llm_workshop"
USE_S3 = True

RAW_TRAIN_DATASET_ARTIFACT = 'capecape/wandbot/run-m6nz6yrl-wandbot_questions:v0'
RAW_EVAL_DATASET_ARTIFACT  = "wandbot/wandbot-eval/run-kinbxic4-responses:v0"

In [None]:
MODEL_NAME = "codellama/CodeLlama-7b-Instruct-hf"

# Preparing Data
How to prepare our dataset for fine-tuning the model.

## SageMaker auth

In [None]:
import sagemaker
import boto3
# A first session is created only to look up the default bucket name.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs. SageMaker creates it
# automatically if it does not exist yet. Set a name here to override the default.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    # no explicit bucket given -> fall back to the session's default bucket
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; if get_execution_role() raises ValueError,
# look up the role named "sagemaker_execution_role" through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Session actually used by the rest of the notebook, bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


## Formatting the data for the LLM

A big part of training LLMs lives in getting the data formatted correctly!

In [None]:
import wandb
import pandas as pd

let's create a run and monitor our work from there

In [None]:
# Open a W&B run dedicated to the text-formatting step.
wandb.init(job_type="text_formatting", project=WANDB_PROJECT)

# Declaring the raw dataset as an input artifact of this run gives us traceability.
dataset_artifact = wandb.use_artifact(RAW_TRAIN_DATASET_ARTIFACT)
# Fetch the "wandbot_questions" entry stored inside the artifact.
table = dataset_artifact.get("wandbot_questions")

this is a W&B table, so we can convert it to whatever format we may need

In [None]:
# Build the frame from the W&B table, drop rows with missing values and add
# `context_len`: the character count of `page_content` divided by 3.6
# (presumably a rough characters-per-token ratio -- TODO confirm).
df = (
    pd.DataFrame(table.data, columns=table.columns)
    .dropna()
    .assign(context_len=lambda frame: frame.page_content.str.len() / 3.6)
)
df.head()

In [None]:
len(df)

Let's prepare the training dataset now

If you use CodeLlama, the instructions need to be formatted accordingly!

In [None]:
# Delimiters of the instruction prompt: instruction markers, system-block
# markers and the end-of-sequence string.
B_INST, E_INST = "[INST] ", " [/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
EOS = "</s>"

# Training prompt template with three placeholders:
#   {page_content} - inserted at the end of the system block
#   {question}     - the user question, placed inside the instruction
#   {answer}       - the reference answer, wrapped in [W&B] ... [/W&B] and followed by EOS
_prompt_parts = [
    B_INST,
    B_SYS,
    "You are an AI assistant designed to assist developers with everyday tasks related to Weight & Biases ",
    "and provide helpful information. As an expert in the open-source python SDK wandb answer the following ",
    "question below. Answer in formatted Markdown.\n",
    "{page_content}",
    E_SYS,
    "{question}",
    E_INST,
    "\n[W&B]\n",
    "{answer}",
    "\n[/W&B]",
    EOS,
]
prompt_format = "".join(_prompt_parts)

def format_text(row):
    """Fill the `prompt_format` template with the fields of one row.

    Args:
        row: a mapping (e.g. a dataframe row) providing every placeholder
            used by `prompt_format`.

    Returns:
        The formatted prompt string.
    """
    rendered = prompt_format.format_map(row)
    return rendered

In [None]:
print(prompt_format)

In [None]:
one_example = format_text(df.iloc[0])
print(one_example)

Let's apply the format to every row of the dataframe

In [None]:
df["text"] = df.apply(format_text, axis=1)

# print(df.iloc[200]["text"])

In [None]:
df.to_json("wandb_questions_ds.jsonl", orient='records', lines=True)

## Saving your work to W&B

We should log this to W&B so we can inspect the dataset interactively using W&B Tables

In [None]:
# Log the formatted dataframe as a W&B Table.
table = wandb.Table(dataframe=df)
wandb.log({"wandb_questions_ds": table})

# Also save the dataset at this stage as an artifact; the metadata records
# the prompt template used and the number of rows.
artifact_metadata = {
    "prompt_format": prompt_format,
    "length": len(df),
}
at = wandb.Artifact(
    name="wandb_questions_ds",
    type="dataset",
    description="A wandbot dataset of questions and answers about W&B for training (non tokenized)",
    metadata=artifact_metadata,
)
at.add_file("wandb_questions_ds.jsonl")
wandb.log_artifact(at)

# Close the text-formatting run.
wandb.finish()

## Tokenizing and saving the preprocessing
We can save time during training by pre-processing the dataset and loading directly a tokenized dataset!

In [None]:
import wandb
from transformers import AutoTokenizer
from datasets import load_dataset

# Load the tokenizer that matches MODEL_NAME. `token=True` uses the locally
# stored Hugging Face credentials; it replaces the deprecated `use_auth_token`
# argument, which recent transformers releases no longer accept silently.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=True)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

we can convert the data to a huggingface parquet-based dataset for fast loading

In [None]:
wandb.init(project=WANDB_PROJECT, job_type="tokenizing")
artifact = wandb.use_artifact('capecape/aws_llm_workshop/wandb_questions_ds:v0', type='dataset')
artifact_dir = artifact.download()

In [None]:
# Load the JSON-lines file with the explicit "json" builder instead of
# path=".", which makes `datasets` infer a builder from the current working
# directory and whatever else happens to live there.
train_dataset = load_dataset(
    "json",
    data_files=f"{artifact_dir}/wandb_questions_ds.jsonl",
    split="train",
)
train_dataset

one sample looks like this 👇

In [None]:
train_dataset = train_dataset.select_columns(["text"])

### Packing and chunking

We define some helper functions to tokenize our samples and pack them into sequences of a given length.

In [None]:
from random import randint
from itertools import chain
from functools import partial


# template dataset to terminate each sample with the EOS token
def template_dataset(sample):
    """Make sure a sample's text ends with the tokenizer's EOS token.

    The previous version interpolated the whole `sample` mapping into the
    f-string, so the stored text became the dict repr (``{'text': ...}``)
    instead of the prompt itself.

    Args:
        sample: one dataset row, a mapping with a ``"text"`` entry.

    Returns:
        The same mapping, with ``sample["text"]`` terminated by
        ``tokenizer.eos_token``. The token is appended only when the text
        does not already end with it, so already-terminated prompts are not
        given a second EOS.
    """
    eos = tokenizer.eos_token
    text = sample["text"]
    if eos and not text.endswith(eos):
        text = f"{text}{eos}"
    sample["text"] = text
    return sample


# apply prompt template per sample
train_dataset = train_dataset.map(template_dataset)
# print random sample; randint() includes both bounds, so the upper bound has
# to be len - 1 to stay inside the dataset (len itself raises IndexError)
print(train_dataset[randint(0, len(train_dataset) - 1)]["text"])

# empty list to save remainder from batches to use in next batch
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

In [None]:
def chunk(sample, chunk_length=1024):
    """Pack a batch of tokenized samples into chunks of `chunk_length` tokens.

    All sequences of each column are concatenated (after the tokens carried
    over from the previous call) and cut into consecutive chunks of exactly
    `chunk_length` items. Tokens that do not fill a whole chunk are stored in
    the module-level `remainder` dict and prepended on the next call.

    Args:
        sample: a batch mapping column name -> list of token lists; it must
            contain an ``"input_ids"`` column.
        chunk_length: number of tokens per output chunk.

    Returns:
        A dict with the same columns, each a list of chunks, plus a
        ``"labels"`` column that is a (shallow) copy of ``"input_ids"``.
        When the batch plus carry-over is shorter than `chunk_length`, every
        column is an empty list and all tokens are carried over.
    """
    # the carry-over lives in a module-level variable shared between calls
    global remainder
    # Concatenate all texts and add remainder from previous batch; columns
    # without a carry-over entry simply start from an empty list.
    concatenated_examples = {
        k: remainder.get(k, []) + list(chain.from_iterable(sample[k])) for k in sample.keys()
    }
    # total number of tokens available, measured on the first column
    first_key = next(iter(concatenated_examples))
    batch_total_length = len(concatenated_examples[first_key])

    # Largest multiple of chunk_length that fits. Computed unconditionally:
    # it is 0 for a short batch, which previously left the name unbound and
    # raised UnboundLocalError below.
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split by chunks of chunk_length.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # keep the leftover tokens for the next batch
    remainder = {k: t[batch_chunk_length:] for k, t in concatenated_examples.items()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result


# `chunk` carries leftover tokens between batches in the global `remainder`.
# Reset it here so that re-running this cell does not prepend tokens left
# over from a previous run to the first chunk.
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

# tokenize and chunk dataset: first tokenize the "text" column (dropping the
# original columns), then pack the token lists into 1024-token chunks
lm_dataset = train_dataset.map(
    lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(train_dataset.features)
).map(
    partial(chunk, chunk_length=1024),
    batched=True,
)

# Print total number of samples
print(f"Total number of samples: {len(lm_dataset)}")

## Save to a bucket and W&B

We are now going to use the W&B Artifacts integration with S3 buckets, so our dataset is close to the SageMaker training instance

In [None]:
# Destination of the tokenized training set: the session's S3 bucket when
# USE_S3 is set, otherwise a local folder.
training_input_path = (
    f's3://{sess.default_bucket()}/processed/wandbot/train' if USE_S3 else "./wandbot_train_ds"
)

# Write the tokenized and chunked dataset to that destination.
lm_dataset.save_to_disk(training_input_path)

print("uploaded data to:")
print(f"training dataset to: {training_input_path}")

In [None]:
at = wandb.Artifact(
    name="wandbot_dataset_tokenized", 
    type="dataset",
    description="A wandbot dataset of questions and answers about W&B - CodeLLama tokenized",
    metadata={"model_name": MODEL_NAME, "tokenizer": MODEL_NAME},
)

In [None]:
if USE_S3:
    at.add_reference(training_input_path)
else:
    at.add_dir(training_input_path)
wandb.log_artifact(at)

Let's finish this run

In [None]:
wandb.finish()

# Eval Dataset
We prepared a set of questions from `wandbot` that were gathered and curated by my colleague Ayush T. God's work here!

In [None]:
import json
import wandb
import pandas as pd

# Start the evaluation-preprocessing run and declare the raw eval artifact as its input.
wandb.init(job_type="eval_preprocessing", project=WANDB_PROJECT)
question_artifacts = wandb.use_artifact(RAW_EVAL_DATASET_ARTIFACT)

# The artifact file is JSON with a "columns" list and a "data" list of rows.
with open(question_artifacts.file()) as f:
    payload = json.load(f)

columns, data = payload["columns"], payload["data"]
eval_df = pd.DataFrame(data, columns=columns)

In [None]:
eval_df.head()

Let's remove retrieved Japanese text

### Clean up and prepare (pandas workout)

In [None]:
def contains_japanese(text):
    """Return True if `text` contains at least one Japanese character.

    Checks hiragana, katakana (full- and half-width) and CJK unified
    ideographs. The previous version only looked at the ideograph range, so
    text written purely in kana was not detected.

    Note: the ideograph range is shared with Chinese, so Chinese text is
    reported as Japanese as well (this was already the case before).

    Args:
        text: the string to inspect.

    Returns:
        bool: True when any character falls in one of the ranges above.
    """
    return any(
        "\u3040" <= char <= "\u30ff"  # hiragana + katakana blocks
        or "\u4e00" <= char <= "\u9fff"  # CJK unified ideographs (kanji)
        or "\uff66" <= char <= "\uff9f"  # half-width katakana
        for char in text
    )

In [None]:
s = "## WandbCallback 引数\n\n| 引数 | 使用法 |"

In [None]:
eval_df["retrieved_context_en"] = [[ctx for ctx in ctxs if not contains_japanese(ctx)] for ctxs in eval_df.retrieved_context.values]

In [None]:
eval_df.head()

Just keep the first page results, to save memory...

In [None]:
# Keep only the first remaining context of each row. Rows whose contexts were
# all filtered out get an empty string instead of raising IndexError on ctxs[0].
eval_df["retrieved_context_first"] = [ctxs[0] if ctxs else "" for ctxs in eval_df.retrieved_context_en.values]

In [None]:
eval_df.head()

In [None]:
eval_df = eval_df.assign(tokens = eval_df['retrieved_context_first'].str.len())

In [None]:
eval_df.head()

In [None]:
eval_df = eval_df[["query", "generated_response", "retrieved_context_first", "tokens"]]

In [None]:
eval_df.columns = ["question", "answer", "retrieved_context", "char_len"]

In [None]:
eval_df = eval_df.rename({"retrieved_context": "page_content"}, axis=1)
eval_df.head()

## Save to W&B

Let's format the dataset in the same way we created the training dataset; we have to be consistent with naming
- We remove the answer, but we are going to keep it on the dataset to evaluate the model

In [None]:
# Delimiters of the instruction prompt: instruction markers, system-block
# markers and the end-of-sequence string.
B_INST, E_INST = "[INST] ", " [/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
EOS = "</s>"

# Evaluation prompt template. It stops right after the opening "[W&B]" tag:
# the answer, the closing tag and EOS are deliberately left out so the model
# has to generate them.
eval_prompt_format = (
    B_INST
    + B_SYS
    # The trailing spaces on the next two lines matter: without them the
    # pieces are glued together ("Biasesand", "followingquestion").
    + "You are an AI assistant designed to assist developers with everyday tasks related to Weight & Biases "
    + "and provide helpful information. As an expert in the open-source python SDK wandb answer the following "
    + "question below. Answer in formatted Markdown.\n"
    + "{page_content}"
    + E_SYS
    + "{question}"
    + E_INST
    + "\n[W&B]\n"
)

def eval_format_text(row):
    """Fill the `eval_prompt_format` template with the fields of one row.

    Args:
        row: a mapping (e.g. a dataframe row) providing every placeholder
            used by `eval_prompt_format`.

    Returns:
        The formatted evaluation prompt string.
    """
    rendered = eval_prompt_format.format_map(row)
    return rendered

In [None]:
eval_df["text"] = eval_df.apply(eval_format_text, axis=1)

Save to disk and create HF dataset

In [None]:
from datasets import load_dataset

In [None]:
# Dump the evaluation frame as JSON lines and load it back as a Hugging Face
# dataset. The explicit "json" builder is used instead of ".", which makes
# `datasets` infer a builder from the current working directory. No split is
# requested, so the result is a DatasetDict with a single "train" split.
eval_df.to_json("wandbot_eval.jsonl", orient='records', lines=True)
eval_dataset = load_dataset("json", data_files="wandbot_eval.jsonl")
eval_dataset

In [None]:
# Destination of the evaluation dataset: the session's S3 bucket when USE_S3
# is set, otherwise a local folder.
eval_input_path = (
    f's3://{sess.default_bucket()}/processed/wandbot/eval' if USE_S3 else "./wandbot_eval_ds"
)

In [None]:
eval_dataset.save_to_disk(eval_input_path)

In [None]:
table = wandb.Table(dataframe=eval_df)
wandb.log({"wandbot_eval_dataset": table})

In [None]:
# Artifact describing the evaluation dataset. "length" is taken from
# `eval_df`: `eval_dataset` was loaded without a split, so it is a
# DatasetDict and len() on it gives the number of splits (1), not the number
# of examples.
at = wandb.Artifact(
    name="wandbot_eval_dataset", 
    type="dataset",
    description="A wandbot dataset of questions and answers about W&B for evaluation",
    metadata={"prompt_format": eval_prompt_format,
              "length": len(eval_df),
             })

In [None]:
if USE_S3:
    at.add_reference(eval_input_path)
else:
    at.add_dir(eval_input_path)
wandb.log_artifact(at)

In [None]:
wandb.finish()