# Prepare data for llama model

Please use the PyTorch Jupyter kernel for this demo; it works with the `pytorch1.13_python3.9` image on SageMaker Studio.

In [None]:
!pip install -U sagemaker
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets[s3]==2.8.0
!pip install sentencepiece

After updating the SageMaker Python SDK, restart the Jupyter kernel.

If you are going to use SageMaker in a local environment, you need access to an IAM role with the required permissions for SageMaker. You can find out more about it [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).



In [None]:
import sagemaker
from sagemaker import get_execution_role

import boto3

# Bootstrap session: only needed to look up the default bucket below.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs.
# Leave as None to fall back to the session's default bucket;
# SageMaker creates that bucket automatically if it does not exist yet.
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role of the current environment
# (a pre-existing role ARN can be provided here instead of creating a new role).
role = get_execution_role()
print(f"SageMaker Execution Role:{role}")

# Final session, bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Summary of the environment the rest of the notebook runs against.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


## Process dataset and upload to S3

We need to prepare a dataset to fine-tune our model. 
We define some parameters that are used throughout the whole example; feel free to adjust them to your needs.

We split processing and training into two separate steps. This allows you to run the preprocessing outside of the managed SageMaker Training job. We process (tokenize) the dataset, upload it to S3, and pass it to our managed Training job.

In [None]:
# Extract the dataset archive into the working directory.
# -o overwrites existing files without prompting, so this cell can be re-run.
!unzip -o daily-dialog.txt.zip

In [None]:
from datasets import load_dataset, load_from_disk
from transformers.models.llama.tokenization_llama import LlamaTokenizer

# Name of the pretrained tokenizer used in preprocessing
tokenizer_name = "decapoda-research/llama-7b-hf"

# S3 key prefix under which the processed data is stored
s3_prefix = "samples/datasets/1536-token-length-for-llama"

# On-disk dataset to load (presumably extracted from daily-dialog.txt.zip
# by the unzip cell -- confirm)
dataset_name = "daily-dialog.txt"
dataset = load_from_disk(dataset_name)

In [None]:
# Report how many examples each split contains (test first, then train).
test_split = dataset["test"]
train_split = dataset["train"]
print(f"Test dataset size: {len(test_split)}")
print(f"Train dataset size: {len(train_split)}")

In [None]:
# Display the first raw (not yet tokenized) example of the test split.
dataset["test"][0]

In [None]:
import transformers
transformers.logging.set_verbosity_error()

# `LlamaTokenizer` and `tokenizer_name` are defined in the dataset-loading cell;
# the name is intentionally not repeated here so the two cannot drift apart.
# download tokenizer
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name)

# Register the marker tokens that tokenize() inserts into every sample.
tokenizer.add_special_tokens({'additional_special_tokens': ['[STOP]','[SEP]']})
# Register a dedicated padding token so padding='max_length' has something to pad with.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# NOTE(review): these tokens are presumably new to the vocabulary, in which case
# the training job must resize the model's embeddings to match -- confirm.
print("pad token is {0}".format(tokenizer.pad_token))

# Fixed sequence length every sample is padded/truncated to
# (matches the "1536-token-length" part of s3_prefix).
max_length = 1536


# tokenizer helper function
def tokenize(example):
    """Tokenize one dialogue sample into a fixed-length training example.

    The "convo" text (with every "##" replaced by "[SEP]") and the "response"
    text followed by "[STOP]" are encoded as a sequence pair, then padded or
    truncated to `max_length` tokens.

    Args:
        example: a dataset row with string fields "convo" and "response".

    Returns:
        The tokenizer output with an extra "labels" entry that holds a copy
        of "input_ids".
    """
    context = example["convo"].replace("##", "[SEP]")
    target = example["response"] + "[STOP]"
    result = tokenizer(
        context,
        target,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    # Copy instead of aliasing: labels and input_ids must be independent lists,
    # otherwise a later in-place edit of one silently changes the other.
    # NOTE(review): labels still contain the padding positions; presumably the
    # training script masks them out of the loss -- confirm.
    result["labels"] = result["input_ids"].copy()
    return result


train_dataset = dataset["train"]
test_dataset = dataset["test"]
# tokenize dataset
train_dataset = train_dataset.map(tokenize, num_proc=8)
test_dataset = test_dataset.map(tokenize, num_proc=8)

In [None]:
# Display the first test example after tokenization (now includes the tokenizer outputs and "labels").
test_dataset[0]

In [None]:
# Inspect the dataset's current output format before it is switched to torch in the next cell.
test_dataset.format

In [None]:
# Return PyTorch tensors and expose only the columns listed here
torch_columns = ['input_ids', 'attention_mask', 'labels']
for split in (train_dataset, test_dataset):
    split.set_format('torch', columns=list(torch_columns))

In [None]:
# Display the first training example in the torch format set above.
train_dataset[0]

In [None]:
import boto3, os
from datasets.filesystems import S3FileSystem

# Filesystem handle that save_to_disk writes through
s3 = S3FileSystem()
client = boto3.client("s3")

# Destination URIs: <session default bucket>/<s3_prefix>/{train,test}
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'

# Write the train split first, then the test split
train_dataset.save_to_disk(training_input_path, fs=s3)
test_dataset.save_to_disk(test_input_path, fs=s3)

The dataset is now processed and uploaded to S3. The cell below shows the S3 paths to pass to the training job.

In [None]:
# Display the S3 URIs the train and test splits were saved to.
training_input_path, test_input_path