### Environment setup

In [1]:
!pip install "transformers==4.48.3" "datasets[s3]==3.3.2" "sagemaker>=2.240.0" "awscli==1.38.4" --upgrade --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/117.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.7/59.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.5/52.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.7/65.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25

In [None]:
from getpass import getpass

HF_TOKEN = getpass("HuggingFace Token:")

!huggingface-cli login --token $HF_TOKEN

In [3]:
!mkdir -p ~/.aws

In [1]:
# %%writefile ~/.aws/credentials
# [default]
# aws_access_key_id = YOUR_ACCESS_KEY
# aws_secret_access_key = YOUR_SECRET_KEY

#!aws configure

In [5]:
region_name = 'us-east-1'  # @param {type: "string"}

In [6]:
import sagemaker
import boto3

#sess = sagemaker.Session()
sess = sagemaker.Session(boto3.session.Session())
# sess = sagemaker.Session(boto3.session.Session(
#     region_name="us-east-1",
#     aws_access_key_id=aws_access_key_id,
#     aws_secret_access_key=aws_secret_access_key
#     )
# )

sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

try:
  exe_role = sagemaker.get_execution_role()
except:
  iam = boto3.client("iam")
  exe_role = iam.get_role(RoleName="sagemaker_execution_role_created")['Role']['Arn']

#sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
sess = sagemaker.Session(boto3.session.Session(), default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {exe_role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml




sagemaker role arn: arn:aws:iam::897722677063:role/sagemaker_execution_role_created
sagemaker bucket: sagemaker-us-east-1-897722677063
sagemaker session region: us-east-1


### Load dataset

In [None]:
from datasets import load_dataset
from random import randrange

# Load dataset from the hub
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

print(f"dataset size: {len(dataset)}")
print(dataset[randrange(len(dataset))])
# dataset size: 15011

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

dataset size: 15011
{'instruction': 'Write an email to the city of Berkeley requesting a refund for basketball camp because it was boring.', 'context': '', 'response': 'To Whom It May Concern, \nWe signed our son up for basketball camp over the break assuming that it would be similar to the quality of the other city programs, including the basketball programs. However, our son did not want to return to camp because it felt more like military training than the program you described on the parks and recreation web page. \n\nOn your site, you describe it as a fun opportunity for children to make friends and improve their basketball skills. The coach, however, told the children that they were not allowed to speak to each other all day, which made it impossible to either have fun to make friends. \n\nAs a result, we have removed our son from the camp and are requesting a refund.\n\nSincerely,\n[insert name]', 'category': 'creative_writing'}


In [None]:
def format_dolly(sample):
    instruction = f"### Instruction:\n{sample['instruction']}"
    context = f"### Context:\n{sample['context']}" if len(sample['context']) > 0 else None
    response  = f"### Answer:\n{sample['response']}"

    prompt = "\n\n".join([ i for i in [instruction, context, response] if i is not None])
    return prompt

In [None]:
from random import randrange
print(format_dolly(dataset[randrange(len(dataset))]))

### Instruction:
Identify which instrument is string or percussion: Atumpan, Axatse, Baryton

### Answer:
Atumpan and Axatse are percussion, Baryton is string.


In [None]:
from transformers import AutoTokenizer

# model_id = "mistralai/Mistral-7B-v0.1"
# model_id =  "NousResearch/Llama-2-7b-hf"
# model_id = "meta-llama/Llama-3.2-3B"
model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]



tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [None]:
from random import randint

def template_dataset(sample):
    sample['text'] = f"{format_dolly(sample)}{tokenizer.eos_token}"
    return sample

dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
print(dataset[randint(0, len(dataset))]["text"])

Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

### Instruction:
Air Lingus is the national airline of which country

### Answer:
Republic of Ireland or Eire<|end_of_text|>


In [None]:
print(dataset[randint(0, len(dataset))]['text'])

### Instruction:
How many parts did O.J.: Made in America have, and could a documentary with the same length win an Oscar today?

### Context:
The winners were announced during the awards ceremony on February 26, 2017. Moonlight became the first film with an all-black cast and the first LGBT-themed film to win Best Picture. In an event unprecedented in the history of the Oscars, La La Land was incorrectly announced as the Best Picture, and, a few minutes later, the error was corrected and Moonlight was declared the winner. O.J.: Made in America, at 467 minutes, became the longest film to win an Academy Award, surpassing the 431-minute long War and Peace, which won the Academy Award for Best Foreign Language Film in 1969. Following the five-part documentary's win, new academy rules barred any "multi-part or limited series" from being eligible for documentary categories. With Casey Affleck winning the Oscar for Best Actor, he and his older brother, Ben Affleck, became the 16th pair of si

In [None]:
dataset = dataset.map(lambda sample:tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features))
dataset[randint(0, len(dataset))]

Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

{'input_ids': [128000,
  14711,
  30151,
  512,
  678,
  279,
  11495,
  2035,
  304,
  15704,
  902,
  574,
  49886,
  369,
  279,
  220,
  21,
  339,
  892,
  304,
  279,
  1566,
  220,
  4364,
  15,
  1667,
  382,
  14711,
  22559,
  512,
  57475,
  4488,
  596,
  15992,
  304,
  56750,
  128001],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [None]:
dataset.select(range(20)).map(lambda x:print(x.keys()))

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

KeysView({'input_ids': [128000, 14711, 30151, 512, 4599, 1550, 11463, 8494, 1212, 10565, 1980, 14711, 9805, 512, 64797, 8494, 11, 279, 11380, 836, 315, 11463, 8494, 35230, 80092, 12604, 11, 374, 459, 13673, 6108, 33575, 13, 1102, 374, 279, 7928, 33575, 555, 26155, 1404, 311, 1005, 279, 11463, 6883, 13, 1102, 65362, 3600, 389, 220, 2148, 6287, 220, 1049, 15, 439, 11463, 8868, 11, 449, 1403, 14467, 389, 264, 3254, 6149, 13, 1102, 15187, 1766, 5196, 439, 264, 3682, 33575, 304, 8494, 596, 13018, 3157, 1306, 279, 18678, 315, 1556, 67614, 8494, 304, 6250, 220, 1049, 16, 13, 578, 33575, 706, 2533, 15042, 311, 6089, 8854, 220, 843, 9919, 304, 8494, 11, 505, 69776, 304, 47335, 11, 27535, 323, 21972, 382, 14711, 22559, 512, 64797, 8494, 65362, 3600, 389, 220, 2148, 6287, 220, 1049, 15, 439, 11463, 8868, 11, 449, 1403, 14467, 389, 264, 3254, 6149, 13, 128001], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 20
})

In [None]:
from itertools import chain

d = {1:[[2],[2]], 3:[[4],[4]], 5:[[6],[6]]}
for k in d.keys():
    print({k: list(chain(*d[k]))})

{1: [2, 2]}
{3: [4, 4]}
{5: [6, 6]}


In [None]:
from itertools import chain
from functools import partial

# empty list to save remainder from batches to use in next batch
remainder = {"input_ids":[], "attention_mask":[], "token_type_ids":[]}

def pack_dataset(dataset, chunk_length=2048):
    print(f"Chunking dataset into chunks of {chunk_length} tokens.")

    def chunk(sample, chunk_length=2048):
        # define global remainder variable to save remainder from batches to use in next batch
        global remainder
        # Concatenate all texts and add remainder from previous batch
        concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
        concatenated_examples = {k: remainder[k] + concatenated_examples[k] for k in concatenated_examples.keys()}

        # get total number of tokens for batch
        batch_total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
        print(f"batch_total_length with remainder from the previos batch:{batch_total_length}")

        # get max number of chunks for batch
        if batch_total_length > chunk_length:
            batch_total_length = (batch_total_length // chunk_length) * chunk_length
            print(f"batch_total_length after - batches of equally divisable, no leftover in the last chunk:{batch_total_length}")

        result = {
            k: [t[i:i+chunk_length] for i in range(0, batch_total_length, chunk_length)]
            for k, t in concatenated_examples.items()
        }

        # add remainder to global variable for next batch
        remainder = {k:t[batch_total_length:] for k,t in concatenated_examples.items()}
        remainder_length = len(remainder[list(remainder.keys())[0]])
        print(f"remainder of the chuck length:{remainder_length}")

        # prepare labels
        result["labels"] = result["input_ids"].copy()
        return result

    #chunk dataset
    lm_dataset = dataset.map(partial(chunk, chunk_length=2048), batched=True,)
    print(f"Total number of samples: {len(lm_dataset)}")
    return lm_dataset

In [None]:
# chunk dataset
lm_dataset = pack_dataset(dataset, chunk_length=2048) # We use 2048 as the maximum length for packing

# Print total number of samples
print(f"Total number of samples: {len(lm_dataset)}")

Chunking dataset into chunks of 2048 tokens.


Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

batch_total_length with remainder from the previos batch:180172
batch_total_length after - batches of equally divisable, no leftover in the last chunk:178176
remainder of the chuck length:1996
batch_total_length with remainder from the previos batch:190922
batch_total_length after - batches of equally divisable, no leftover in the last chunk:190464
remainder of the chuck length:458
batch_total_length with remainder from the previos batch:185240
batch_total_length after - batches of equally divisable, no leftover in the last chunk:184320
remainder of the chuck length:920
batch_total_length with remainder from the previos batch:167111
batch_total_length after - batches of equally divisable, no leftover in the last chunk:165888
remainder of the chuck length:1223
batch_total_length with remainder from the previos batch:186669
batch_total_length after - batches of equally divisable, no leftover in the last chunk:186368
remainder of the chuck length:301
batch_total_length with remainder from

In [None]:
lm_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1327
})

In [None]:
# save train_dataset to s3
training_input_path = f's3://{sess.default_bucket()}/processed/{model_id}/dolly/train'
lm_dataset.save_to_disk(training_input_path)

print("uploaded data to:")
print(f"training dataset to: {training_input_path}")

### Fine-Tune LLM with QLoRA on SageMaker

In [None]:
from huggingface_hub import HfFolder

# these hyperparameters are passed as command line arguments by sagemaker when calling training script
# The command line arguments are further converted as TrainingArguments
hyperparameters ={
  'model_id': model_id,                             # pre-trained model
  'dataset_path': '/opt/ml/input/data/training',    # path where sagemaker will save training dataset
  'num_train_epochs': 3,                            # number of training epochs
  'per_device_train_batch_size': 6,                 # batch size for training
  'gradient_accumulation_steps': 2,                 # Number of updates steps to accumulate
  'gradient_checkpointing': True,                   # save memory but slower backward pass
  'bf16': True,                                     # use bfloat16 precision
  'tf32': True,                                     # use tf32 precision
  'learning_rate': 2e-4,                            # learning rate
  'max_grad_norm': 0.3,                             # Maximum norm (for gradient clipping)
  'warmup_ratio': 0.03,                             # warmup ratio
  "lr_scheduler_type":"constant",                   # learning rate scheduler
  'save_strategy': "epoch",                         # save strategy for checkpoints
  "logging_steps": 10,                              # log every x steps
  'merge_adapters': True,                           # wether to merge LoRA into the model (needs more memory)
  'use_flash_attn': True,                           # Whether to use Flash Attention
  'output_dir': '/tmp/run',                         # output directory, where to save assets during training
                                                    # could be used for checkpointing. The final trained
                                                    # model will always be saved to s3 at the end of training
}

if HfFolder.get_token() is not None:
    hyperparameters['hf_token'] = HfFolder.get_token()

In [None]:
import torch

torch.__version__

'2.5.1+cu124'

In [None]:
from sagemaker.huggingface import HuggingFace

# define Training Job Name
job_name = f'huggingface-qlora-{hyperparameters["model_id"].replace("/","-").replace(".","-")}'

huggingface_estimator = HuggingFace(
    entry_point = "qlora_llama_example.py",
    source_dir = "scripts",
    py_version = 'py310',
    pytorch_version = "2.5.1",
    transformers_version = "4.48.3",
    hyperparameters = hyperparameters,
    environment = { "HUGGINGFACE_HUB_CACHE": "/tmp/.cache" },
    disable_output_compression = True,
    role = exe_role,
    instance_type = "ml.g5.2xlarge",
    instance_count = 1,
    max_run = 2*24*60*60,
    volume_size = 300,
    base_job_name = job_name
)

In [None]:
inputs = {'training': training_input_path}
huggingface_estimator.fit(inputs, wait=True)

Verify SageMaker has successfully uploaded the model to S3. We can use the model_data property of the estimator to get the S3 path to the model.

In [None]:
huggingface_estimator.model_data

In [None]:
huggingface_estimator.model_data["S3DataSource"]["S3Uri"].replace("s3://", "https://s3.console.aws.amazon.com/s3/buckets/")


### Deployment

Deployment example from hugging face https://huggingface.co/docs/sagemaker/en/inference

In [None]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

llm_image_uri = get_huggingface_llm_image_uri(
    "huggingface",
    #version="1.1.0",
    session=sess,
)

In [None]:
import json
from sagemaker.huggingface import HuggingFaceModel

# model s3 path
model_s3_path = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]

# sagemaker config
instance_type = "ml.g5.2xlarge"
numner_of_gpu = 1
health_check_timeout = 300

# Define Model and Endpoint configuration parameter
hub_config = {
    #"HF_MODEL_ID": "/opt/ml/model", # path to where sagemaker stored the model(in qlora_llama_example.py) or hugging face model id
    "SM_NUM_GPUS": numner_of_gpu, # Number of GPU used per replica
    "MAX_INPUT_LENGTH": 1024, # Max length of input text
    "MAX_TOTAL_TOKENS": 2048 # Max length of the generation (including input text)

}

llm_model = HuggingFaceModel(
    role=exe_role,
    image_uri=llm_image_uri,
    #model_data={'S3DataSource':{'S3Uri': model_s3_path,'S3DataType': 'S3Prefix','CompressionType': 'None'}}
    model_data=huggingface_estimator.model_data
    env=hub_config,
)

# huggingface_model = HuggingFaceModel(
#    model_data="s3://models/my-bert-model/model.tar.gz",  # path to your trained SageMaker model
#    role=role,                                            # IAM role with permissions to create an endpoint
#    transformers_version="4.26",                           # Transformers version used
#    pytorch_version="1.13",                                # PyTorch version used
#    py_version='py39',                                    # Python version used
# )

In [None]:
# Deploy model to an endpoint
llm = llm_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout # 5 minutes to be able to load the model
)

SageMaker will now create our endpoint and deploy the model to it. This can takes a 10-15 minutes.

In [8]:
# define format function for our input
def format_prompt(message, history, system_prompt):
    prompt = ""
    if system_prompt:
        prompt += f"System: {system_prompt}\n"
    for user_prompt, bot_response in history:
        prompt += f"### Instruction\n{user_prompt}\n\n"
        prompt += f"### Answer\n{bot_response}\n\n"
    prompt += f"### Instruction\n{message}\n\n### Answer\n"
    return prompt

parameters = {
    "do_sample": True,
    "top_p":0.90,
    "temperature": 0.1,
    "max_new_tokens": 1024,
    "repetition_penalty": 1.03,
    "stop": ["\nInstruction:", "<|endoftext|>", " Instruction:", "###"],
}

formatted_prompt = format_prompt("What is deep learning?", [], "You are a helpful assistant.")
payload = {"inputs": formatted_prompt, "parameters": parameters}
generated_text = llm.predict(payload)

In [None]:
print(generated_text[0]['generated_text'])

In [9]:
import boto3
import json

client = boto3.client("sagemaker-runtime")
end_point = ""
content_type = "application/json"
formatted_prompt = format_prompt("What is deep learning?", [], "You are a helpful assistant.")
payload = {"inputs": formatted_prompt, "parameters": parameters}

response = client.invoke_endpoint(
    EndpointName=end_point,
    ContentType=content_type,
    Body=json.dumps(payload)
)

generation = json.loads(response['Body'].read().decode('uft-8'))
final_response = generation[0]['generated_text']
print(final_response)