In [3]:
!pip install --no-deps --quiet s3fs==2023.12.2
!pip install --quiet transformers==4.41.1 peft

In [4]:
###########################
## Init the aws client
###########################
import os
import sys
import sagemaker
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::859967598519:role/service-role/AmazonSageMaker-ExecutionRole-20240523T230081
sagemaker bucket: sagemaker-us-east-1-859967598519
sagemaker session region: us-east-1


In [5]:
###########################
## Load Dolly dataset
###########################

from datasets import load_dataset
from random import randrange

dataset = load_dataset('databricks/databricks-dolly-15k', split='train')
dataset = dataset.select(range(1500))
print(f'Dataset Dolly size: {len(dataset)}')
print(dataset[0])

Dataset Dolly size: 1500
{'instruction': 'When did Virgin Australia start operating?', 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}


In [6]:
###########################
## Function to format the dataset to Mistral format
###########################

def format_dolly(sample):
    instruction = f"### Instruction\n{sample['instruction']}"
    context = f"### Context\n{sample['context']}" if len(sample["context"]) > 0 else None
    response = f"### Answer\n{sample['response']}"
    # join all the parts together
    prompt = "\n\n".join([i for i in [instruction, context, response] if i is not None])
    return prompt

In [7]:
from transformers import AutoTokenizer
import config
 
model_id = "mistralai/Mistral-7B-v0.1"
my_huggingface_token = config.my_huggingface_token;
if my_huggingface_token == 'YOUR_HUGGING_FACE_TOKEN':
    sys.exit('You need to provide HuggingFace token in config.py file')
tokenizer = AutoTokenizer.from_pretrained(model_id, token=my_huggingface_token)

In [8]:
###########################
## Format the dataset to Mistral format
###########################

# add utils method to path for loading dataset
sys.path.append("scripts")
from pack_dataset import pack_dataset


# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_dolly(sample)}{tokenizer.eos_token}"
    return sample

# apply prompt template per sample
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))

# tokenize dataset
dataset = dataset.map(
    lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)
)

# chunk dataset
lm_dataset = pack_dataset(dataset, chunk_length=2048) # We use 2048 as the maximum length for packing

# Print total number of samples
print(f"Total number of samples after packing: {len(lm_dataset)}")

Chunking dataset into chunks of 2048 tokens.
Total number of samples: 149
Total number of samples after packing: 149


In [10]:
# save train_dataset to s3
training_input_path = f's3://{sess.default_bucket()}/processed/mistral/dolly/train'
lm_dataset.save_to_disk(training_input_path)

print("uploaded data to:")
print(f"training dataset to: {training_input_path}")

Saving the dataset (0/1 shards):   0%|          | 0/149 [00:00<?, ? examples/s]

uploaded data to:
training dataset to: s3://sagemaker-us-east-1-859967598519/processed/mistral/dolly/train


In [15]:
deepspeed_parameters = {
  "deepspeed": "./configs/mistral_z3_config_bf16.json", # deepspeed config file
  "training_script": "./scripts/run_qlora.py" # real training script, not entrypoint
}

# hyperparameters, which are passed into the training job
training_hyperparameters ={
  'model_id': model_id,                             # pre-trained model
  'dataset_path': '/opt/ml/input/data/training',    # path where sagemaker will save training dataset
  'num_train_epochs': 3,                            # number of training epochs
  'per_device_train_batch_size': 6,                 # batch size for training  REQUIRED REVIEW
  'per_device_eval_batch_size': 8,                  # REQUIRED REVIEW
  #'gradient_accumulation_steps': 8,                 # Number of updates steps to accumulate
  'gradient_checkpointing': True,                   # save memory but slower backward pass
  'bf16': True,                                     # use bfloat16 precision
  'tf32': True,                                     # use tf32 precision
  'learning_rate': 2e-4,                            # learning rate
  'max_grad_norm': 0.3,                             # Maximum norm (for gradient clipping)
  'warmup_ratio': 0.03,                             # warmup ratio
  "lr_scheduler_type":"constant",                   # learning rate scheduler
  "logging_steps": 10,                              # log every x steps
  'merge_adapters': False,                          # wether to merge LoRA into the model (needs more memory)
  'use_flash_attn': True,                           # Whether to use Flash Attention
  'save_strategy': "epoch",                         # save strategy for checkpoints
  'save_total_limit': 3,
  'output_dir': '/opt/ml/checkpoints',              # output directory, where to save assets during training
                                                    # could be used for checkpointing. The final trained
                                                    # model will always be saved to s3 at the end of training
  'hf_token': my_huggingface_token,
}

from sagemaker.huggingface import HuggingFace

job_name = f'mistral-deepspeed-qlora-{training_hyperparameters["model_id"].replace("/","-").replace(".","-")}'

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'ds_launcher.py',    # train script
    source_dir           = '.',      # directory which includes all the files needed for training
    instance_type        = 'ml.g5.4xlarge',   # instances type used for the training job
    instance_count       = 2,                 # the number of instances used for training
    max_run              = 3600,        # maximum runtime in seconds (days * hours * minutes * seconds)
    base_job_name        = job_name,          # the name of the training job
    role                 = role,              # Iam role used in training job to access AWS ressources, e.g. S3
    volume_size          = 300,               # the size of the EBS volume in GB
    transformers_version = '4.26',            # the transformers version used in the training job
    pytorch_version      = '1.13',             # the pytorch_version version used in the training job
    py_version           = 'py39',           # the python version used in the training job
    hyperparameters      =  {
        **training_hyperparameters,  # the hyperparameters passed to the training job
        **deepspeed_parameters,
    },
    environment          = { "HUGGINGFACE_HUB_CACHE": "/tmp/.cache" }, # set env variable to cache models in /tmp
    disable_output_compression = True,        # not compress output to save training time and cost

    ## Spot instance
    #use_spot_instances   = True,
    #max_wait             = 5400,
    checkpoint_s3_uri = f's3://{sess.default_bucket()}/{job_name}/checkpoints'
)


# We need to have ssh key files generated. It is required for ssh passwordless login
def gen_ssh_key():
    if not os.path.isfile('scripts/id_rsa'):
        print('Generating SSH key files for passwordless remote on cluster')
        os.system("ssh-keygen -f scripts/id_rsa -t rsa -N ''")


gen_ssh_key()

In [16]:
# define a data input dictonary with our uploaded s3 uris
data = {'training': training_input_path}

# starting the train job with our uploaded datasets as input
huggingface_estimator.fit(data, wait=True)
print(huggingface_estimator.model_data["S3DataSource"]["S3Uri"])

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: mistral-deepspeed-qlora-mistralai-Mistr-2024-06-08-17-59-21-175


2024-06-08 17:59:22 Starting - Starting the training job...
2024-06-08 17:59:22 Pending - Training job waiting for capacity...
2024-06-08 18:00:16 Pending - Preparing the instances for training...
2024-06-08 18:00:51 Downloading - Downloading input data.........
2024-06-08 18:01:56 Downloading - Downloading the training image.........
2024-06-08 18:03:43 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-06-08 18:04:09,221 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-06-08 18:04:09,243 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-06-08 18:04:09,254 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-06-08 18:04:09,257 sagemaker_pytorch_container.training INFO     

In [11]:
#######################
## Deploy the trained model 
##     which is already stored in S3
#######################

from sagemaker.huggingface import get_huggingface_llm_image_uri
 
# retrieve the llm image uri
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="1.1.0",
  session=sess,
)
 
# print ecr image uri
print(f"llm image uri: {llm_image}")

import json
from sagemaker.huggingface import HuggingFaceModel
 
# s3 path where the model will be uploaded
# if you try to deploy the model to a different time add the s3 path here
model_s3_path = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]
 
# sagemaker config
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1
health_check_timeout = 300
 
# Define Model and Endpoint configuration parameter
config = {
  'HF_MODEL_ID': "/opt/ml/model", # path to where sagemaker stores the model
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024), # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(8192), # Max length of the generation (including input text)
}
 
# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  model_data={'S3DataSource':{'S3Uri': model_s3_path,'S3DataType': 'S3Prefix','CompressionType': 'None'}},
  env=config
)

# Deploy model to an endpoint
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # 10 minutes to be able to load the model
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py39
INFO:sagemaker.image_uris:Defaulting to only supported image scope: gpu.


llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04


INFO:sagemaker:Creating model with name: huggingface-pytorch-tgi-inference-2024-05-28-09-09-25-791
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-tgi-inference-2024-05-28-09-09-26-509
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-tgi-inference-2024-05-28-09-09-26-509


----------!

In [22]:
question = {'inputs': 'compare neural network and svm',
           "parameters": { 
                "max_new_tokens": 2048,                
                "early_stopping": True
}}
answer = llm.predict(question)
print(answer)

[{'generated_text': '\n\n# compare neural network and svm\n\n- start\n- 1\n- 2\n- 3\n- 4\n- 5\n- 6\n- 7\n- 8\n- 9\n- 10\n- 11\n- 12\n- 13\n- 14\n- 15\n- 16\n- 17\n- 18\n- 19\n- 20\n- 21\n- 22\n- 23\n- 24\n- 25\n- 26\n- 27\n- 28\n- 29\n- 30\n- 31\n- 32\n- 33\n- 34\n- 35\n- 36\n- 37\n- 38\n- 39\n- 40\n- 41\n- 42\n- 43\n- 44\n- 45\n- 46\n- 47\n- 48\n- 49\n- 50\n- 51\n- 52\n- 53\n- 54\n- 55\n- 56\n- 57\n- 58\n- 59\n- 60\n- 61\n- 62\n- 63\n- 64\n- 65\n- 66\n- 67\n- 68\n- 69\n- 70\n- 71\n- 72\n- 73\n- 74\n- 75\n- 76\n- 77\n- 78\n- 79\n- 80\n- 81\n- 82\n- 83\n- 84\n- 85\n- 86\n- 87\n- 88\n- 89\n- 90\n- 91\n- 92\n- 93\n- 94\n- 95\n- 96\n- 97\n- 98\n- 99\n- 100\n- 101\n- 102\n- 103\n- 104\n- 105\n- 106\n- 107\n- 108\n- 109\n- 110\n- 111\n- 112\n- 113\n- 114\n- 115\n- 116\n- 117\n- 118\n- 119\n- 120\n- 121\n- 122\n- 123\n- 124\n- 125\n- 126\n- 127\n- 128\n- 129\n- 130\n- 131\n- 132\n- 133\n- 134\n- 135\n- 136\n- 137\n- 138\n- 139\n- 140\n- 141\n- 142\n- 143\n- 144\n- 145\n- 146\n- 147\n- 148\n- 

In [23]:
llm.delete_model()
llm.delete_endpoint()

INFO:sagemaker:Deleting model with name: huggingface-pytorch-tgi-inference-2024-05-28-09-09-25-791
INFO:sagemaker:Deleting endpoint configuration with name: huggingface-pytorch-tgi-inference-2024-05-28-09-09-26-509
INFO:sagemaker:Deleting endpoint with name: huggingface-pytorch-tgi-inference-2024-05-28-09-09-26-509
