In [None]:
!pip install --no-deps --quiet s3fs==2023.12.2
!pip install --quiet transformers==4.41.1 peft huggingface_hub hf-transfer
!sudo apt install -y pigz

In [None]:
###########################
## Init the aws client
###########################
import os
import sys
import sagemaker
import boto3
# Bucket used for uploading data, models and logs. Leave it as None to fall back
# to the session's default bucket (SageMaker creates it when it does not exist yet).
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    # no explicit bucket name given -> use the default one
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role. When get_execution_role() raises ValueError
# (presumably when running outside of SageMaker), look the role ARN up in IAM by name.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Summary of the environment the rest of the notebook runs against.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
# huggingface_hub evaluates HF_HUB_ENABLE_HF_TRANSFER when it is first imported, and
# `transformers` imports huggingface_hub transitively. The flag therefore has to be
# set before the import below, otherwise the accelerated hf-transfer backend is not
# enabled for the base-model snapshot download later in this notebook.
# NOTE(review): if an earlier import already loads huggingface_hub, move this
# assignment to the very first code cell.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from transformers import AutoTokenizer
import config

# Base model that is fine-tuned in this notebook.
model_id = "mistralai/Mistral-7B-v0.1"

# The Hugging Face access token is kept out of the notebook, in config.py.
my_huggingface_token = config.my_huggingface_token
if my_huggingface_token == 'YOUR_HUGGING_FACE_TOKEN':
    # Still the placeholder value: stop here with a clear message.
    sys.exit('You need to provide HuggingFace token in config.py file')
tokenizer = AutoTokenizer.from_pretrained(model_id, token=my_huggingface_token)

In [None]:
from pathlib import Path
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
from huggingface_hub import snapshot_download

model_tar_dir = Path("../base_models/" + model_id)
model_tar_dir.mkdir(parents=True, exist_ok = True)
s3_model_uri = f"s3://{sess.default_bucket()}/base_models/{model_id}/model.tar.gz"
if len(list(model_tar_dir.glob("*.safetensors"))) == 0:
    print(f"Save the HuggingFace base model to {model_tar_dir}")
    snapshot_download(
        repo_id = model_id,
        local_dir = str(model_tar_dir),
        token = my_huggingface_token,
        ignore_patterns=["*.msgpack*", "*.h5%", "*.bin*"],
    )
    assert len(list(model_tar_dir.glob("*.safetensors"))) > 0, "Model download failed"

    parent_dir=os.getcwd()
    # change to model dir
    os.chdir(str(model_tar_dir))
    # use pigz for faster and parallel compression
    print(f"Compressing the model {model_id}")
    !tar -cf model.tar.gz --use-compress-program=pigz *
    # change back to parent dir
    os.chdir(parent_dir)

    print(f"Uploading the model {model_id} to S3")
    from sagemaker.s3 import S3Uploader
    s3_model_uri = S3Uploader.upload(local_path=str(model_tar_dir.joinpath("model.tar.gz")), desired_s3_uri=f"s3://{sess.default_bucket()}/base_models/{model_id}")
    print(f"model {model_id} uploaded to: {s3_model_uri}")
else:
    print(f'The base model {model_id} seems to be already downloaded')

print(f"Model is stored at {s3_model_uri}")

In [None]:
###########################
## Load Dolly dataset
###########################

from datasets import load_dataset
from random import randrange

# Load the Dolly train split and keep only its first 50 rows,
# then report the resulting size and show the first record.
dataset = load_dataset('databricks/databricks-dolly-15k', split='train').select(range(50))
print(f'Dataset Dolly size: {len(dataset)}')
print(dataset[0])

In [None]:
###########################
## Function to format the dataset to Mistral format
###########################

def format_dolly(sample):
    """Render one Dolly record as a single prompt string.

    Args:
        sample: mapping with the string fields "instruction", "context" and "response".

    Returns:
        The sections "### Instruction", "### Context" and "### Answer", each followed
        by its text on the next line and separated from one another by a blank line.
        The context section is left out when the context is empty.
    """
    sections = [f"### Instruction\n{sample['instruction']}"]
    if len(sample["context"]) > 0:
        sections.append(f"### Context\n{sample['context']}")
    sections.append(f"### Answer\n{sample['response']}")
    return "\n\n".join(sections)

In [None]:
###########################
## Format the dataset to Mistral format
###########################

# add utils method to path for loading dataset
sys.path.append("scripts")
from pack_dataset import pack_dataset


# Per-sample mapper: attaches the rendered prompt to the record.
def template_dataset(sample):
    """Store the formatted prompt, terminated by the tokenizer's EOS token, under "text".

    The record is modified in place and also returned, as expected by `Dataset.map`.
    """
    prompt = format_dolly(sample)
    sample["text"] = f"{prompt}{tokenizer.eos_token}"
    return sample


# Each step below derives a new variable from the raw `dataset` instead of
# overwriting it. The raw columns read by format_dolly therefore survive, and this
# cell can be re-run without reloading the data first (previously a second run
# failed because `dataset` had already been replaced by its tokenized form).

# apply prompt template per sample; the original columns are dropped, "text" remains
text_dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))

# tokenize dataset; "text" is dropped, only the tokenizer outputs remain
tokenized_dataset = text_dataset.map(
    lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(text_dataset.features)
)

# chunk dataset; 2048 is used as the maximum length for packing
lm_dataset = pack_dataset(tokenized_dataset, chunk_length=2048)

# Print total number of samples
print(f"Total number of samples after packing: {len(lm_dataset)}")

In [None]:
# Write the packed training set straight to the session bucket;
# the same S3 path is later handed to the training job as its input channel.
training_input_path = f's3://{sess.default_bucket()}/processed/mistral/dolly/train'
lm_dataset.save_to_disk(training_input_path)

# Report where the data ended up.
print("uploaded data to:", f"training dataset to: {training_input_path}", sep="\n")

In [None]:
# Launcher-related settings; merged into the job hyperparameters further down.
deepspeed_parameters = {
    "deepspeed": "./configs/mistral_z3_config_bf16.json",  # DeepSpeed config file
    "training_script": "./scripts/run_qlora.py",           # the actual training script (not the entry point)
}

# Hyperparameters handed to the training job.
# NOTE(review): `hf_token` travels as a plain hyperparameter; hyperparameters are
# presumably visible in the job description — consider another channel for the
# secret once the training scripts can be adapted.
training_hyperparameters = {
    "model_id": model_id,                           # pre-trained model
    "dataset_path": "/opt/ml/input/data/training",  # where SageMaker places the training dataset
    "num_train_epochs": 3,                          # number of training epochs
    "per_device_train_batch_size": 6,               # training batch size      (TODO: review)
    "per_device_eval_batch_size": 8,                # evaluation batch size    (TODO: review)
    # "gradient_accumulation_steps": 8,             # number of update steps to accumulate (disabled)
    "gradient_checkpointing": True,                 # saves memory at the price of a slower backward pass
    "bf16": True,                                   # use bfloat16 precision
    "tf32": True,                                   # use tf32 precision
    "learning_rate": 2e-4,                          # learning rate
    "max_grad_norm": 0.3,                           # maximum norm for gradient clipping
    "warmup_ratio": 0.03,                           # warmup ratio
    "lr_scheduler_type": "constant",                # learning rate scheduler
    "logging_steps": 10,                            # log every x steps
    "merge_adapters": False,                        # whether to merge LoRA into the model (needs more memory)
    "use_flash_attn": True,                         # whether to use Flash Attention
    "save_strategy": "epoch",                       # checkpoint save strategy
    "save_total_limit": 3,
    # Output directory for assets written during training; can be used for
    # checkpointing. The final trained model is always saved to S3 when training ends.
    "output_dir": "/opt/ml/checkpoints",
    "hf_token": my_huggingface_token,
    "model_data": s3_model_uri,
}

from sagemaker.huggingface import HuggingFace

# Base name of the training job: "/" and "." in the model id are replaced by "-".
job_name = f'mistral-deepspeed-qlora-{model_id.replace("/","-").replace(".","-")}'

# Create the estimator.
# NOTE(review): the container versions below (transformers 4.26 / PyTorch 1.13) are
# older than the transformers 4.41.1 installed for this notebook; presumably the
# sources in `source_dir` upgrade the libraries inside the job — confirm.
huggingface_estimator = HuggingFace(
    entry_point="ds_launcher.py",        # script started by SageMaker
    source_dir=".",                      # directory with all files needed for training
    instance_type="ml.g5.4xlarge",       # instance type used for the training job
    instance_count=2,                    # number of training instances
    max_run=3600,                        # maximum runtime in seconds
    base_job_name=job_name,              # name prefix of the training job
    role=role,                           # IAM role the job uses to access AWS resources, e.g. S3
    volume_size=300,                     # EBS volume size in GB
    transformers_version="4.26",         # transformers version of the training container
    pytorch_version="1.13",              # PyTorch version of the training container
    py_version="py39",                   # Python version of the training container
    hyperparameters={
        **training_hyperparameters,      # training settings
        **deepspeed_parameters,          # launcher settings
    },
    environment={"HUGGINGFACE_HUB_CACHE": "/tmp/.cache"},  # cache models in /tmp
    disable_output_compression=True,     # do not compress the output, saving training time and cost
    # Spot instances (disabled):
    # use_spot_instances=True,
    # max_wait=5400,
    checkpoint_s3_uri=f's3://{sess.default_bucket()}/{job_name}/checkpoints',
)


# SSH key files are required for passwordless ssh login on the training cluster.
def gen_ssh_key():
    """Create the RSA key pair ``scripts/id_rsa`` unless the private key already exists.

    The key is generated with an empty passphrase. Unlike a bare ``os.system`` call,
    a failing ``ssh-keygen`` is reported immediately instead of surfacing later as a
    broken training job.

    Raises:
        subprocess.CalledProcessError: if ``ssh-keygen`` exits with a non-zero status.
        FileNotFoundError: if ``ssh-keygen`` is not installed.
    """
    import subprocess  # local import: only this helper needs it

    key_path = 'scripts/id_rsa'
    if os.path.isfile(key_path):
        # Key already present: nothing to do, and never overwrite it.
        return
    print('Generating SSH key files for passwordless remote on cluster')
    # Argument list instead of a shell string; '' is the empty passphrase for -N.
    subprocess.run(['ssh-keygen', '-f', key_path, '-t', 'rsa', '-N', ''], check=True)


# Make sure the key pair exists under scripts/ (part of source_dir '.') before the job is started.
gen_ssh_key()

In [None]:
!cp -r ../aws_config .

# define a data input dictonary with our uploaded s3 uris
data = {'training': training_input_path}

# starting the train job with our uploaded datasets as input
huggingface_estimator.fit(data, wait=True)
print(huggingface_estimator.model_data["S3DataSource"]["S3Uri"])

!rm -rf aws_config

In [None]:
#######################
## Deploy the trained model 
##     which is already stored in S3
#######################

from sagemaker.huggingface import get_huggingface_llm_image_uri
 
# retrieve the llm image uri
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="1.1.0",
  session=sess,
)

# print ecr image uri
print(f"llm image uri: {llm_image}")

import json
from sagemaker.huggingface import HuggingFaceModel

# S3 prefix holding the (uncompressed) output of the training job.
# To deploy at a later time without re-running the training, put the S3 path here by hand.
model_s3_path = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]

# sagemaker config
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1
health_check_timeout = 300  # seconds, i.e. 5 minutes for the container to load the model

# Environment of the serving container (model and endpoint configuration).
# Deliberately not called `config`: that name is the module imported earlier for the
# Hugging Face token, and rebinding it to a dict would shadow the module for the
# rest of the session.
llm_env = {
  'HF_MODEL_ID': "/opt/ml/model", # path to where sagemaker stores the model
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024), # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(8192), # Max length of the generation (including input text)
}

# create HuggingFaceModel with the image uri; the model data is read as an
# uncompressed S3 prefix
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  model_data={'S3DataSource':{'S3Uri': model_s3_path,'S3DataType': 'S3Prefix','CompressionType': 'None'}},
  env=llm_env
)

# Deploy model to an endpoint
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # time allowed for the container to load the model
)

In [None]:
# Request payload: the prompt plus the generation parameters sent along with it.
question = {
    'inputs': 'compare neural network and svm',
    "parameters": {
        "max_new_tokens": 2048,
        "early_stopping": True,
    },
}

# Send the request to the deployed endpoint and show the raw response.
answer = llm.predict(question)
print(answer)

In [None]:
# Clean up: remove the model and the endpoint created by the deploy cell.
# NOTE(review): the model is deleted before the endpoint — presumably delete_model()
# resolves the model through the still-existing endpoint; keep this order unless verified.
llm.delete_model()
llm.delete_endpoint()