In [None]:
!pip install bitsandbytes-cuda112
!pip install accelerate
!pip install xformers==0.0.20

In [2]:
# Running without inference acceleration
%time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan2-7B-Chat", use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("baichuan-inc/Baichuan2-7B-Chat", device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
model.generation_config = GenerationConfig.from_pretrained("baichuan-inc/Baichuan2-7B-Chat")
messages = []
messages.append({"role": "user", "content": "解释一下“温故而知新”"})
response = model.chat(tokenizer, messages)
print(response)


CPU times: user 2 µs, sys: 3 µs, total: 5 µs
Wall time: 8.34 µs


"温故而知新"是一个中国古代成语，出自《论语·为政》。它的意思是通过回顾和了解过去的事情，可以发现新的知识和道理。这个成语强调了学习和思考的重要性，鼓励人们在不断积累知识的过程中，不断地总结经验教训，从而实现自我提升和成长。


# Host the Baichuan model on Amazon SageMaker using the LMI (vLLM/DeepSpeed) container

In [1]:
%pip install sagemaker --upgrade  --quiet

Note: you may need to restart the kernel to use updated packages.


In [None]:
import boto3
import sagemaker
import json
import io
import numpy as np
from sagemaker import Model, image_uris, serializers, deserializers

role = sagemaker.get_execution_role()  # execution role for the endpoint
session = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
# Read the region through the public property instead of the private
# `_region_name` attribute, which is an implementation detail of Session.
region = session.boto_region_name  # region name of the current SageMaker Studio environment

  import scipy.sparse


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml


In [35]:
%%writefile serving.properties
# Serving configuration that the next cell packs into the model tarball.
# NOTE(review): the per-option remarks below are assumptions based on the option
# names — confirm them against the documentation of the container version used.
engine=MPI
# Same Hugging Face model id that is loaded locally earlier in this notebook.
option.model_id=baichuan-inc/Baichuan2-7B-Chat
# presumably "max" means: shard across every GPU of the instance — TODO confirm
option.tensor_parallel_degree=max
option.rolling_batch=auto
option.max_rolling_batch_size=32
# presumably expressed in seconds — TODO confirm
option.model_loading_timeout=7200
# Mirrors trust_remote_code=True used for the local load of this model.
option.trust_remote_code=True


Writing serving.properties


In [36]:
%%sh
# Package serving.properties into Baichuan2-7B-Chat-model.tar.gz.
# -e: stop at the first failing command, so a missing serving.properties can
#     never result in an archive without it being built (and later uploaded).
# -u: treat unset variables as errors.
set -eu
# Remove leftovers of a previous (possibly failed) run so that neither a stale
# staging directory nor a stale archive can leak into this build.
rm -rf Baichuan2-7B-Chat-model Baichuan2-7B-Chat-model.tar.gz
mkdir -p Baichuan2-7B-Chat-model
mv serving.properties Baichuan2-7B-Chat-model/
tar czvf Baichuan2-7B-Chat-model.tar.gz Baichuan2-7B-Chat-model/
# The staging directory is only needed while the archive is being created.
rm -rf Baichuan2-7B-Chat-model

Baichuan2-7B-Chat-model/
Baichuan2-7B-Chat-model/serving.properties


In [30]:
# Refer to https://github.com/aws/deep-learning-containers/blob/master/available_images.md for all available images

"""
all available framework under library folder sagemaker/image_uri_config
autogluon.json                      knn.json
blazingtext.json                    lda.json
chainer.json                        linear-learner.json
clarify.json                        model-monitor.json
coach-mxnet.json                    mxnet.json
coach-tensorflow.json               neo-mxnet.json
data-wrangler.json                  neo-pytorch.json
debugger.json                       neo-tensorflow.json
detailed-profiler.json              ntm.json
djl-deepspeed.json                  object-detection.json
djl-fastertransformer.json          object2vec.json
djl-lmi.json                        pca.json
djl-neuronx.json                    pytorch-neuron.json
djl-tensorrtllm.json                pytorch-smp.json
factorization-machines.json         pytorch-training-compiler.json
forecasting-deepar.json             pytorch.json
huggingface-llm-neuronx.json        randomcutforest.json
huggingface-llm.json                ray-pytorch.json
huggingface-neuron.json             ray-tensorflow.json
huggingface-neuronx.json            sagemaker-base-python.json
huggingface-tei-cpu.json            sagemaker-geospatial.json
huggingface-tei.json                sagemaker-tritonserver.json
huggingface-training-compiler.json  semantic-segmentation.json
huggingface.json                    seq2seq.json
image-classification-neo.json       sklearn.json
image-classification.json           spark.json
inferentia-mxnet.json               sparkml-serving.json
inferentia-pytorch.json             stabilityai.json
inferentia-tensorflow.json          tensorflow.json
instance_gpu_info.json              vw.json
ipinsights.json                     xgboost-neo.json
kmeans.json                         xgboost.json
"""
# The TensorRT-LLM flavour of the DJL container is requested here. Using the
# DeepSpeed flavour ran into issues because the "trust_remote_code" option is
# not supported there; one of the reports:
# https://github.com/aws/sagemaker-python-sdk/issues/4063
djl_image_request = {
    "framework": "djl-tensorrtllm",
    "version": "0.26.0",
    "region": session.boto_session.region_name,
}
image_uri = image_uris.retrieve(**djl_image_request)
image_uri

'763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.26.0-tensorrtllm0.7.1-cu122'

In [37]:
# Upload the packaged tarball to the session's default bucket (the bucket that
# houses the artifacts) under a fixed key prefix.
bucket = session.default_bucket()
s3_code_prefix = "large-model-lmi/code"
code_artifact = session.upload_data(
    path="Baichuan2-7B-Chat-model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-east-1-705247044519/large-model-lmi/code/Baichuan2-7B-Chat-model.tar.gz


In [38]:
!aws s3 ls s3://sagemaker-us-east-1-705247044519/large-model-lmi/code/Baichuan2-7B-Chat-model.tar.gz
!date

2024-07-10 08:19:15        307 Baichuan2-7B-Chat-model.tar.gz
Wed Jul 10 08:19:17 UTC 2024


In [None]:
""" List all the available instance types with GPU support
aws ec2 describe-instance-types \
  --query 'InstanceTypes[?GpuInfo.Gpus[0].Name!=`null`] | sort_by(@, &GpuInfo.Gpus[0].Name) | sort_by(@, &to_number(GpuInfo.Gpus[0].MemoryInfo.SizeInMiB)) | reverse(@) | [].{GPUCount:GpuInfo.Gpus[0].Count,GPUMemorySizeInMiB:GpuInfo.Gpus[0].MemoryInfo.SizeInMiB,GPUType:GpuInfo.Gpus[0].Name,InstanceType:InstanceType}' \
  --region us-east-1 \
  --output table

------------------------------------------------------------------------
|                         DescribeInstanceTypes                        |
+----------+----------------------+-------------------+----------------+
| GPUCount | GPUMemorySizeInMiB   |      GPUType      | InstanceType   |
+----------+----------------------+-------------------+----------------+
|  8       |  183105              |  L4               |  g6.48xlarge   |
|  4       |  91553               |  L4               |  g6.12xlarge   |
|  4       |  91553               |  L4               |  g6.24xlarge   |
|  8       |  81920               |  H100             |  p5.48xlarge   |
|  8       |  40960               |  A100             |  p4d.24xlarge  |
|  8       |  32768               |  V100             |  p3dn.24xlarge |
|  8       |  32768               |  Gaudi HL-205     |  dl1.24xlarge  |
|  8       |  24576               |  A10G             |  g5.48xlarge   |
|  1       |  24576               |  A10G             |  g5.8xlarge    |
|  1       |  24576               |  A10G             |  g5.xlarge     |
|  1       |  24576               |  A10G             |  g5.16xlarge   |
|  4       |  24576               |  A10G             |  g5.12xlarge   |
|  4       |  24576               |  A10G             |  g5.24xlarge   |
|  1       |  24576               |  A10G             |  g5.2xlarge    |
|  1       |  24576               |  A10G             |  g5.4xlarge    |
|  1       |  22888               |  L4               |  gr6.4xlarge   |
|  1       |  22888               |  L4               |  g6.4xlarge    |
|  1       |  22888               |  L4               |  gr6.8xlarge   |
|  1       |  22888               |  L4               |  g6.16xlarge   |
|  1       |  22888               |  L4               |  g6.xlarge     |
|  1       |  22888               |  L4               |  g6.2xlarge    |
|  1       |  22888               |  L4               |  g6.8xlarge    |
|  4       |  16384               |  V100             |  p3.8xlarge    |
|  8       |  16384               |  V100             |  p3.16xlarge   |
|  1       |  16384               |  V100             |  p3.2xlarge    |
|  1       |  16384               |  T4g              |  g5g.4xlarge   |
|  1       |  16384               |  T4g              |  g5g.xlarge    |
|  1       |  16384               |  T4g              |  g5g.8xlarge   |
|  2       |  16384               |  T4g              |  g5g.metal     |
|  2       |  16384               |  T4g              |  g5g.16xlarge  |
|  1       |  16384               |  T4g              |  g5g.2xlarge   |
|  1       |  16384               |  T4               |  g4dn.4xlarge  |
|  1       |  16384               |  T4               |  g4dn.8xlarge  |
|  1       |  16384               |  T4               |  g4dn.2xlarge  |
|  1       |  16384               |  T4               |  g4dn.16xlarge |
|  1       |  16384               |  T4               |  g4dn.xlarge   |
|  4       |  16384               |  T4               |  g4dn.12xlarge |
|  8       |  16384               |  T4               |  g4dn.metal    |
|  1       |  12288               |  K80              |  p2.xlarge     |
|  16      |  12288               |  K80              |  p2.16xlarge   |
|  8       |  12288               |  K80              |  p2.8xlarge    |
|  1       |  8192                |  Radeon Pro V520  |  g4ad.xlarge   |
|  4       |  8192                |  Radeon Pro V520  |  g4ad.16xlarge |
|  1       |  8192                |  Radeon Pro V520  |  g4ad.2xlarge  |
|  2       |  8192                |  Radeon Pro V520  |  g4ad.8xlarge  |
|  1       |  8192                |  Radeon Pro V520  |  g4ad.4xlarge  |
|  1       |  8192                |  M60              |  g3s.xlarge    |
|  1       |  8192                |  M60              |  g3.4xlarge    |
|  2       |  8192                |  M60              |  g3.8xlarge    |
|  4       |  8192                |  M60              |  g3.16xlarge   |
+----------+----------------------+-------------------+----------------+

For a more accurate list of SageMaker instance types, refer to https://docs.aws.amazon.com/sagemaker/latest/dg/notebooks-available-instance-types.html
aws --region us-east-1 pricing get-products \
  --service-code AmazonSageMaker \
  --filters Type=TERM_MATCH,Field=regionCode,Value=us-east-1 \
  | jq -r '
    .PriceList[]
    | fromjson
    | select(.product.productFamily == "ML Instance")
    | {
        GPUCount: .product.attributes.gpu,
        GPUMemory: .product.attributes.gpuMemory,
        GPUType: .product.attributes.physicalGpu,
        InstanceType: .product.attributes.instanceName
      }
    | select(.GPUType != "None" and .GPUType != "N/A")
    | .GPUMemory = if .GPUMemory != "N/A" and .GPUMemory != null then (.GPUMemory | gsub("[^0-9]"; "") | tonumber | tostring + " GiB") else .GPUMemory end
    | [.GPUCount, .GPUMemory, .GPUType, .InstanceType]
    | @tsv
  ' | column -t -s $'\t' | sort -k2 -n -r | uniq | echo -e "GPUCount\tGPUMemorySizeInGiB\tGPUType\tInstanceType\n$(cat -)"
    
GPUCount        GPUMemorySizeInGiB      GPUType InstanceType
8    6403 GiB  nvidia h100              ml.p5.48xlarge
8    6402 GiB  A100 80GB SXM            ml.p4de.24xlarge
8    3202 GiB  nvidia a100 80gb         ml.p4de.24xlarge
8    3202 GiB  nvidia a100 40gb         ml.p4d.24xlarge
8    3202 GiB  NVIDIA A100 Tensor Core  ml.p4d.24xlarge
8    256 GiB   NVIDIA Tesla V100 GPU    ml.p3dn.24xlarge
8    192 GiB   nvidia a10g              ml.g5.48xlarge
8    192 GiB   NVIDIA L4                ml.g6.48xlarge
16   192 GiB   NVIDIA K80 GPU           ml.p2.16xlarge
8    128 GiB   NVIDIA Tesla V100 GPU    ml.p3.16xlarge
8    96 GiB    NVIDIA K80 GPU           ml.p2.8xlarge
4    96 GiB    nvidia a10g              ml.g5.24xlarge
4    96 GiB    nvidia a10g              ml.g5.12xlarge
4    96 GiB    NVIDIA L4                ml.g6.24xlarge
4    96 GiB    NVIDIA L4                ml.g6.12xlarge
4    64 GiB    NVIDIA Tesla V100 GPU    ml.p3.8xlarge
4    64 GiB    NVIDIA T4 GPU            ml.g4dn.12xlarge
1    24 GiB    nvidia a10g              ml.g5.xlarge
1    24 GiB    nvidia a10g              ml.g5.8xlarge
1    24 GiB    nvidia a10g              ml.g5.4xlarge
1    24 GiB    nvidia a10g              ml.g5.2xlarge
1    24 GiB    nvidia a10g              ml.g5.16xlarge
1    24 GiB    NVIDIA L4                ml.g6.xlarge
1    24 GiB    NVIDIA L4                ml.g6.8xlarge
1    24 GiB    NVIDIA L4                ml.g6.4xlarge
1    24 GiB    NVIDIA L4                ml.g6.2xlarge
1    24 GiB    NVIDIA L4                ml.g6.16xlarge
1    16 GiB    NVIDIA Tesla V100 GPU    ml.p3.2xlarge
1    16 GiB    NVIDIA T4 GPU            ml.g4dn.xlarge
1    16 GiB    NVIDIA T4 GPU            ml.g4dn.8xlarge
1    16 GiB    NVIDIA T4 GPU            ml.g4dn.4xlarge
1    16 GiB    NVIDIA T4 GPU            ml.g4dn.2xlarge
1    16 GiB    NVIDIA T4 GPU            ml.g4dn.16xlarge
1    12 GiB    NVIDIA K80 GPU           ml.p2.xlarge
N/A  N/A       AWS Inferentia2          ml.inf2.xlarge
N/A  N/A       AWS Inferentia2          ml.inf2.8xlarge
N/A  N/A       AWS Inferentia2          ml.inf2.48xlarge
N/A  N/A       AWS Inferentia2          ml.inf2.24xlarge
N/A  N/A       AWS Inferentia           ml.inf1.xlarge
N/A  N/A       AWS Inferentia           ml.inf1.6xlarge
N/A  N/A       AWS Inferentia           ml.inf1.2xlarge
N/A  N/A       AWS Inferentia           ml.inf1.24xlarge
"""

In [39]:
instance_type = "ml.g5.12xlarge"
endpoint_name = sagemaker.utils.name_from_base("Baichuan2-7B-Chat-lmi-model")

# The model is described by the container image plus the tarball uploaded to S3.
model = Model(image_uri=image_uri, model_data=code_artifact, role=role)

# volume_size is intentionally not passed: an earlier attempt with
# volume_size=30 was rejected by CreateEndpointConfig with
#   ClientError (ValidationException): VolumeSize parameter is not allowed for
#   the selected Instance type ml.g4dn.12xlarge
model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    container_startup_health_check_timeout=1800,
)

------------------------!

In [40]:
class LineIterator:
    """Iterate over the newline-delimited lines of a chunked byte stream.

    ``stream`` is any iterable of event dicts whose payload bytes live at
    ``event['PayloadPart']['Bytes']``. Chunks are appended to an internal
    buffer, so a line may be split across several chunks and a single chunk
    may contain several lines.

    Each iteration yields one line as ``bytes`` without its trailing newline.
    Bytes left over at the end of the stream that are not terminated by a
    newline are yielded as a final line.
    """

    def __init__(self, stream):
        # Iterator producing the raw events of the stream
        self.byte_iterator = iter(stream)
        # Accumulates payload bytes until a full line is available
        self.buffer = io.BytesIO()
        # Offset in the buffer of the first byte that has not been returned yet
        self.read_pos = 0

    def __iter__(self):
        # The object is its own iterator
        return self

    def __next__(self):
        while True:
            # Try to read one line starting at the current read position
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line and line[-1] == ord('\n'):
                # Complete line: consume it and return it without the newline
                self.read_pos += len(line)
                return line[:-1]
            # No complete line buffered yet: pull the next chunk
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # The stream ended with unterminated trailing bytes. Hand
                    # them out once as the last line; retrying here would loop
                    # forever because no further chunk can ever complete them.
                    self.read_pos += len(line)
                    return line
                # Nothing left to return
                raise
            # Append the payload of the new chunk at the end of the buffer
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk['PayloadPart']['Bytes'])

In [41]:
# Short-form prompt sent in the request body of the streaming call.
prompt = (
    "Your task is to write a short paragraph in about 100 words about "
    "exercising regularly for a lifestyle focused website. "
    "Discuss benefits of regular exercises along with some tips for "
    "increasing exercise effectiveness"
)

In [49]:
# Runtime client used to invoke the deployed endpoint.
sm_client = boto3.client("sagemaker-runtime")

# "details": True is set as a runtime parameter within the request input.
body = {
    "inputs": prompt,
    "parameters": {"max_new_tokens": 512, "details": True},
}
resp = sm_client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=json.dumps(body),
)
# The response body is the event stream that is consumed in the next cell.
event_stream = resp['Body']
event_stream

<botocore.eventstream.EventStream at 0x7fcbb034dc90>

In [None]:
# Per-token log probabilities collected from the stream.
overall_log_prob = []
# Defined up front so the following cell never hits a NameError when the
# stream carries no "generated_text" field.
generated_text = None

for line in LineIterator(event_stream):
    if not line.strip():
        # Blank lines carry no JSON document
        continue
    resp = json.loads(line)

    # .get() is used throughout because not every event necessarily carries
    # every field.
    token = resp.get('token') or {}
    if token.get('text') is not None and token.get('log_prob') is not None:
        overall_log_prob.append(token['log_prob'])
    # Checked independently of the token branch: an event may contain both a
    # token and the full generated text, which an if/elif would silently drop.
    if resp.get('generated_text') is not None:
        generated_text = resp['generated_text']

In [None]:
print(generated_text)
# Confidence score = exp of the mean of the collected per-token log probabilities.
overall_score = np.exp(np.asarray(overall_log_prob).mean())
print(f"\n\nOverall confidence score in the generated text: {overall_score}")

In [None]:
# Long-form prompt (about 500 words requested).
prompt = (
    "Your task is to write a paragraph in about 500 words about "
    "exercising regularly for a lifestyle focused website. "
    "Discuss benefits of regular exercises along with some tips for "
    "increasing exercise effectiveness while reducing required time commitment"
)

In [51]:
def inference(payload):
    """Send ``payload`` to the endpoint and stream the generated tokens.

    Token texts are printed as they arrive. Reading stops at the first event
    that carries a ``details`` field.

    Args:
        payload: request body as a dict; ``payload['inputs']`` is the prompt
            text and is prepended to the generated text in the return value.

    Returns:
        A tuple ``(text, finish_reason, n_tokens)`` where ``text`` is
        ``payload['inputs']`` followed by the generated token texts,
        ``finish_reason`` is ``details['finish_reason']`` of the terminating
        event (``None`` if the stream ended without a ``details`` event) and
        ``n_tokens`` is the number of token texts received.
    """
    # Call SageMaker endpoint and get response stream
    response = sm_client.invoke_endpoint_with_response_stream(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType="application/json",
    )
    text_output = []
    finish_reason = None
    for line in LineIterator(response['Body']):
        if not line.strip():
            # Blank lines carry no JSON document
            continue
        event = json.loads(line)
        # Not every event necessarily carries a token, hence .get()
        token_text = (event.get('token') or {}).get('text')
        if token_text is not None:
            text_output.append(token_text)
            print(token_text, end='')
        # A "details" field marks the end of the generation
        details = event.get('details')
        if details is not None:
            finish_reason = details['finish_reason']
            break
    # Always return a 3-tuple, even when no "details" event was seen, so the
    # caller's tuple unpacking cannot fail on an implicit None.
    return payload['inputs'] + ''.join(text_output), finish_reason, len(text_output)

# "details": True is set as a runtime parameter within the request input;
# inference() reads the finish reason from the "details" field of the stream.
payload = {
    "inputs": prompt,
    "parameters": {"max_new_tokens": 256, "details": True},
}

# Echo the prompt once; inference() prints the generated tokens after it.
print(f"Output: {payload['inputs']}", end='')

total_tokens, total_requests = 0, 0
# Seed value so that the loop body runs at least once.
finish_reason = "length"
while finish_reason == 'length':
    # Keep requesting as long as the previous request stopped on the length limit
    output_text, finish_reason, out_token_len = inference(payload)
    # The text produced so far becomes the input of the next request
    payload['inputs'] = output_text
    total_requests, total_tokens = total_requests + 1, total_tokens + out_token_len

# Summary metrics
print(f"\n\ntotal tokens generated: {total_tokens} \ntotal requests sent: {total_requests}")

Output: Your task is to write a short paragraph in about 100 words about exercising regularly for a lifestyle focused website. Discuss benefits of regular exercises along with some tips for increasing exercise effectiveness