# 部署语言模型

## 上传一个空的 tar.gz 文件到 S3

In [None]:
# Build a gzip-compressed tar archive with no members: -T reads the list of
# files to add from /dev/null, which is empty.
!tar zcvf chatglm2.tar.gz -T /dev/null
# Copy the archive to the S3 key that the deployment step uses as its model data.
!aws s3 cp chatglm2.tar.gz s3://cloudbeer-sagemaker-models/llm/chatglm2.tar.gz

## 在 SageMaker 上部署模型

In [3]:
import boto3  
from sagemaker.huggingface.model import HuggingFaceModel

# S3 URI handed to SageMaker as the model artifact (model_data).
s3_model = "s3://cloudbeer-sagemaker-models/llm/chatglm2.tar.gz"

# Resolve the ARN of the IAM role by name; the ARN is passed to HuggingFaceModel.
iam_client = boto3.client('iam')
role_response = iam_client.get_role(RoleName='HuggingfaceExecuteRole')
role = role_response['Role']['Arn']

# Model definition: framework versions plus the custom entry point script,
# which is read from the local src/code directory.
model_settings = dict(
    model_data=s3_model,
    role=role,
    transformers_version='4.26',
    pytorch_version='1.13',
    py_version='py39',
    entry_point='inference.py',
    source_dir='src/code',
)
huggingface_model = HuggingFaceModel(**model_settings)

# Endpoint definition: a single ml.g5.2xlarge instance behind the 'chatglm2' endpoint.
# NOTE(review): the account needs a non-zero endpoint quota for this instance
# type, otherwise CreateEndpoint fails with ResourceLimitExceeded.
endpoint_settings = dict(
    initial_instance_count=1,
    instance_type='ml.g5.2xlarge',
    endpoint_name='chatglm2',
)
predictor = huggingface_model.deploy(**endpoint_settings)

ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateEndpoint operation: The account-level service limit 'ml.g5.2xlarge for endpoint usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.

## 删除模型和终端节点

In [None]:
from sagemaker.huggingface.model import HuggingFacePredictor

# Attach a predictor handle to the endpoint by name.
endpoint_name = 'chatglm2'
predictor = HuggingFacePredictor(endpoint_name=endpoint_name)

# Tear down the model first, then the endpoint.
# NOTE(review): the order presumably matters, since delete_model appears to
# look the model up through the endpoint; confirm against the SDK docs
# before reordering.
predictor.delete_model()
predictor.delete_endpoint()