# Using LLama Factory finetune on SageMaker 
# 2. 使用vLLM进行本地推理

## 安装依赖包

In [13]:
!pip install vllm==0.4.3 bitsandbytes
!pip install datasets



In [3]:
### 从s3下载模型文件到本地

Collecting datasets
  Using cached datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Using cached datasets-2.21.0-py3-none-any.whl (527 kB)
Installing collected packages: datasets
Successfully installed datasets-2.21.0


In [4]:
import boto3
import pprint
from tqdm import tqdm
import sagemaker
sagemaker_session =  sagemaker.session.Session() #sagemaker.session.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [5]:
print(default_bucket)

sagemaker-us-west-2-342367142984


In [6]:
!aws s3 sync s3://{default_bucket}/llama3-8b-qlora/ ./local_model

download: s3://sagemaker-us-west-2-342367142984/llama3-8b-qlora/finetuned_model/README.md to local_model/finetuned_model/README.md
download: s3://sagemaker-us-west-2-342367142984/llama3-8b-qlora/finetuned_model/adapter_config.json to local_model/finetuned_model/adapter_config.json
download: s3://sagemaker-us-west-2-342367142984/llama3-8b-qlora/finetuned_model/checkpoint-160/rng_state.pth to local_model/finetuned_model/checkpoint-160/rng_state.pth
download: s3://sagemaker-us-west-2-342367142984/llama3-8b-qlora/finetuned_model/checkpoint-160/README.md to local_model/finetuned_model/checkpoint-160/README.md
download: s3://sagemaker-us-west-2-342367142984/llama3-8b-qlora/finetuned_model/checkpoint-160/adapter_config.json to local_model/finetuned_model/checkpoint-160/adapter_config.json
download: s3://sagemaker-us-west-2-342367142984/llama3-8b-qlora/finetuned_model/all_results.json to local_model/finetuned_model/all_results.json
download: s3://sagemaker-us-west-2-342367142984/llama3-8b-qlor

## 加载模型tokenizer

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [8]:
model_id = 'TechxGenus/Meta-Llama-3-8B-Instruct-AWQ'
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

## 加载sample数据，用于对比

In [14]:
from datasets import load_dataset
from random import randrange
# dataset_name = "zxbsmk/webnovel_cn"
dataset_name = "hfl/ruozhiba_gpt4"
# Load dataset from the hub
train_dataset = load_dataset(dataset_name, split="train",revision='41d2c61beb86c8d4c61916cc656c39d018c40ce5')

In [17]:

print(f"Training size: {len(train_dataset)}")
print("\nTraining sample:\n")
num_samples = 200
print(train_dataset[randrange(num_samples)])

Training size: 4898

Training sample:

{'input': '', 'instruction': '问一下考过驾照的吧友们 哪个科比较容易挂？', 'output': '在驾照考试中，通常科目二和科目三被认为是较容易挂科的。科目二主要是场地驾驶技能考试，包括倒车入库、侧方停车、直角转弯、坡道定点停车与起步等项目，这些项目对考生的技术细节要求较高，稍有不慎可能就会犯错。而科目三是道路驾驶技能考试，涉及变道、交通灯应对、路口转弯、公交车站靠站等实际道路驾驶的操作，在真实交通环境中，考生需要应对复杂多变的情况，也容易出现失误。因此，考生普遍认为这两个科目较为困难。'}


In [18]:
sql_lora_path = './local_model/finetuned_model'

## 使用本地的vLLM部署

In [19]:
from vllm.lora.request import LoRARequest
from vllm import LLM,SamplingParams
from transformers import AutoModelForCausalLM, AutoTokenizer


In [20]:
model_id = 'TechxGenus/Meta-Llama-3-8B-Instruct-AWQ'
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [21]:
llm = LLM(model=model_id,max_model_len=4096,enable_lora=True)

config.json:   0%|          | 0.00/885 [00:00<?, ?B/s]

INFO 08-29 19:48:45 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='TechxGenus/Meta-Llama-3-8B-Instruct-AWQ', speculative_config=None, tokenizer='TechxGenus/Meta-Llama-3-8B-Instruct-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=TechxGenus/Meta-Llama-3-8B-Instruct-AWQ)


generation_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

INFO 08-29 19:48:48 weight_utils.py:207] Using model weights format ['*.safetensors']


model-00001-of-00002.safetensors:   0%|          | 0.00/4.68G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/63.5k [00:00<?, ?B/s]

INFO 08-29 19:49:04 model_runner.py:146] Loading model weights took 5.3479 GB
INFO 08-29 19:49:07 gpu_executor.py:83] # GPU blocks: 6493, # CPU blocks: 2048
INFO 08-29 19:49:09 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-29 19:49:09 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-29 19:49:17 model_runner.py:924] Graph capturing finished in 8 secs.


In [29]:
测试第一个消息
messages = [
    {"role": "system", "content":"请始终用中文回答"},
     {"role": "user", "content": "你是谁？你是干嘛的"},
]

# #测试第二个消息
# messages = [
#     {"role": "system", "content":"请始终用中文回答"},
#      {"role": "user", "content": "睡觉时被女鬼压床我该怎么办？"},
# ]


inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

### 使用原始模型进行推理

In [25]:
sampling_params = SamplingParams(temperature=0.1, top_p=0.95,max_tokens=512)

outputs = llm.generate(inputs, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt:\n{prompt!r}")
    print(f"Response:\n{generated_text!r}")


Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it, Generation Speed: 65.75 toks/s]

Prompt:
'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n请始终用中文回答<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n你是谁？你是干嘛的<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
Response:
'我是 LLaMA，一个由 Meta 开发的基于人工智能的语言模型。我可以理解和生成自然语言，帮助用户回答问题、完成任务和进行对话。我是一个大语言模型，能够学习和改进自己，适用于各种应用场景，例如客服、内容创作、翻译等。'





### 加载Lora进行推理

In [30]:
sql_lora_path = './local_model/finetuned_model'

In [31]:
outputs = llm.generate(inputs, sampling_params,lora_request=LoRARequest("adapter", 1, sql_lora_path))

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt:\n{prompt!r}")
    print(f"Response:\n{generated_text!r}")

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.34s/it, Generation Speed: 62.84 toks/s]

Prompt:
'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n请始终用中文回答<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n睡觉时被女鬼压床我该怎么办？<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
Response:
'睡觉时被女鬼压床这种现象在传统文化中被称为“被压床”或“被压梦”，在现代心理学和科学角度上，通常被解释为梦境中的幻觉或心理压力。这种现象可能是由于梦境中的情绪过强、睡眠质量不佳、心理压力大、或者是受到某些文化或传统的影响。因此，如果你经常梦到被女鬼压床，建议你可以尝试改善睡眠习惯，例如保持规律的睡眠时间、避免大量的精神活动前睡眠、进行放松技巧等。'



