In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
from bamboo import bamboo

class StopTokenCriteria(StoppingCriteria):
    def __init__(self, stop_token_id):
        self.stop_token_id = stop_token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # 检查最新生成的 token 是否是停止标记
        return input_ids[0, -1] in self.stop_token_id

# Load the tokenizer and model
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#tokenizer = AutoTokenizer.from_pretrained("/home/dozhang/nlcmt/HuggingfaceModels/Llama-2-13b-chat-hf")
#model = AutoModelForCausalLM.from_pretrained("/home/dozhang/nlcmt/HuggingfaceModels/Llama-2-13b-chat-hf", torch_dtype=torch.bfloat16, device_map="auto")

tokenizer = AutoTokenizer.from_pretrained("/home/dozhang/nlcmt1/HuggingfaceModels/Meta-Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("/home/dozhang/nlcmt1/HuggingfaceModels/Meta-Llama-3.1-8B-Instruct", torch_dtype=torch.float16, device_map="auto")

#tokenizer = AutoTokenizer.from_pretrained("/home/dozhang/nlcmt1/HuggingfaceModels/Meta-Llama-3.1-70B-Instruct")
#model = AutoModelForCausalLM.from_pretrained("/home/dozhang/nlcmt1/HuggingfaceModels/Meta-Llama-3.1-70B-Instruct", torch_dtype=torch.float16, device_map="auto")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:


input_text = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n2+5=10, 3+6=18, 4+7=?<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

input_text = "Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?"


input_text = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nif let 2+3=13, 3+4=25, 4+5=41, 5+6=?<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

input_text = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuestion: Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
input_text = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhich of these is needed in all stages of the butterfly's life cycle?\nA. Wings\nB. Eyes\nC. Soil\nD. Air<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
#input_text="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhich statement about the molecules in ice and the molecules in liquid water is correct?\nA. The molecules in ice have more energy than the molecules in liquid water.\nB. The molecules in ice contain different atoms than the molecules in liquid water.\nC. The molecules in ice have more electric charge than the molecules in liquid water.\nD. The molecules in ice are less free to move than the molecules in liquid water.<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
#input_text="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

inputs = tokenizer(input_text, return_tensors="pt").to(device)

# Create attention mask
attention_mask = inputs.attention_mask

# 定义停止条件
stop_token_id = tokenizer.convert_tokens_to_ids(["</s>", "<|eot_id|>", "<|end_of_text|>", "<|end_header_id|>", "<|start_header_id|>"])
stopping_criteria = StoppingCriteriaList([StopTokenCriteria(stop_token_id)])


In [3]:
from hf_model_adapt import hf_adapt

input_length = inputs["input_ids"].shape[1]
### Todo: complete hf_adapt 
#(model, numDecodeLayer, numSkipLayer) = hf_adapt(model, tokenizer, 512, nBarLayer=54, valBarSim=0.98, nOutLayer = 4, nCheckLayer=2, nWarmupTok = input_length + 3, globalBarLayer=-1, verbose=True) 
#(model, numDecodeLayer, numSkipLayer) = hf_adapt(model, tokenizer, 512, nBarLayer=20, valBarSim=0.96, nOutLayer = 3, nCheckLayer=2, nWarmupTok = -1, globalBarLayer=-1, verbose=True)

(model, globalNumDecodedLayer, globalNumSkippedLayer) = bamboo(model, tokenizer, 512, nBarLayer=54, valBarSim=0.96, nOutLayer = 3, nCheckLayer=2, nWarmupTok = -1, globalBarLayer=-1, verbose=True)

#import pdb; pdb.set_trace()
# Generate text
generation_kwargs = {
                        "do_sample":False,
                         "temperature":0, 
                         "top_p":1
                    }
outputs = model.generate(inputs["input_ids"], attention_mask=attention_mask, max_length=512, num_return_sequences=1, 
                         pad_token_id=tokenizer.eos_token_id, stopping_criteria=stopping_criteria, **generation_kwargs)

# Decode the generated text
generated_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=False)

print(f'\nPrompt::: {input_text}\n')
print(f'Response >>> {generated_text}')
print(f'\nNumDecodedLayer={torch.sum(globalNumDecodedLayer)}\tNumSkippedLayer={torch.sum(globalNumSkippedLayer)}')
print(f'SaveRatio={torch.sum(globalNumSkippedLayer)/(torch.sum(globalNumSkippedLayer)+torch.sum(globalNumDecodedLayer))*100:.2f}%')




*** Set BarLayer=54 based on prompt-layer similairty over 52 tokens.

--- No truncation at #position 51, #SimScore 0.9521484375, for token ['<|begin_of_text|>', '<|begin_of_text|>', '<|start_header_id|>', 'system', '<|end_header_id|>', 'ĊĊ', 'You', 'Ġare', 'Ġa', 'Ġhelpful', 'ĠAI', 'Ġassistant', '.', '<|eot_id|>', '<|start_header_id|>', 'user', '<|end_header_id|>', 'ĊĊ', 'Which', 'Ġof', 'Ġthese', 'Ġis', 'Ġneeded', 'Ġin', 'Ġall', 'Ġstages', 'Ġof', 'Ġthe', 'Ġbutterfly', "'s", 'Ġlife', 'Ġcycle', '?Ċ', 'A', '.', 'ĠWings', 'Ċ', 'B', '.', 'ĠEyes', 'Ċ', 'C', '.', 'ĠSoil', 'Ċ', 'D', '.', 'ĠAir', '<|eot_id|>', '<|start_header_id|>', 'assistant', '<|end_header_id|>']

--- No truncation at #position 52, #SimScore 0.90185546875, for token ['ĊĊ']

--- No truncation at #position 53, #SimScore 0.91796875, for token ['The']

--- No truncation at #position 54, #SimScore 0.87646484375, for token ['Ġanswer']

--- No truncation at #position 55, #SimScore 0.96875, for token ['Ġis']

--- No truncation at #p