## Usage 1

Replace transformers' beam search with atspeed's.

In [None]:
# Load models
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint locations can be overridden with environment variables so the
# notebook is runnable on machines other than the one it was written on.
# The defaults are the original hardcoded paths.
target_checkpoint = os.environ.get(
    "ATSPEED_TARGET_CHECKPOINT", "/storage/syma/models/vicuna-7b-v1.3/"
)
draft_checkpoint = os.environ.get(
    "ATSPEED_DRAFT_CHECKPOINT", "/storage/syma/models/vicuna-68m/"
)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Only the target checkpoint's tokenizer is loaded.
# NOTE(review): this presumes the draft model shares the same vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained(target_checkpoint)
target_model = AutoModelForCausalLM.from_pretrained(target_checkpoint, device_map=device).eval()
draft_model = AutoModelForCausalLM.from_pretrained(draft_checkpoint, device_map=device).eval()

# Tokenize the demo prompt once and move the tensors to the models' device.
prompt = "Long long ago"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

In [None]:
# Set generation parameters
max_new_tokens = 10   # new tokens generated by the target model
beam_size = 10        # beams kept (and returned) by the target model
draft_beam_size = 10  # beams kept (and returned) by the draft model
gamma = 3             # new tokens the draft model generates per call

# Both models return a dict-style output that includes scores, and every beam
# is returned (num_return_sequences equals num_beams).
target_model.generation_config.update(
    max_new_tokens=max_new_tokens,
    num_beams=beam_size,
    num_return_sequences=beam_size,
    return_dict_in_generate=True,
    output_scores=True,
)
draft_model.generation_config.update(
    max_new_tokens=gamma,
    num_beams=draft_beam_size,
    num_return_sequences=draft_beam_size,
    return_dict_in_generate=True,
    output_scores=True,
)

In [4]:
from atspeed.beamsd4timing import Timer

# First call to generate(); it is timed separately from the later runs and
# reported as including resource preparation.
with Timer() as timer_first:
    outputs = target_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=beam_size,
    )

print(f"First generation with resource preparation: {timer_first.time_cost:.2f} s")
# `outputs` is dict-like because return_dict_in_generate is enabled in the
# generation config.
print(tokenizer.batch_decode(outputs["sequences"], skip_special_tokens=True))

First generation with resource preparation: 2.14 s
['Long long ago, in a galaxy far, far away,', 'Long long ago, in a galaxy far, far away...', 'Long long ago, in a land far, far away, there', 'Long long ago, in a galaxy far, far away…', 'Long long ago, in a galaxy far far away, there', 'Long long ago, in a land far far away, there was', 'Long long ago, in a faraway land, there was a', 'Long long ago, in a far-off land, there was', 'Long long ago, in a far far away land, there was', 'Long long ago, in a galaxy far far away...\n']


In [5]:
# Baseline: the same generate() call as the first run, timed again as the
# reference number for transformers' own beam search.
with Timer() as timer_TF:
    outputs = target_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=beam_size,
    )

print(f"transformers (beam search by batch): {timer_TF.time_cost:.2f} s")
print(tokenizer.batch_decode(outputs["sequences"], skip_special_tokens=True))

transformers (beam search by batch): 0.89 s
['Long long ago, in a galaxy far, far away,', 'Long long ago, in a galaxy far, far away...', 'Long long ago, in a land far, far away, there', 'Long long ago, in a galaxy far, far away…', 'Long long ago, in a galaxy far far away, there', 'Long long ago, in a land far far away, there was', 'Long long ago, in a faraway land, there was a', 'Long long ago, in a far-off land, there was', 'Long long ago, in a far far away land, there was', 'Long long ago, in a galaxy far far away...\n']


In [6]:
from atspeed.beamsd_replace import replace_beam_search_with_speculative_decoding

# `model` is a second name for the same object as `target_model`, not a copy.
# NOTE(review): if the replacement below patches the object in place, every
# later cell that uses `target_model` also runs the replaced beam search — confirm.
model = target_model
replace_beam_search_with_speculative_decoding(model)

# Same generate() arguments as the baseline run, so the timings are comparable.
with Timer() as timer_AT:
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=beam_size,
    )

print(f"atspeed (beam search by TreeAttn): {timer_AT.time_cost:.2f} s")
print(tokenizer.batch_decode(outputs["sequences"], skip_special_tokens=True))

atspeed (beam search by TreeAttn): 0.75 s
['Long long ago, in a galaxy far, far away,', 'Long long ago, in a galaxy far, far away...', 'Long long ago, in a land far, far away, there', 'Long long ago, in a galaxy far, far away…', 'Long long ago, in a galaxy far far away, there', 'Long long ago, in a land far far away, there was', 'Long long ago, in a faraway land, there was a', 'Long long ago, in a far-off land, there was', 'Long long ago, in a far far away land, there was', 'Long long ago, in a galaxy far far away...\n']


In [7]:
from atspeed.beamsd4timing import BSSD4timming

# Draft-assisted run through the timing helper. The result is read below by the
# keys time_cost, total_accept_steps, target_time_cost, draft_time_cost and
# verify_time_cost.
outputs = BSSD4timming(
    target_model,
    draft_model,
    inputs,
    gamma,
    max_new_tokens,
)

print(f"atspeed-draft: {outputs['time_cost']:.2f} s, accepted_steps: {outputs['total_accept_steps']}")
print(f"target_time_cost: {outputs['target_time_cost']:.2f} s, draft_time_cost: {outputs['draft_time_cost']:.2f} s, verify_time_cost: {outputs['verify_time_cost']:.2f} s")

atspeed-draft: 0.91 s, accepted_steps: 0
target_time_cost: 0.72 s, draft_time_cost: 0.10 s, verify_time_cost: 0.02 s


In [8]:
from atspeed.beamsd import beam_search_by_speculative_decoding

# Draft-assisted beam search, timed from the outside with Timer. The result is
# read below by the keys total_accept_steps and beam_sequence.
with Timer() as timer_ATSD:
    outputs = beam_search_by_speculative_decoding(
        target_model,
        draft_model,
        inputs,
        gamma,
        max_new_tokens,
    )

print(f"atspeed-draft: {timer_ATSD.time_cost:.2f} s, accepted_steps: {outputs['total_accept_steps']}")
print(tokenizer.batch_decode(outputs["beam_sequence"], skip_special_tokens=True))

atspeed-draft: 0.59 s, accepted_steps: 0
['Long long ago.\nI\'m not sure what you mean by "I\'m not sure what you mean by "I\'m not sure what you mean by "', 'Long long ago.\nI hope you are doing well. I hope you are doing well. I hope you are doing well. I hope you are doing well. I hope', 'Long long ago.\nI hope you are doing well. I hope you are doing well. I hope you are doing well. I hope you are doing well.\n\n', 'Long long ago!\nI hope you have a great day!\n\nI hope you have a great day too!\n\nI hope you have a great day too!', 'Long long ago.\nI hope you are doing well. I hope you are doing well. I hope you are doing well. I hope you are doing well.\nI', 'Long long ago.\nI hope you are doing well. I hope you are doing well. I hope you are doing well.\nI hope you are doing well. I', "Long long ago.\nI don't know if it's just me or if it's just me or if it's just me or if it's", 'Long long ago!\nI hope you have a great day!\n\nBest regards,\n[Your Name]\n[Your Email]\n[Your Pho

## Usage 2

Use atspeed's beam search.