<a href="https://colab.research.google.com/github/hamzafarooq/multi-agent-course/blob/main/Module_2/Quantization/TextStreamer_Meta_Llama_3_1_8B_Instruct.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install transformers
!pip install bitsandbytes

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

## Using TextStreamer

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch
from transformers import BitsAndBytesConfig

In [None]:
def start_gpu_stat():
    """Capture a GPU memory baseline to compare against after loading a model.

    Returns:
        tuple: ``(initial_gpu_memory, max_memory)``. The first value is the
        peak memory reserved so far as reported by
        ``torch.cuda.max_memory_reserved()``; the second is the total memory
        of CUDA device 0. Both are bytes divided by 1024**3 and rounded to
        three decimals.
    """
    #@title Show current memory stats
    # Set torch device to get properties global: torch.cuda.set_device(0)
    bytes_per_unit = 1024 ** 3

    device_props = torch.cuda.get_device_properties(0)
    reserved_so_far = torch.cuda.max_memory_reserved()

    baseline = round(reserved_so_far / bytes_per_unit, 3)
    capacity = round(device_props.total_memory / bytes_per_unit, 3)
    return baseline, capacity

def final_gpu_stat(_initial_gpu_memory, _max_memory):
    """Print peak reserved GPU memory relative to a previously taken baseline.

    Args:
        _initial_gpu_memory: Baseline reserved memory, in the same unit as
            returned by ``start_gpu_stat`` (bytes / 1024**3).
        _max_memory: Total device memory in that same unit; used as the
            denominator for both percentages.

    Returns:
        None. The six-line report is written to stdout.
    """
    #@title Show final memory and time stats
    bytes_per_unit = 1024 ** 3
    peak_now = round(torch.cuda.max_memory_reserved() / bytes_per_unit, 3)
    growth = round(peak_now - _initial_gpu_memory, 3)

    # Both percentages are expressed against the total device memory.
    peak_pct = round(peak_now / _max_memory * 100, 3)
    growth_pct = round(growth / _max_memory * 100, 3)

    report = [
        f"Max memory = {_max_memory} GB.",
        f"{_initial_gpu_memory} GB of INITIAL memory reserved.",
        f"Peak reserved FINAL memory = {peak_now} GB.",
        f"Peak reserved memory DIFFERENCE = {growth} GB.",
        f"Peak reserved memory % of FINAL memory = {peak_pct} %.",
        f"Peak reserved memory % of DIFFERENCE memory = {growth_pct} %.",
    ]
    for line in report:
        print(line)

# Text Streaming Without Quantization

In [None]:
# Hugging Face Hub repository of the checkpoint to benchmark.
model_id = "unsloth/Meta-Llama-3.1-8B-Instruct" # Replace with your model

# The tokenizer is loaded first; it does not affect the GPU memory report.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Take the baseline immediately before loading the weights so the report
# printed below reflects what the model load added.
initial_gpu_memory, max_memory = start_gpu_stat()

# No quantization config here: this is the unquantized reference run.
# NOTE(review): no torch_dtype is passed, so the library default applies —
# confirm that this is the precision you intend to compare against.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

final_gpu_stat(initial_gpu_memory, max_memory)

In [None]:
from transformers import TextStreamer

# Alpaca-style template; the single {} slot receives the instruction text.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
"""

# Fill the template with the instruction to answer.
instruction = "What is the importance of using renewable energy?"
prompt_text = alpaca_prompt.format(instruction)

# Tokenize as a batch of one, then move the tensors to the model's device.
inputs = tokenizer([prompt_text], return_tensors="pt")
inputs = inputs.to(model.device)

# Streamer that prints text to stdout as it is generated. The prompt is echoed
# (skip_prompt=False) and special tokens are kept in the printed text.
text_streamer = TextStreamer(
    tokenizer,
    skip_prompt=False,
    skip_special_tokens=False,
)

# The return value is discarded: the streamer already prints the text.
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=100,
)


# Time To First Token (TTFT)
Time required to process the prompt and then generate the first output token

# Inter-token latency (ITL)
Average time between consecutive tokens

# End-to-end Latency  
Total time taken to generate the entire response
≈ TTFT + (number of output tokens − 1) × Inter-token latency; for long outputs this is roughly the average output length in tokens × Inter-token latency

# Throughput
Number of output tokens per second


In [None]:
from transformers import TextIteratorStreamer
from threading import Thread
import time

# Define Alpaca-style prompt format
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
"""

# Prepare input text
prompt_text = alpaca_prompt.format("What is the importance of using renewable energy?")
inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

# Streamer that exposes generated text through an iterator; the prompt is
# skipped so only newly generated text is timed and collected.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

# generate() blocks until it finishes, so it runs in a worker thread while
# this thread consumes the streamer.
thread = Thread(target=model.generate, kwargs={
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
    'streamer': streamer,
    'max_new_tokens': 100
})

# State collected while streaming
token_times = []
model_output = ""
first_token_time = None

# Use a monotonic, high-resolution clock for interval measurements, and start
# it right before generation so setup work above is not counted.
start_time = time.perf_counter()
thread.start()

# Iterate over the streamer to get the generated text in chunks
for new_text in streamer:
    # Timestamp on arrival, before printing, so console I/O is not attributed
    # to the chunk that was just received.
    arrival = time.perf_counter()
    if first_token_time is None:
        first_token_time = arrival
    token_times.append(arrival)

    model_output += new_text
    print(new_text, end='')

# Calculate end-to-end latency (stream exhausted == generation finished)
end_time = time.perf_counter()
end_to_end_latency = end_time - start_time

# Make sure the worker thread has fully exited before leaving the cell.
thread.join()

# Calculate time to first token
ttft = first_token_time - start_time if first_token_time is not None else 0

# Calculate inter-token latency: the sum of consecutive gaps telescopes to
# (last - first), averaged over the number of gaps.
# NOTE(review): timestamps are per streamed chunk; confirm against the
# TextIteratorStreamer implementation that one chunk corresponds to one token.
itl = (token_times[-1] - token_times[0]) / (len(token_times) - 1) if len(token_times) > 1 else 0

# Calculate throughput. Re-encode without adding special tokens so the
# tokenizer does not prepend a BOS token that was never generated.
generated_tokens = len(tokenizer.encode(model_output, add_special_tokens=False)) if model_output else 0
throughput = generated_tokens / end_to_end_latency if model_output else 0

print("\nTime To First Token (TTFT):", ttft)
print("Inter-token latency (ITL):", itl)
print("End-to-end Latency:", end_to_end_latency)
print("Throughput:", throughput)

# Text Streaming With Quantization

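Shut down and restart the kernel before running the cells below, so that the GPU memory held by the unquantized model is released and the memory statistics start from a clean baseline.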

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch
from transformers import BitsAndBytesConfig

In [None]:
def start_gpu_stat():
    """Capture a GPU memory baseline to compare against after loading a model.

    Returns:
        tuple: ``(initial_gpu_memory, max_memory)``. The first value is the
        peak memory reserved so far as reported by
        ``torch.cuda.max_memory_reserved()``; the second is the total memory
        of CUDA device 0. Both are bytes divided by 1024**3 and rounded to
        three decimals.
    """
    #@title Show current memory stats
    # Set torch device to get properties global: torch.cuda.set_device(0)
    bytes_per_unit = 1024 ** 3

    device_props = torch.cuda.get_device_properties(0)
    reserved_so_far = torch.cuda.max_memory_reserved()

    baseline = round(reserved_so_far / bytes_per_unit, 3)
    capacity = round(device_props.total_memory / bytes_per_unit, 3)
    return baseline, capacity

def final_gpu_stat(_initial_gpu_memory, _max_memory):
    """Print peak reserved GPU memory relative to a previously taken baseline.

    Args:
        _initial_gpu_memory: Baseline reserved memory, in the same unit as
            returned by ``start_gpu_stat`` (bytes / 1024**3).
        _max_memory: Total device memory in that same unit; used as the
            denominator for both percentages.

    Returns:
        None. The six-line report is written to stdout.
    """
    #@title Show final memory and time stats
    bytes_per_unit = 1024 ** 3
    peak_now = round(torch.cuda.max_memory_reserved() / bytes_per_unit, 3)
    growth = round(peak_now - _initial_gpu_memory, 3)

    # Both percentages are expressed against the total device memory.
    peak_pct = round(peak_now / _max_memory * 100, 3)
    growth_pct = round(growth / _max_memory * 100, 3)

    report = [
        f"Max memory = {_max_memory} GB.",
        f"{_initial_gpu_memory} GB of INITIAL memory reserved.",
        f"Peak reserved FINAL memory = {peak_now} GB.",
        f"Peak reserved memory DIFFERENCE = {growth} GB.",
        f"Peak reserved memory % of FINAL memory = {peak_pct} %.",
        f"Peak reserved memory % of DIFFERENCE memory = {growth_pct} %.",
    ]
    for line in report:
        print(line)

In [None]:
# Hugging Face Hub repository of the checkpoint to benchmark.
model_id = "unsloth/Meta-Llama-3.1-8B-Instruct" # Replace with your model

# bitsandbytes settings: 4-bit weights using the "nf4" quantization type,
# with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# The tokenizer is loaded first; it does not affect the GPU memory report.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Take the baseline immediately before loading the weights so the report
# printed below reflects what the quantized model load added.
initial_gpu_memory, max_memory = start_gpu_stat()

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)

final_gpu_stat(initial_gpu_memory, max_memory)

In [None]:
from transformers import TextStreamer

# Alpaca-style template; the single {} slot receives the instruction text.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
"""

# Fill the template with the instruction to answer.
instruction = "What is the importance of using renewable energy?"
prompt_text = alpaca_prompt.format(instruction)

# Tokenize as a batch of one, then move the tensors to the model's device.
inputs = tokenizer([prompt_text], return_tensors="pt")
inputs = inputs.to(model.device)

# Streamer that prints text to stdout as it is generated. The prompt is not
# echoed (skip_prompt=True) and special tokens are kept in the printed text.
text_streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=False,
)

# The return value is discarded: the streamer already prints the text.
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=100,
)


In [None]:
from transformers import TextIteratorStreamer
from threading import Thread
import time

# Define Alpaca-style prompt format
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
"""

# Prepare input text
prompt_text = alpaca_prompt.format("What is the importance of using renewable energy?")
inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

# Streamer that exposes generated text through an iterator; the prompt is
# skipped so only newly generated text is timed and collected.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

# generate() blocks until it finishes, so it runs in a worker thread while
# this thread consumes the streamer.
thread = Thread(target=model.generate, kwargs={
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
    'streamer': streamer,
    'max_new_tokens': 100
})

# State collected while streaming
token_times = []
model_output = ""
first_token_time = None

# Use a monotonic, high-resolution clock for interval measurements, and start
# it right before generation so setup work above is not counted.
start_time = time.perf_counter()
thread.start()

# Iterate over the streamer to get the generated text in chunks
for new_text in streamer:
    # Timestamp on arrival, before printing, so console I/O is not attributed
    # to the chunk that was just received.
    arrival = time.perf_counter()
    if first_token_time is None:
        first_token_time = arrival
    token_times.append(arrival)

    model_output += new_text
    print(new_text, end='')

# Calculate end-to-end latency (stream exhausted == generation finished)
end_time = time.perf_counter()
end_to_end_latency = end_time - start_time

# Make sure the worker thread has fully exited before leaving the cell.
thread.join()

# Calculate time to first token
ttft = first_token_time - start_time if first_token_time is not None else 0

# Calculate inter-token latency: the sum of consecutive gaps telescopes to
# (last - first), averaged over the number of gaps.
# NOTE(review): timestamps are per streamed chunk; confirm against the
# TextIteratorStreamer implementation that one chunk corresponds to one token.
itl = (token_times[-1] - token_times[0]) / (len(token_times) - 1) if len(token_times) > 1 else 0

# Calculate throughput. Re-encode without adding special tokens so the
# tokenizer does not prepend a BOS token that was never generated.
generated_tokens = len(tokenizer.encode(model_output, add_special_tokens=False)) if model_output else 0
throughput = generated_tokens / end_to_end_latency if model_output else 0

print("\nTime To First Token (TTFT):", ttft)
print("Inter-token latency (ITL):", itl)
print("End-to-end Latency:", end_to_end_latency)
print("Throughput:", throughput)