In [None]:
!pip install --upgrade transformers hqq bitsandbytes transformers datasets accelerate
!pip install -q flash-attn --no-build-isolation

In [1]:
import torch  
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, BitsAndBytesConfig  
import os  
  
# Define multiple prompts for batch processing  
base_prompts = [  
    "The best tomato sauce is",  
    "Artificial intelligence is",  
    "The future of technology is",  
    "Climate change is",  
    "The benefits of exercise are",  
    "Healthy eating includes",  
    "The importance of sleep is",  
    "The universe is",  
    "Machine learning is",  
    "Renewable energy sources are",  
]  
  
# Multiply the prompts by 20 to increase the batch size  
prompts = base_prompts * 20  
  
model_id = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"  
  
bnb_config = BitsAndBytesConfig(  
    load_in_4bit=True,  
    bnb_4bit_quant_type="nf4",  
    bnb_4bit_compute_dtype=torch.float16,  
    bnb_4bit_use_double_quant=True,  
)  
  
tokenizer = AutoTokenizer.from_pretrained(model_id)  
  
# Set padding token  
if tokenizer.pad_token is None:  
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  
  
model = AutoModelForCausalLM.from_pretrained(  
    model_id,   
    trust_remote_code=True,   
    quantization_config=bnb_config,   
    attn_implementation="flash_attention_2",   
    torch_dtype=torch.float16,   
    device_map="cuda:0"  
)  
  
# Resize token embeddings to match the tokenizer's vocabulary size  
model.resize_token_embeddings(len(tokenizer))  
  
def print_gpu_memory_usage():  
    allocated_memory = torch.cuda.memory_allocated()  
    reserved_memory = torch.cuda.memory_reserved()  
    print(f"Allocated GPU memory: {allocated_memory / (1024**3):.2f} GB")  
    print(f"Reserved GPU memory: {reserved_memory / (1024**3):.2f} GB")  
  
print("Before tokenization and model generation:")  
print_gpu_memory_usage()  
  
# Tokenize the prompts with batch processing  
inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)  
  
print("After tokenization:")  
print_gpu_memory_usage()  
  
# Generate outputs for the batch of prompts  
outputs = model.generate(  
    **inputs,   
    max_new_tokens=1000,   
    cache_implementation="quantized",   
    cache_config={"backend": "HQQ", "nbits": 4, "q_group_size": 128, "residual_length": 64, "device": model.device}  
)  
  
print("After model generation:")  
print_gpu_memory_usage()  
  
# Decode the outputs for each prompt in the batch  
results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]  
  
# Print the results  
#for i, result in enumerate(results):  
#    print(f"Result for prompt {i+1}: {result}")  


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Before tokenization and model generation:
Allocated GPU memory: 5.31 GB
Reserved GPU memory: 5.40 GB
After tokenization:
Allocated GPU memory: 5.31 GB
Reserved GPU memory: 5.40 GB
After model generation:
Allocated GPU memory: 5.32 GB
Reserved GPU memory: 29.34 GB


: 

In [1]:
import torch  
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, BitsAndBytesConfig  
import os  

  
# Define multiple prompts for batch processing  
base_prompts = [  
    "The best tomato sauce is",  
    "Artificial intelligence is",  
    "The future of technology is",  
    "Climate change is",  
    "The benefits of exercise are",  
    "Healthy eating includes",  
    "The importance of sleep is",  
    "The universe is",  
    "Machine learning is",  
    "Renewable energy sources are",  
]  
  
# Multiply the prompts by 20 to increase the batch size  
prompts = base_prompts * 20  
  
model_id = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"  
  
bnb_config = BitsAndBytesConfig(  
    load_in_4bit=True,  
    bnb_4bit_quant_type="nf4",  
    bnb_4bit_compute_dtype=torch.float16,  
    bnb_4bit_use_double_quant=True,  
)  
  
tokenizer = AutoTokenizer.from_pretrained(model_id)  
  
# Set padding token  
if tokenizer.pad_token is None:  
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  
  
model = AutoModelForCausalLM.from_pretrained(  
    model_id,   
    trust_remote_code=True,   
    quantization_config=bnb_config,   
    attn_implementation="flash_attention_2",   
    torch_dtype=torch.float16,   
    device_map="cuda:0"  
)  
  
# Resize token embeddings to match the tokenizer's vocabulary size  
model.resize_token_embeddings(len(tokenizer))  
  
def print_gpu_memory_usage():  
    allocated_memory = torch.cuda.memory_allocated()  
    reserved_memory = torch.cuda.memory_reserved()  
    print(f"Allocated GPU memory: {allocated_memory / (1024**3):.2f} GB")  
    print(f"Reserved GPU memory: {reserved_memory / (1024**3):.2f} GB")  
  
print("Before tokenization and model generation:")  
print_gpu_memory_usage()  
  
# Tokenize the prompts with batch processing  
inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)  
  
print("After tokenization:")  
print_gpu_memory_usage()  
  
# Generate outputs for the batch of prompts  
outputs = model.generate(  
    **inputs,   
    max_new_tokens=1000,   
    cache_implementation="quantized",   
    cache_config={"backend": "quanto", "nbits": 4, "q_group_size": 128, "residual_length": 64, "device": model.device}  
)  
  
print("After model generation:")  
print_gpu_memory_usage()  
  
# Decode the outputs for each prompt in the batch  
results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]  
  
# Print the results  
#for i, result in enumerate(results):  
#    print(f"Result for prompt {i+1}: {result}")  


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Before tokenization and model generation:
Allocated GPU memory: 5.31 GB
Reserved GPU memory: 5.40 GB
After tokenization:
Allocated GPU memory: 5.31 GB
Reserved GPU memory: 5.40 GB


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


After model generation:
Allocated GPU memory: 5.32 GB
Reserved GPU memory: 18.94 GB


: 

In [1]:
import torch  
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, BitsAndBytesConfig  
import os  
  
  
# Define multiple prompts for batch processing  
base_prompts = [  
    "The best tomato sauce is",  
    "Artificial intelligence is",  
    "The future of technology is",  
    "Climate change is",  
    "The benefits of exercise are",  
    "Healthy eating includes",  
    "The importance of sleep is",  
    "The universe is",  
    "Machine learning is",  
    "Renewable energy sources are",  
]  
  
# Multiply the prompts by 20 to increase the batch size  
prompts = base_prompts * 20  
  
model_id = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"  
  
bnb_config = BitsAndBytesConfig(  
    load_in_4bit=True,  
    bnb_4bit_quant_type="nf4",  
    bnb_4bit_compute_dtype=torch.float16,  
    bnb_4bit_use_double_quant=True,  
)  
  
tokenizer = AutoTokenizer.from_pretrained(model_id)  
  
# Set padding token  
if tokenizer.pad_token is None:  
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  
  
model = AutoModelForCausalLM.from_pretrained(  
    model_id,   
    trust_remote_code=True,   
    quantization_config=bnb_config,   
    attn_implementation="flash_attention_2",   
    torch_dtype=torch.float16,   
    device_map="cuda:0"  
)  
  
# Resize token embeddings to match the tokenizer's vocabulary size  
model.resize_token_embeddings(len(tokenizer))  
  
def print_gpu_memory_usage():  
    allocated_memory = torch.cuda.memory_allocated()  
    reserved_memory = torch.cuda.memory_reserved()  
    print(f"Allocated GPU memory: {allocated_memory / (1024**3):.2f} GB")  
    print(f"Reserved GPU memory: {reserved_memory / (1024**3):.2f} GB")  
  
print("Before tokenization and model generation:")  
print_gpu_memory_usage()  
  
# Tokenize the prompts with batch processing  
inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)  
  
print("After tokenization:")  
print_gpu_memory_usage()  
  
# Generate outputs for the batch of prompts  
outputs = model.generate(  
    **inputs,   
    max_new_tokens=1000,   
)  
  
print("After model generation:")  
print_gpu_memory_usage()  
  
# Decode the outputs for each prompt in the batch  
results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]  
  
# Print the results  
#for i, result in enumerate(results):  
#    print(f"Result for prompt {i+1}: {result}")  


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Before tokenization and model generation:
Allocated GPU memory: 5.31 GB
Reserved GPU memory: 5.40 GB
After tokenization:
Allocated GPU memory: 5.31 GB
Reserved GPU memory: 5.40 GB
After model generation:
Allocated GPU memory: 5.32 GB
Reserved GPU memory: 55.99 GB
