# Cluster Config for Llama 70b:
- DBR 16.3 ML: 16.3.x-gpu-ml-scala2.12
- Standard_NC48ads_A100_v4 [A100]
- 440GB MEMORY, 2 GPUs 
- 80GB VRAM x 2 GPUs

In [0]:
%pip install -r requirements.txt

In [0]:
dbutils.library.restartPython()

In [0]:
from utils import print_gpu_utilization, print_available_vram, test_load_gpu, batch_inference

In [0]:
print_gpu_utilization()

In [0]:
print_available_vram()

In [0]:
%run ./00_setup

### Use Accelerate: [Big Model Inference](https://huggingface.co/docs/accelerate/main/en/concept_guides/big_model_inference)


```
# Use direct model checkpoints

import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

with init_empty_weights():
    model = MyModel(...)

model = load_checkpoint_and_dispatch(
    model, checkpoint=checkpoint_file, device_map="auto"
)

input = torch.randn(2,3)
device_type = next(iter(model.parameters())).device.type
input = input.to(device_type)
output = model(input)
```

In [0]:
import torch
import torch.distributed as dist
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import transformers

# Hugging Face Hub id of the checkpoint loaded by the tokenizer and model cells.
model_id = "meta-llama/Llama-3.3-70B-Instruct"
# Padding side for the tokenizer used in batched generation.
padding_side="left"

# Non-sampling generation config (do_sample=False) with max_length=20.
# NOTE(review): gen_config is not passed to any generate() call in the visible cells —
# confirm whether it is still needed.
gen_config = GenerationConfig(
        max_length=20,
        do_sample=False)

Warning: `/databricks/python/lib/python3.11/site-packages/huggingface_hub/file_download.py:651: UserWarning: Not enough free disk space to download the file. The expected file size is: 4664.17 MB. The target location /root/.cache/huggingface/hub/models--meta-llama--Llama-3.3-70B-Instruct/blobs only has 3930.85 MB free disk space.`


In [0]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Reuse the padding_side setting defined next to model_id instead of repeating the literal,
# so the padding side is configured in exactly one place.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side=padding_side)

# Batched tokenization with padding=True needs a pad token; fall back to the EOS token
# when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [0]:
# Load the causal-LM checkpoint named by model_id.
# device_map="auto" delegates weight placement across the available devices to the library,
# and torch_dtype=torch.bfloat16 loads the weights in bfloat16.
# The commented-out options (quantization, tensor-parallel plan) are alternatives left disabled.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=quantization_config,
    # tp_plan="auto",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

In [0]:
print_gpu_utilization()

In [0]:
%sh 

nvidia-smi

In [0]:
# Single-turn chat (system + user message) used as a smoke test for the loaded model.
messages = [
    {"role": "system", "content": "You are a doctor that works in a hospital emergency room."},
    {"role": "user", "content": "What is the difference between Medicare and Medicaid?"},
]

# add_generation_prompt=True makes the chat template end with the assistant header, so
# generate() starts writing the assistant's reply. With False the rendered prompt stops
# after the user turn and the model is not cued to answer.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>" token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

In [0]:
import timeit

# Wall-clock timing of one sampled generation for the chat prompt in input_ids.
start_time = timeit.default_timer()

# Sampling settings for this single-prompt run.
sampling_kwargs = {
    "pad_token_id": tokenizer.pad_token_id,
    "max_new_tokens": 30,
    "eos_token_id": terminators,
    "do_sample": True,
    "temperature": 0.6,
    "top_p": 0.9,
}

with torch.no_grad():  # inference only, no gradient tracking
    outputs = model.generate(input_ids, **sampling_kwargs)

# Keep only the tokens that come after the prompt in the first (and only) sequence.
prompt_len = input_ids.shape[-1]
response = outputs[0, prompt_len:]

elapsed = timeit.default_timer() - start_time
print(f"Time taken: {elapsed} seconds")

In [0]:
# Time the decoding of the generated response tokens back into text.
# The measured interval also includes the print() call itself.
start_time = timeit.default_timer()

print(tokenizer.decode(response, skip_special_tokens=True))

elapsed = timeit.default_timer() - start_time
print(f"Time taken: {elapsed} seconds")

In [0]:
print_gpu_utilization()

In [0]:
# Free memory
# `response` was produced by slicing `outputs`, so it shares the same tensor storage.
# Both names have to be dropped before that storage can actually be released.
del outputs, response
torch.cuda.empty_cache()  # Clear the GPU memory cache
print_gpu_utilization()


### Load the data

In [0]:
# Read the source table into a Spark DataFrame and preview it.
# catalog_name / schema_name / table_name are not assigned in the visible cells —
# presumably they are defined by `%run ./00_setup`; confirm.
medquad_df = spark.read.table(f"{catalog_name}.{schema_name}.{table_name}")
display(medquad_df)

In [0]:
# Distinct values of the "qtype" column, gathered on the driver as a plain Python list.
distinct_qtype_rows = medquad_df.select("qtype").distinct().collect()
qtype_list = [row.qtype for row in distinct_qtype_rows]
print(qtype_list)

## Generate inference 

In [0]:
# Collect the first two questions to the driver as a small trial batch.
batch_questions = [row.question for row in medquad_df.select("question").limit(2).collect()]

In [0]:
# Wrap each question in a classification instruction. {qtype_list} is interpolated as the
# Python list's repr, so the prompt contains the brackets and quotes of the list literal.
updated_batch_questions = [f"Classify the following question: {question} as one of the categories: {qtype_list} without explanation. Return only the category name and output less than 15 tokens." for question in batch_questions]

In [0]:
# Tokenize all prompts as one padded PyTorch batch (the tokenizer was created with
# padding_side="left").
# NOTE(review): unlike the chat example earlier in the notebook, these prompts are tokenized
# as plain text rather than through apply_chat_template — confirm this is intended.
batch_encoding = tokenizer(updated_batch_questions, return_tensors='pt', padding=True)

In [0]:
# Run generation for the tokenized batch through the batch_inference helper imported from
# utils. Its body is not visible here; presumably it returns token ids, since the result is
# passed to tokenizer.batch_decode in a later cell.
generated_ids = batch_inference(batch_encoding, model, tokenizer, terminators)

In [0]:
# Time the decoding of the whole batch of generated ids into strings
# (special tokens are dropped).
start_time = timeit.default_timer()

generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

elapsed = timeit.default_timer() - start_time
print(f"Time taken: {elapsed} seconds")

In [0]:
print(len(generated_texts))

In [0]:
print(generated_texts)

In [0]:
print_gpu_utilization()

In [0]:
# Free memory: drop the reference to the generated ids, clear PyTorch's CUDA cache,
# then print GPU utilization again to compare with the reading taken before cleanup.
del generated_ids  # Delete the outputs tensor
torch.cuda.empty_cache()  # Clear the GPU memory cache
print_gpu_utilization()

## Continuous batch

In [0]:
# Collect up to 1000 questions to the driver for the micro-batched run
# (this rebinds batch_questions from the earlier two-row trial).
batch_questions = [row.question for row in medquad_df.select("question").limit(1000).collect()]

In [0]:
# Same classification prompt as the two-row trial, applied to the larger question list.
# {qtype_list} is interpolated as the Python list's repr.
updated_batch_questions = [f"Classify the following question: {question} as one of the categories: {qtype_list} without explanation. Return only the category name and output less than 15 tokens." for question in batch_questions]

In [0]:
def micro_batch_inference(input_list, tokenizer, terminators, batch_size: int = 64, *, model=None):
    """Generate text for ``input_list`` in fixed-size micro-batches.

    Each slice of ``batch_size`` prompts is tokenized with padding, moved to the
    model's device, passed to ``model.generate`` with sampling enabled, and
    decoded back to text. Progress is printed after every micro-batch.

    Args:
        input_list: Sequence of prompt strings.
        tokenizer: Tokenizer used both to encode the prompts and to decode the
            generated ids; it must expose ``pad_token_id``.
        terminators: Token id(s) forwarded to ``generate`` as ``eos_token_id``.
        batch_size: Number of prompts per micro-batch; must be positive.
        model: Model whose ``generate`` method is called. When omitted, the
            module-level ``model`` variable is used, which is what this
            function relied on before the parameter existed.

    Returns:
        A tuple ``(output_list, batch_generation_elapsed)``: the decoded strings
        in input order and the total wall-clock time in seconds. Each string is
        the decoding of the full sequence returned by ``generate``; no prompt
        tokens are sliced off here.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        NameError: If ``model`` is not passed and no global ``model`` exists.
    """
    import timeit

    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if model is None:
        # Backward compatibility: earlier callers relied on the notebook-global model.
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    start_time = timeit.default_timer()
    output_list = []

    for start in range(0, len(input_list), batch_size):
        # Slicing clamps at the end of the list, so the final (possibly shorter)
        # micro-batch needs no special case.
        batch = input_list[start:start + batch_size]

        print(f"Generating inferences on batch {start} to {start + len(batch) - 1}...")

        # Put the encoded inputs on the model's device, as the single-prompt example does.
        batch_encoding = tokenizer(batch, return_tensors='pt', padding=True).to(model.device)

        with torch.no_grad():
            generated_ids = model.generate(
                **batch_encoding,
                pad_token_id=tokenizer.pad_token_id,
                max_new_tokens=20,
                eos_token_id=terminators,
                do_sample=True,
                temperature=0.6,
                top_p=0.9,
            )

        output_list.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

        print(f"Items completed: {len(output_list)}")

        # Free memory between micro-batches.
        del generated_ids  # Delete the outputs tensor
        torch.cuda.empty_cache()  # Clear the GPU memory cache

    batch_generation_elapsed = timeit.default_timer() - start_time
    print(f"Time taken: {batch_generation_elapsed} seconds")

    return output_list, batch_generation_elapsed

In [0]:
# Run every prompt through the model in micro-batches of 100, keeping the decoded outputs
# and the total elapsed time.
output_list, batch_generation_elapsed = micro_batch_inference(updated_batch_questions, tokenizer, terminators, 100)

In [0]:
# Number of decoded outputs; as the cell's last expression it is shown as the cell result.
len(output_list)

In [0]:
# Print every decoded output on its own line for a quick visual check.
for item in output_list:
  print(item)

In [0]:
# Wrap each output in a one-key dict so display() receives a list of records
# with a single 'value' field.
display([{'value': item} for item in output_list])

## Free up resources

In [0]:
print_gpu_utilization()

In [0]:
%restart_python