In [0]:
%pip install --upgrade mlflow
%pip install --upgrade databricks-sdk
%pip install --upgrade accelerate
%pip install --upgrade bitsandbytes
%pip install --upgrade nvidia-ml-py3
%pip install --upgrade transformers

# NOTE(review): every install above uses an unpinned `--upgrade`, so the resolved versions
# can differ between runs; consider pinning exact versions for reproducibility.
# Restart the Python process after the installs (presumably so the upgraded packages are the ones imported below).
dbutils.library.restartPython()

In [0]:
from pynvml import *

def print_gpu_utilization():
    """Print used and free memory, in MB, for every GPU that NVML reports.

    NVML is initialised for the duration of the call and shut down again
    afterwards (also when a query raises), so repeated calls from different
    cells do not leave unbalanced `nvmlInit()` calls behind.
    """
    nvmlInit()
    try:
        for gpu_index in range(nvmlDeviceGetCount()):
            handle = nvmlDeviceGetHandleByIndex(gpu_index)
            info = nvmlDeviceGetMemoryInfo(handle)
            # NVML reports bytes; integer-divide down to whole MB.
            print(f"GPU {gpu_index} memory occupied: {info.used//1024**2} MB, available: {info.free//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with an nvmlShutdown().
        nvmlShutdown()

def print_available_vram():
    """Print the free memory, in MB, for every GPU that NVML reports.

    NVML is initialised for the duration of the call and shut down again
    afterwards (also when a query raises), so each `nvmlInit()` is matched by
    an `nvmlShutdown()`.
    """
    nvmlInit()
    try:
        for gpu_index in range(nvmlDeviceGetCount()):
            handle = nvmlDeviceGetHandleByIndex(gpu_index)
            info = nvmlDeviceGetMemoryInfo(handle)
            # NVML reports bytes; integer-divide down to whole MB.
            print(f"GPU {gpu_index} available VRAM: {info.free//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with an nvmlShutdown().
        nvmlShutdown()

In [0]:
# Baseline reading of used/free GPU memory before any model is loaded.
print_gpu_utilization()

In [0]:
# Free VRAM per GPU before any model is loaded.
print_available_vram()

### Log in to Hugging Face (required because the Llama 3 model is gated)

In [0]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub; the token is read from a Databricks
# secret scope instead of being hard-coded in the notebook.
login(token=dbutils.secrets.get('william_smith_secrets', 'HF_KEY'))

### Load in 32 bit

In [0]:
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="cuda")

### Load in 8-bit because the V100 has only 16 GB of VRAM

In [0]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Hub id of the gated Llama 3 instruct checkpoint used throughout the notebook.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# Request 8-bit weight loading instead of the default precision.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Pad on the left so that, in padded batches, each prompt's real tokens sit
# directly before the positions where new tokens are generated.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the quantized model and place it on the CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="cuda",
)

In [0]:
# GPU memory after loading the 8-bit model; compare with the reading taken before the load.
print_gpu_utilization()


### Load the data

In [0]:
# Read the MedQuAD Q&A table through the notebook-provided Spark session and preview it.
medquad_df = spark.read.table("will_smith.datasets.medquad_qna")
display(medquad_df)

In [0]:
# Distinct question-type labels, used as the candidate categories in the prompts.
# Rows are collected straight from the DataFrame: the previous `.rdd.map(...)` detour
# produced the same list but relies on the RDD API, which is not available on
# Spark Connect / shared-access clusters.
qtype_list = [row.qtype for row in medquad_df.select("qtype").distinct().collect()]

In [0]:
# Show the candidate categories that get interpolated into the prompts.
print(qtype_list)

## Generate inference 

In [0]:
# Sample question used for the single-prompt experiments.
# NOTE(review): index [1] selects the SECOND of the two collected rows although the
# variable is called "first"; there is also no orderBy, so which rows `limit(2)`
# returns is up to Spark — confirm this is the intended example.
first_question_text = medquad_df.select("question").limit(2).collect()[1]["question"]

In [0]:
# Classification prompt for the sample question. `qtype_list` is interpolated as-is,
# so the categories appear in the prompt as a Python list literal.
user_input = f"Classify the following question: {first_question_text} as one of the categories: {qtype_list} without explanation. Return only the category name and output less than 15 tokens."

In [0]:
# Inspect the fully rendered prompt before sending it to the model.
print(user_input)

In [0]:
# Chat-formatted request: a system persona plus the classification prompt.
messages = [
    {"role": "system", "content": "You are a doctor that works in a hospital emergency room."},
    {"role": "user", "content": user_input},
]

# add_generation_prompt=True makes the chat template end with the assistant header,
# so generation starts at the beginning of the assistant's reply. With False the
# tokenized prompt stops after the user turn and the model is not cued to answer.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# Stop generation on either the tokenizer's EOS token or Llama 3's end-of-turn token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

In [0]:
import timeit

start_time = timeit.default_timer()

with torch.no_grad():
    outputs = model.generate(
        input_ids,
        pad_token_id = tokenizer.pad_token_id,
        # Budget new tokens from the LONGEST label. `len(max(qtype_list))` measured the
        # lexicographically largest label instead, which can be shorter than the longest
        # one. The character count is only a rough, generous proxy for the token count;
        # +5 leaves some slack.
        max_new_tokens=max(len(label) for label in qtype_list) + 5,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

# Keep only the newly generated tokens: drop the prompt tokens at the front of the sequence.
response = outputs[0][input_ids.shape[-1]:]

elapsed = timeit.default_timer() - start_time
print(f"Time taken: {elapsed} seconds")

In [0]:
# Time the decoding step separately from generation.
start_time = timeit.default_timer()

# Decode the generated token ids to text, leaving special tokens out.
print(tokenizer.decode(response, skip_special_tokens=True))

elapsed = timeit.default_timer() - start_time
print(f"Time taken: {elapsed} seconds")

## Now we do batch inference with low-level generate

In [0]:
# Target device for the tokenized batches.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Minimal batch: the same prompt twice.
texts = [user_input, user_input]

# padding=True keeps this cell working when the prompts differ in length; without it
# the tokenizer cannot stack unequal sequences into one tensor. For the two identical
# prompts above it produces the same tensors as before.
encoding = tokenizer(texts, return_tensors='pt', padding=True).to(device)

In [0]:
def batch_inference(encoding, tokenizer, terminators, max_new_tokens: int = 20):
    """Run one sampled `generate` call over an already-tokenized batch and time it.

    Uses the notebook-level globals `model` and `torch`.

    Args:
        encoding: Mapping of tokenizer outputs; unpacked as keyword arguments
            into `model.generate`.
        tokenizer: Tokenizer whose `pad_token_id` is forwarded to `generate`.
        terminators: Token id(s) forwarded as `eos_token_id`.
        max_new_tokens: Upper bound on newly generated tokens per sequence.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        The value returned by `model.generate` for the batch.
    """
    import timeit

    start_time = timeit.default_timer()

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated_ids = model.generate(
            **encoding,
            pad_token_id=tokenizer.pad_token_id,
            max_new_tokens=max_new_tokens,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )

    batch_generation_elapsed = timeit.default_timer() - start_time
    print(f"Time taken: {batch_generation_elapsed} seconds")

    return generated_ids


In [0]:
# Generate for the two-prompt batch; the helper prints the elapsed time.
generated_ids = batch_inference(encoding, tokenizer, terminators)

In [0]:
# Decode every sequence of the batch to text, leaving special tokens out.
# NOTE(review): unlike the single-prompt cell, the prompt tokens are not sliced off
# here, so the decoded strings presumably start with the prompt text — confirm.
generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [0]:
# Print each decoded sequence on its own line.
for item in generated_texts: 
  print(item)

## Timing the computation across 100 records using low-level generate

In [0]:
# Same text as `user_input`, rebuilt from the sample question.
# NOTE(review): `prompt` is not referenced by any later cell visible in this notebook;
# it looks like a leftover — confirm before relying on it.
prompt = f"Classify the following question: {first_question_text} as one of the categories: {qtype_list} without explanation. Return only the category name and output less than 15 tokens."

In [0]:
# Pull 100 questions to the driver as plain Python strings.
# NOTE(review): there is no orderBy, so which 100 rows `limit` returns is up to Spark.
batch_questions = [row.question for row in medquad_df.select("question").limit(100).collect()]

In [0]:
# Wrap every question in the same classification prompt used for the single-prompt test.
# NOTE(review): these are raw prompts; the chat template used for the single-prompt
# run is not applied here.
updated_batch_questions = [f"Classify the following question: {question} as one of the categories: {qtype_list} without explanation. Return only the category name and output less than 15 tokens." for question in batch_questions]

In [0]:
# Tokenize all prompts as one padded batch and move the tensors to `device`.
batch_encoding = tokenizer(updated_batch_questions, return_tensors='pt', padding=True).to(device)

In [0]:
# Generate for the whole batch in a single call; the helper prints the elapsed time.
generated_ids = batch_inference(batch_encoding, tokenizer, terminators)

In [0]:
# Time the decoding of the whole batch separately from generation.
start_time = timeit.default_timer()

# Decode every sequence to text, leaving special tokens out.
generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

elapsed = timeit.default_timer() - start_time
print(f"Time taken: {elapsed} seconds")

In [0]:
# Sanity check: number of decoded outputs.
print(len(generated_texts))

In [0]:
# Dump all decoded outputs as one Python list.
print(generated_texts)

In [0]:
# NOTE: the `%restart_python` that used to be in this cell was removed on purpose.
# Restarting the interpreter here discards everything defined above (model, tokenizer,
# medquad_df, qtype_list, device, generated_ids, torch, the helper functions, ...),
# yet the cells below still use those names, so a top-to-bottom run failed with
# NameError right after the restart. GPU memory is released explicitly in the cells
# that follow, and the restart at the very end of the notebook is kept.

In [0]:
# GPU memory after the 100-record batch run, before anything is freed.
print_gpu_utilization()

In [0]:
# Free memory held by the last batch's output, then report GPU usage again.
# Running this cell a second time raises NameError, because `generated_ids`
# no longer exists after the `del`.
del generated_ids  # Drop the reference to the generated-ids tensor
torch.cuda.empty_cache()  # Clear the GPU memory cache
print_gpu_utilization()

## Continuous batch inference (micro-batches)

In [0]:
# Pull 1000 questions to the driver; this rebinds `batch_questions` from the 100-record run.
# NOTE(review): there is no orderBy, so which rows `limit` returns is up to Spark.
batch_questions = [row.question for row in medquad_df.select("question").limit(1000).collect()]

In [0]:
# Wrap every question in the classification prompt. This repeats the prompt-building
# expression from the 100-record section; a small helper would remove the duplication.
updated_batch_questions = [f"Classify the following question: {question} as one of the categories: {qtype_list} without explanation. Return only the category name and output less than 15 tokens." for question in batch_questions]

In [0]:
def micro_batch_inference(input_list, tokenizer, terminators, batch_size: int = 64, max_new_tokens: int = 20):
    """Generate text for `input_list` in consecutive micro-batches.

    Each slice of at most `batch_size` prompts is tokenized with padding, moved
    to the notebook-level `device`, passed to the notebook-level `model` for
    sampled generation, and decoded. Decoded strings are accumulated in input
    order. Progress and the total elapsed time are printed.

    Args:
        input_list: Sequence of prompt strings.
        tokenizer: Callable tokenizer that also provides `pad_token_id` and
            `batch_decode`.
        terminators: Token id(s) forwarded to `generate` as `eos_token_id`.
        batch_size: Maximum number of prompts per `generate` call; must be >= 1.
        max_new_tokens: Upper bound on newly generated tokens per sequence.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        Tuple `(output_list, batch_generation_elapsed)`: the decoded strings and
        the wall-clock seconds spent in the whole loop.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    import timeit

    # A zero step already raised ValueError inside range(); a negative one used to
    # produce an empty loop and silently return no results.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    start_time = timeit.default_timer()
    output_list = []
    total = len(input_list)

    for start in range(0, total, batch_size):
        # Clamp the end so the final, possibly shorter, batch is sliced and
        # reported correctly without a special case.
        stop = min(start + batch_size, total)
        print(f"Generating inferences on batch {start} to {stop - 1}...")

        batch_encoding = tokenizer(input_list[start:stop], return_tensors='pt', padding=True).to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            generated_ids = model.generate(
                **batch_encoding,
                pad_token_id=tokenizer.pad_token_id,
                max_new_tokens=max_new_tokens,
                eos_token_id=terminators,
                do_sample=True,
                temperature=0.6,
                top_p=0.9,
            )

        output_list.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

        print(f"Items completed: {len(output_list)}")

        # Free memory: drop this batch's tensors before the next iteration.
        del generated_ids, batch_encoding
        torch.cuda.empty_cache()  # Clear the GPU memory cache

    batch_generation_elapsed = timeit.default_timer() - start_time
    print(f"Time taken: {batch_generation_elapsed} seconds")

    return output_list, batch_generation_elapsed

In [0]:
# Run the prompts through the model in micro-batches of 100 and keep both the
# decoded outputs and the total elapsed time.
output_list, batch_generation_elapsed = micro_batch_inference(updated_batch_questions, tokenizer, terminators, 100)

In [0]:
# Sanity check: number of decoded outputs (shown as the cell result).
(len(output_list))

In [0]:
# Print each decoded output on its own line.
for item in output_list:
  print(item)

## Free up resources

In [0]:
# Final GPU memory reading before the interpreter is restarted.
print_gpu_utilization()

In [0]:
%restart_python
# The restart above is the last step of the notebook; per the section heading it is meant to free up resources.