# Libraries

In [1]:
import torch

from peft import PeftModel
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Meta Llama 3.2 3B Instruct Rare Disease

In [2]:
# Checkpoint directory loaded by every cell below.
# Alternative path previously kept here: "models/Meta-Llama-3.2-3B-Instruct/"
# (presumably the un-tuned base model -- confirm before switching).
model_id = "fine_tuned_models/Llama-3.2-3B-Instruct-Rare-Disease_v3"

In [3]:
# Text-generation pipeline over the checkpoint at `model_id`: weights are
# requested in bfloat16 and device placement is delegated to device_map="auto".
pipe = pipeline(task="text-generation", model=model_id, torch_dtype=torch.bfloat16, device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:1


In [4]:
# System prompt shared by every conversation in this notebook.
# The value (including its leading newline, indentation and trailing spaces)
# is kept exactly as-is so prompts stay identical to earlier runs.
instruction = (
    "\n"
    "    You are a highly rated rare disease classification agent name Chimz. \n"
    "    Provide users all the answers regarding their question.\n"
    "    "
)

In [None]:
# Smoke test: ask the model to introduce itself under the shared system prompt.
messages = [
    dict(role="system", content=instruction),
    dict(role="user", content="Who are you?"),
]
outputs = pipe(messages, max_new_tokens=256)
# [-1] picks the last message of the returned conversation
# (presumably the assistant's reply -- see the chat-pipeline output format).
print(outputs[0]["generated_text"][-1])

In [None]:
# Free-text symptom description sent to the model as a single user turn.
# The question is assembled from adjacent string literals rather than an
# indented triple-quoted block: the latter embeds a leading newline and the
# source-code indentation of every line into the prompt the model receives.
messages1 = [
    {"role": "system", "content": instruction},
    {
        "role": "user",
        "content": (
            "What rare disease might I have if I feel my leg is falling off, "
            "my heart won't stop racing, I am bleeding through my nose, "
            "and my back is killing me"
        ),
    },
]

outputs = pipe(
    messages1,
    max_new_tokens=256,
)
# [-1] picks the last message of the returned conversation
# (presumably the assistant's reply -- see the chat-pipeline output format).
print(outputs[0]["generated_text"][-1])

In [None]:
# Reverse lookup: start from one symptom and ask which rare diseases present with it.
messages2 = [
    dict(role="system", content=instruction),
    dict(role="user", content="What are some rare disease that have bleeding nose as a symptoms"),
]

outputs = pipe(messages2, max_new_tokens=256)
# [-1] picks the last message of the returned conversation
# (presumably the assistant's reply -- see the chat-pipeline output format).
print(outputs[0]["generated_text"][-1])

# Load Model

In [None]:
# Load the tokenizer and causal-LM weights directly (without the pipeline
# wrapper) so generation can be driven by hand in the next cell.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Request bfloat16 to match the pipeline created earlier in this notebook;
# the from_pretrained default of float32 would double the memory needed for
# this second copy of the weights.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

In [None]:
# Interactive single-turn chat: read one question from the user, run it through
# the directly-loaded model, and print only the newly generated reply.
print("What question do you have for me?\n")
# Example question: "What rare diseases affect postmenopausal women with
# type 2 diabetes mellitus and signs of androgen excess?"
user_input = input()

test_message = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": user_input},
]

prompt = tokenizer.apply_chat_template(test_message, tokenize=False, add_generation_prompt=True)

# Fall back to CPU when no GPU is present instead of hard-coding "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The rendered chat template already contains the special tokens, so they must
# not be added a second time (that would duplicate the BOS token).
# No padding is requested: there is only one sequence, and asking for padding
# fails on tokenizers that define no pad token.
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
    add_special_tokens=False,
).to(device)
prompt_length = inputs["input_ids"].shape[-1]

model.to(device)

# Release cached GPU blocks before generation.
torch.cuda.empty_cache()

# Generate exactly once. max_new_tokens bounds the reply itself, whereas
# max_length also counts the prompt and so shrinks the reply for long questions.
# Errors are allowed to propagate: a failed generation leaves nothing to decode.
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=512, num_return_sequences=1)

# Decode only the tokens produced after the prompt. Splitting the full decoded
# text on the word "assistant" would cut off any reply that contains that word.
text = tokenizer.decode(generated_ids[0][prompt_length:], skip_special_tokens=True)

print(text)