# In [None]:
# %%
"""
# HuluMed Multi-Modal Test Suite
Test HuluMed model's multimodal capabilities in Jupyter Notebook
"""

# %%
# Import required libraries
import os
#os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
from pathlib import Path
from PIL import Image
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
import sys

# Import the image / video / 3D-volume loading helpers from vLLM's hulu_utils module
from vllm.model_executor.models.hulu_utils import load_images, load_video, load_3d

# %%
# Checkpoint identifier and demo inputs referenced by the model-loading and
# test cells of this notebook.
MODEL_PATH = "ZJU-AI4H/Hulu-Med-7B"
IMAGE_PATH = "./demo.jpg"
VIDEO_PATH = "./demo.mp4"
NII_PATH = "./demo.nii"

# Set the vLLM worker multiprocessing-method variable to "spawn" before the
# engine is constructed in the model-loading cell.
os.environ.update({"VLLM_WORKER_MULTIPROC_METHOD": "spawn"})

# Echo the active configuration, one entry per line.
print(
    "‚úÖ Environment configured",
    f"üìÅ Model path: {MODEL_PATH}",
    f"üé¨ Video path: {VIDEO_PATH}",
    f"üñºÔ∏è  Image path: {IMAGE_PATH}",
    f"üè• NII path: {NII_PATH}",
    sep="\n",
)

# %%
# Build the vLLM engine and the processor used for chat templating.
# Run this cell once; the test cells reuse `llm` and `processor`.
print("üöÄ Loading model...")

engine_options = {
    "model": MODEL_PATH,
    "trust_remote_code": True,
    "dtype": "bfloat16",
    "enforce_eager": True,
    "tensor_parallel_size": 1,
    "gpu_memory_utilization": 0.9,
    # Per-prompt limit on image inputs; video frames and 3D slices are also
    # submitted under the "image" key by the test cells.
    "limit_mm_per_prompt": {"image": 512},
}
llm = LLM(**engine_options)

processor = AutoProcessor.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
)
print("‚úÖ Model loaded successfully!")

# %%
# Test 1: ask the model to describe a single image loaded from IMAGE_PATH.
rule = "=" * 80
print(rule, "TEST 1: Single Image", rule, sep="\n")

if not os.path.exists(IMAGE_PATH):
    # Missing demo file: report it and skip the test.
    print(f"‚ö†Ô∏è  Image not found: {IMAGE_PATH}")
else:
    images = load_images(IMAGE_PATH)

    # One "<image>" placeholder precedes the question text.
    conversation = [
        {"role": "user", "content": "<image>Describe this image."},
    ]
    prompt = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    vllm_input = dict(prompt=prompt, multi_modal_data=dict(image=images))

    # temperature=0.0, reply capped at 256 tokens.
    sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
    outputs = llm.generate([vllm_input], sampling_params)

    print(f"üì∏ Input: Single image from {IMAGE_PATH}")
    print("üí¨ Question: Describe this image.")
    answer = outputs[0].outputs[0].text
    print(f"ü§ñ Answer: {answer}")

# %%
# Test 2: plain text prompt with no multimodal payload attached.
rule = "=" * 80
print(rule, "TEST 2: Text Only", rule, sep="\n")

conversation = [
    {"role": "user", "content": "What is the capital of France?"},
]
prompt = processor.apply_chat_template(
    conversation,
    tokenize=False,
    add_generation_prompt=True,
)

# Only the templated prompt is sent; there is no "multi_modal_data" entry.
vllm_input = dict(prompt=prompt)

sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
outputs = llm.generate([vllm_input], sampling_params)

print("üí¨ Question: What is the capital of France?")
answer = outputs[0].outputs[0].text
print(f"ü§ñ Answer: {answer}")

# %%
# Test 3: send two images in one prompt and ask the model to compare them.
rule = "=" * 80
print(rule, "TEST 3: Multiple Images", rule, sep="\n")

if not os.path.exists(IMAGE_PATH):
    # Missing demo file: report it and skip the test.
    print(f"‚ö†Ô∏è  Image not found: {IMAGE_PATH}")
else:
    # The same file is loaded once and repeated to stand in for two images.
    images = load_images(IMAGE_PATH) * 2

    # Two "<image>" placeholders, matching the two images in the example.
    conversation = [
        {"role": "user", "content": "<image><image>Compare these two images."},
    ]
    prompt = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    vllm_input = dict(prompt=prompt, multi_modal_data=dict(image=images))

    sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
    outputs = llm.generate([vllm_input], sampling_params)

    print(f"üì∏ Input: {len(images)} images")
    print("üí¨ Question: Compare these two images.")
    answer = outputs[0].outputs[0].text
    print(f"ü§ñ Answer: {answer}")

# %%
# Test 4: 3D medical volume test
# load_3d is asked for 32 slices along axis 2 of the file at NII_PATH; the
# returned slices are sent to the model under the "image" key, with one
# "<image>" placeholder per slice in the prompt.
print("="*80)
print("TEST 4: 3D Medical Volume")
print("="*80)

if os.path.exists(NII_PATH):
    slices = load_3d(NII_PATH, num_slices=32, axis=2)
    print(f"üìä Loaded {len(slices)} slices from 3D volume")

    # Keep the question in one place so the text echoed below is exactly the
    # text sent to the model.
    question = "Describe this 3D scan."
    conversation = [{"role": "user", "content": "<image>" * len(slices) + question}]
    prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

    vllm_input = {
        "prompt": prompt,
        "multi_modal_data": {"image": slices}
    }

    # Longer reply budget (512 tokens) than the single-image tests.
    sampling_params = SamplingParams(temperature=0.0, max_tokens=512)
    outputs = llm.generate([vllm_input], sampling_params)

    print(f"üè• Input: {len(slices)} slices from {NII_PATH}")
    print(f"üí¨ Question: {question}")
    print(f"ü§ñ Answer: {outputs[0].outputs[0].text}")

    # Optional: Display first slice
    #from IPython.display import display
    #display(slices[0])
else:
    # Missing demo file: report it and skip the test.
    print(f"‚ö†Ô∏è  NII file not found: {NII_PATH}")

# %%
# Test 5: load frames from the video at VIDEO_PATH and ask for a description.
rule = "=" * 80
print(rule, "TEST 5: Video", rule, sep="\n")

if not os.path.exists(VIDEO_PATH):
    # Missing demo file: report it and skip the test.
    print(f"‚ö†Ô∏è  Video not found: {VIDEO_PATH}")
else:
    frames = load_video(VIDEO_PATH, fps=1.0, max_frames=64)
    print(f"üé¨ Loaded {len(frames)} frames from video")

    # One "<image>" placeholder per loaded frame, followed by the question.
    placeholders = "<image>" * len(frames)
    conversation = [
        {"role": "user", "content": placeholders + "Describe this video."},
    ]
    prompt = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Frames are submitted under the "image" key, like ordinary images.
    vllm_input = dict(prompt=prompt, multi_modal_data=dict(image=frames))

    sampling_params = SamplingParams(temperature=0.0, max_tokens=512)
    outputs = llm.generate([vllm_input], sampling_params)

    print(f"üé• Input: {len(frames)} frames from {VIDEO_PATH}")
    print("üí¨ Question: Describe this video.")
    answer = outputs[0].outputs[0].text
    print(f"ü§ñ Answer: {answer}")

    # Optional: Display first frame
    #from IPython.display import display
    #display(frames[0])

# Closing banner followed by usage tips for the notebook.
print("\n" + "="*80)
print("üéâ All tests completed!")
print("="*80)
# The tips name things this notebook actually defines: the test cells depend
# on `llm` and `processor` from the model-loading cell, questions live inline
# in each test's `conversation`, and inputs come from the *_PATH constants.
print("""
üí° Tips:
- Run the model-loading cell first; after that each test cell can be run independently
- Edit the question text in a test's conversation and the *_PATH constants for custom tests
- Adjust sampling_params to change generation strategy
- Use display() to show images
""")

