In [1]:
# Install all required packages
!pip install git+https://github.com/openai/CLIP.git
!pip install -q -U llama-index chromadb llama-index-vector-stores-chroma llama-index-embeddings-clip llama-index-multi-modal-llms-openai-like
!pip install -q -U bitsandbytes accelerate peft
!pip install vllm

# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-fb1fl_s4
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-fb1fl_s4
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=6219316d6045ff58c5bfc54dc730a201be602fd5f6716904458abe74b58c8814
  Stored in directory: /tmp/pip-ephem-wheel-cache-1e282_hd/wheels/35/3e/df/3d24cbfb3b6a06f17

Mounted at /content/drive


In [2]:
import subprocess
import sys
import time
import urllib.request

# --- Configuration ---
# IMPORTANT: Update this path to your fine-tuned model folder
MODEL_PATH = "/content/drive/MyDrive/APOD/llava-apod-merged"
LOG_FILE = "vllm_server.log"
# Health endpoint polled to detect when the server accepts requests.
SERVER_HEALTH_URL = "http://localhost:8000/health"
STARTUP_TIMEOUT_S = 600   # give up if the server is not healthy after this long
POLL_INTERVAL_S = 5       # pause between two health checks

# Command to start the VLLM server.
# sys.executable guarantees the server runs under the same interpreter (and
# therefore the same installed packages) as this kernel.
command = [
    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
    "--model", MODEL_PATH,
    "--trust-remote-code",
    "--gpu-memory-utilization", "0.7",
]

print("Starting LLaVA VLLM server in the background...")
# Open a log file to capture the server's output. The child process keeps its
# own handle on the file, so closing ours at the end of the `with` is safe.
with open(LOG_FILE, "w") as log:
    # Use subprocess.Popen to run the command as a background process
    server_process = subprocess.Popen(command, stdout=log, stderr=subprocess.STDOUT)

print(f"Server started with PID: {server_process.pid}. Check '{LOG_FILE}' for status.")

# Wait until the server really answers instead of sleeping for a fixed time:
# stop early when it is ready, fail loudly when it died or never came up.
startup_deadline = time.monotonic() + STARTUP_TIMEOUT_S
while True:
    exit_code = server_process.poll()
    if exit_code is not None:
        raise RuntimeError(
            f"VLLM server exited with code {exit_code} during startup. "
            f"Check '{LOG_FILE}' for details."
        )
    try:
        with urllib.request.urlopen(SERVER_HEALTH_URL, timeout=5) as health_response:
            if health_response.status == 200:
                break
    except OSError:
        # URLError / connection refused / timeout: the server is not listening yet.
        pass
    if time.monotonic() >= startup_deadline:
        raise TimeoutError(
            f"VLLM server was not ready after {STARTUP_TIMEOUT_S} seconds. "
            f"Check '{LOG_FILE}' for details."
        )
    time.sleep(POLL_INTERVAL_S)

print("Server is ready.")

Starting LLaVA VLLM server in the background...
Server started with PID: 1852. Check 'vllm_server.log' for status.
Server should be ready.


In [4]:
import chromadb, os, base64, requests
from llama_index.embeddings.clip import ClipEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core import StorageContext, PromptTemplate, QueryBundle, Settings, VectorStoreIndex
from llama_index.multi_modal_llms.openai_like import OpenAILikeMultiModal
from llama_index.core.query_engine import SimpleMultiModalQueryEngine
from llama_index.core.indices.multi_modal.retriever import MultiModalVectorIndexRetriever
from llama_index.core.schema import ImageNode, ImageDocument

# --- Configuration ---
# Local LLaVA server: the model identifier sent with each request and the
# base URL of its OpenAI-style API.
LLAVA_MODEL_NAME = "/content/drive/MyDrive/APOD/llava-apod-merged"
LLAVA_API_BASE = "http://localhost:8000/v1"
# Persistent Chroma database on Drive and the collection opened from it.
COLLECTION_NAME = "multimodal_collection"
CHROMA_DB_PATH = "/content/drive/MyDrive/APOD/multimodal_db"

def query_llava_server(prompt_text: str, image_path: str, timeout: float = 120.0) -> str:
    """Send a prompt and a base64-encoded image directly to the VLLM server.

    The image is read from disk, embedded in the request as a ``data:`` URL
    and posted to ``{LLAVA_API_BASE}/chat/completions``.

    Args:
        prompt_text: Text part of the user message.
        image_path: Path of the image file to attach.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The content of the first returned choice, or a string starting with
        ``"Error"`` when the image cannot be read or the request fails. The
        function reports failures through its return value and does not raise.
    """
    # Local import: mimetypes is not part of this cell's import block.
    import mimetypes

    print("\n--- Sending request to local LLaVA server ---")
    if not image_path:
        return "Error: No image path provided"

    try:
        with open(image_path, "rb") as img_file:
            encoded_image = base64.b64encode(img_file.read()).decode('utf-8')
    except FileNotFoundError:
        return f"Error: Image file not found at {image_path}"
    except OSError as e:
        # e.g. the path is a directory or the file is not readable
        return f"Error: Could not read image file at {image_path}: {e}"

    # Label the data URL with the file's real type; fall back to JPEG when the
    # extension is unknown or does not denote an image.
    mime_type = mimetypes.guess_type(image_path)[0]
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    headers = {"Content-Type": "application/json"}
    payload = {
        "model": LLAVA_MODEL_NAME,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"}
                    }
                ]
            }
        ],
        "max_tokens": 500
    }

    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.post(
            f"{LLAVA_API_BASE}/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        # Deliberately broad: transport errors, HTTP errors and malformed
        # responses are all reported to the caller as an error string.
        return f"Error calling LLaVA server: {e}"

if __name__ == "__main__":
    # --- Set up the Retriever ---
    print("Initializing retriever...")
    # Register CLIP once as the global embedding model.
    Settings.embed_model = ClipEmbedding()
    db = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    chroma_collection = db.get_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
    retriever = index.as_retriever(similarity_top_k=5)  # Retrieve top 5 overall nodes

    # Set up the LLaVA multimodal LLM (pointing to your local server).
    # Model name and endpoint come from the configuration constants so they
    # cannot drift apart from the values used by query_llava_server().
    llava_mm_llm = OpenAILikeMultiModal(
        model=LLAVA_MODEL_NAME,
        api_key="EMPTY",  # LLaVA server does not require a real key
        api_base=LLAVA_API_BASE,  # Your LLaVA server endpoint
        max_new_tokens=256,
    )

    # Build the query engine for multimodal retrieval.
    # Only its retrieve() step is used below; the final answer is requested
    # through query_llava_server().
    query_engine = SimpleMultiModalQueryEngine(
        retriever=retriever,
        multi_modal_llm=llava_mm_llm,
    )

    # Construct and combine the parts of a prompt
    qa_text = "What is this image?"
    # Folder that holds the APOD images on Drive.
    image_dir = "/content/drive/MyDrive/APOD/DATA/IMAGES"
    # Image supplied with the question; set to None to fall back to the best
    # retrieved image instead.
    image_path = "/content/drive/MyDrive/APOD/DATA/IMAGES/2023-03-03.jpg"

    query = QueryBundle(
        query_str=qa_text,
        image_path=image_path,
    )

    # Retrieve the context for the VLLM to use.
    nodes = query_engine.retrieve(query)
    for node in nodes:
        print("Node type:", type(node.node))
        print("Node content:", node.node.get_content())
        print("Node metadata:", node.node.metadata)

    # Separate image and text retrievals (image entries are tagged with
    # source == 'apod_image' in their metadata).
    image_nodes = [node for node in nodes if node.metadata.get('source') == 'apod_image']
    text_nodes = [node for node in nodes if node.metadata.get('source') != 'apod_image']

    # The model sees the supplied image when there is one; otherwise the most
    # relevant retrieved image (first in the list) is used.
    top_image_path = None
    if image_path is not None:
        top_image_path = image_path
    elif image_nodes:
        retrieved_file_path = image_nodes[0].metadata.get('file_path')
        if retrieved_file_path:
            # Stored paths are relative to another layout; keep only the file
            # name and resolve it inside the Drive image folder.
            top_image_path = os.path.join(
                image_dir,
                os.path.basename(os.path.normpath(retrieved_file_path)),
            )

    if top_image_path is None:
        # Fail with a clear message instead of a NameError further down.
        raise RuntimeError(
            "No image available: no query image was supplied and retrieval "
            "returned no usable image node."
        )

    context_str = "\n".join([node.get_content() for node in text_nodes])
    if not context_str.strip():
        # Happens when every retrieved node is an image entry, so the prompt
        # below carries no textual context.
        print("Warning: no text context was retrieved; the prompt context will be empty.")

    # --- Call the LLM with the best image and all retrieved text ---
    print(f"Found best image: {top_image_path}")
    print(f"Using context: \"{context_str[:100]}...\"")

    prompt = (
        "Context information is below.\n"
        "---------------------\n"
        f"{context_str}\n"
        "---------------------\n"
        "Given the context information and prior knowledge, "
        "answer the query.\n"
        f"Query:{qa_text} .\n"
        "Answer: "
    )

    final_response = query_llava_server(prompt_text=prompt, image_path=top_image_path)

    print("\n--- Final LLaVA Response ---")
    print(final_response)

Initializing retriever...


  llava_mm_llm = OpenAILikeMultiModal(


Node type: <class 'llama_index.core.schema.TextNode'>
Node content: 
Node metadata: {'source': 'apod_image', 'file_path': 'DATA/APOD_DATA/IMAGES/2020-08-10.jpg'}
Node type: <class 'llama_index.core.schema.TextNode'>
Node content: 
Node metadata: {'source': 'apod_image', 'file_path': 'DATA/APOD_DATA/IMAGES/2021-08-06.jpg'}
Node type: <class 'llama_index.core.schema.TextNode'>
Node content: 
Node metadata: {'source': 'apod_image', 'file_path': 'DATA/APOD_DATA/IMAGES/2022-01-05.jpg'}
Node type: <class 'llama_index.core.schema.TextNode'>
Node content: 
Node metadata: {'source': 'apod_image', 'file_path': 'DATA/APOD_DATA/IMAGES/2023-03-03.jpg'}
Node type: <class 'llama_index.core.schema.TextNode'>
Node content: 
Node metadata: {'source': 'apod_image', 'file_path': 'DATA/APOD_DATA/IMAGES/2025-03-16.jpg'}
Found best image: /content/drive/MyDrive/APOD/DATA/IMAGES/2023-03-03.jpg
Using context: "..."

--- Sending request to local LLaVA server ---

--- Final LLaVA Response ---
 It is a space phot