In [None]:
# Install (or upgrade, -U) the Python packages this notebook relies on; -q keeps pip quiet.
# NOTE(review): -U pulls the latest releases, not the tested versions listed below —
# pin them if the notebook must be reproducible.
!pip install -U -q byaldi pdf2image qwen-vl-utils transformers
# Tested with byaldi==0.0.4, pdf2image==1.17.0, qwen-vl-utils==0.0.8, transformers==4.45.0

In [None]:
# Install the poppler-utils system package (-y answers the confirmation prompt).
# NOTE(review): presumably required by pdf2image for PDF rendering — confirm.
!sudo apt-get install -y poppler-utils

In [None]:
import os
import shutil

# Destination stem -> source PDF file name (the files already live in Drive)
pdfs = {
    "ECCO": "ECCO Guidelines on Extraintestinal IBD.pdf"
}

# Replace with your actual drive mount point or folder path
drive_source_dir = "/content/drive/MyDrive/IKEA_PDFs"
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Copy every listed PDF into the local working folder as "<stem>.pdf".
# A missing source file is reported and skipped rather than raising.
for name in pdfs:
    source_path = os.path.join(drive_source_dir, pdfs[name])
    dest_path = os.path.join(output_dir, name + ".pdf")

    if not os.path.exists(source_path):
        print(f"❌ File not found for {name}: {source_path}")
        continue

    shutil.copy(source_path, dest_path)
    print(f"Copied {name} from {source_path} to {dest_path}")

print("Final files in output folder:", os.listdir(output_dir))

In [None]:
import os
from pdf2image import convert_from_path


def convert_pdfs_to_images(pdf_folder):
    """Render every ``.pdf`` file directly inside ``pdf_folder`` to page images.

    Files are taken in the order ``os.listdir`` yields them and numbered from 0;
    that number is the key under which the document's pages are stored.

    Args:
        pdf_folder: Directory to scan (not recursive; only names ending in ".pdf").

    Returns:
        dict mapping the 0-based document number to whatever
        ``convert_from_path`` returns for that file.
    """
    pdf_names = (entry for entry in os.listdir(pdf_folder) if entry.endswith(".pdf"))
    return {
        doc_id: convert_from_path(os.path.join(pdf_folder, pdf_name))
        for doc_id, pdf_name in enumerate(pdf_names)
    }


# Convert the PDFs from the same folder the copy step wrote to (and the indexer
# reads from), instead of an absolute path that only resolves when the working
# directory happens to be /content.
all_images = convert_pdfs_to_images(output_dir)

In [None]:
import matplotlib.pyplot as plt

# Preview the first pages (at most 8) of the first converted document.
# The grid is sized to the pages that actually exist, so a document shorter
# than 8 pages does not raise IndexError.
preview_pages = all_images[0][:8]

if preview_pages:
    # squeeze=False keeps `axes` a 2-D array even for a single page, so .flat always works.
    fig, axes = plt.subplots(1, len(preview_pages), figsize=(15, 10), squeeze=False)

    for ax, img in zip(axes.flat, preview_pages):
        ax.imshow(img)
        ax.axis("off")

    plt.tight_layout()
    plt.show()
else:
    print("No pages to preview.")

In [None]:
import torch
# Report whether a CUDA device is visible to PyTorch and, if so, the name of device 0.
cuda_available = torch.cuda.is_available()
device_label = torch.cuda.get_device_name(0) if cuda_available else "CPU"
print("PyTorch CUDA available:", cuda_available)
print("Device name:", device_label)

In [None]:
from byaldi import RAGMultiModalModel

# Load the "vidore/colpali-v1.2" checkpoint through byaldi's RAGMultiModalModel.
# This object is used below both to build the page index and to run searches.
docs_retrieval_model = RAGMultiModalModel.from_pretrained("vidore/colpali-v1.2")

In [None]:
# Build the "image_index" index from the contents of data/.
# overwrite=True is passed so the cell can be re-run against an existing index name.
docs_retrieval_model.index(
    input_path="data/",
    index_name="image_index",
    overwrite=True,
    store_collection_with_index=False,
)

In [None]:
# Question asked of the indexed document; reused later as the text part of the VLM prompt.
text_query = "What is the recommended VTE prophylaxis for hospitalized IBD patients?"

# Ask the retriever for the k=3 best-matching hits for the query.
results = docs_retrieval_model.search(text_query, k=3)
# Bare expression so the notebook displays the hits.
results

In [None]:
def get_grouped_images(results, all_images):
    """Look up the page image for each retrieval hit.

    Args:
        results: Iterable of hits; each one is indexed with "doc_id" and "page_num".
        all_images: Mapping from doc_id to that document's sequence of page images.

    Returns:
        List of page images, one per hit, in the same order as ``results``.
    """
    # page_num are 1-indexed, while doc_ids are 0-indexed, hence the "- 1".
    # Source https://github.com/AnswerDotAI/byaldi?tab=readme-ov-file#searching
    return [all_images[hit["doc_id"]][hit["page_num"] - 1] for hit in results]


# Page images for the search hits, in hit order; grouped_images[0] is the first hit's page.
grouped_images = get_grouped_images(results, all_images)

In [None]:
import matplotlib.pyplot as plt

# Show the retrieved pages side by side. The grid is sized from the number of
# retrieved pages, so a search that yields fewer than 3 hits does not raise
# IndexError, and a larger k is displayed in full.
if grouped_images:
    # squeeze=False keeps `axes` a 2-D array even for a single hit, so .flat always works.
    fig, axes = plt.subplots(1, len(grouped_images), figsize=(15, 10), squeeze=False)

    for ax, img in zip(axes.flat, grouped_images):
        ax.imshow(img)
        ax.axis("off")

    plt.tight_layout()
    plt.show()
else:
    print("No retrieved pages to display.")

In [None]:
# Install/upgrade bitsandbytes and accelerate before the quantized model is loaded below.
# NOTE(review): versions are not pinned here.
!pip install -U bitsandbytes accelerate

In [None]:
pip install -U bitsandbytes

In [None]:
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, BitsAndBytesConfig
import torch

# Checkpoint shared by the processor and the model loaded below.
model_id = "Qwen/Qwen2-VL-7B-Instruct"

# 4-bit NF4 quantization with double quantization enabled; compute dtype is float16.
quant_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

processor = Qwen2VLProcessor.from_pretrained(model_id, trust_remote_code=True)

# Load the vision-language model with the quantization settings above;
# device_map="auto" leaves device placement to the loader.
vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quant_config,
)

In [None]:
# Ask PyTorch to release its cached, currently unused CUDA memory after the model load.
torch.cuda.empty_cache()

In [None]:
from qwen_vl_utils import process_vision_info

In [None]:
# Pixel bounds handed to the processor as min_pixels / max_pixels.
# NOTE(review): presumably these limit how far input images are resized — confirm.
min_pixels = 224 * 224
max_pixels = 1024 * 1024

# Reuse `model_id` rather than repeating the checkpoint string, so the processor
# cannot drift from the model it feeds. This pixel-bounded processor is the one
# used for templating, input building and decoding in the cells below.
vl_model_processor = Qwen2VLProcessor.from_pretrained(
    model_id, min_pixels=min_pixels, max_pixels=max_pixels
)

In [None]:
# System turn: fixes the assistant's role and restricts answers to the supplied images.
system_message = {
    "role": "system",
    "content": "You are a Gastroenterology assistant. Answer in detail with all the possible facts available in document and accurately based only on the provided images and question.",
}

# User turn: the first retrieved page image followed by the question text.
user_message = {
    "role": "user",
    "content": [
        {"type": "image", "image": grouped_images[0]},
        {"type": "text", "text": text_query},
    ],
}

chat_template = [system_message, user_message]


In [None]:
# Render the conversation to a prompt string: tokenize=False returns text rather than
# token ids, and add_generation_prompt=True is requested so the prompt ends ready for a reply.
text = vl_model_processor.apply_chat_template(chat_template, tokenize=False, add_generation_prompt=True)

In [None]:
# process_vision_info returns two values; only the first (the image inputs) is used here.
image_inputs, _ = process_vision_info(chat_template)

# Turn prompt text + images into padded PyTorch tensors and move them to the CUDA device.
inputs = vl_model_processor(
    images=image_inputs,
    text=[text],
    return_tensors="pt",
    padding=True,
).to("cuda")

In [None]:
# Generate a continuation for the prepared inputs, capped at 500 new tokens.
generated_ids = vl_model.generate(**inputs, max_new_tokens=500)

In [None]:
# Drop the first len(prompt) tokens from each generated sequence so that only
# the part following the prompt is decoded.
generated_ids_trimmed = []
for in_ids, out_ids in zip(inputs.input_ids, generated_ids):
    prompt_len = len(in_ids)
    generated_ids_trimmed.append(out_ids[prompt_len:])

# Decode the trimmed token ids to strings, omitting special tokens.
output_text = vl_model_processor.batch_decode(
    generated_ids_trimmed,
    clean_up_tokenization_spaces=False,
    skip_special_tokens=True,
)

In [None]:
# Print the decoded answer for the first (and, here, only) sequence in the batch.
print(output_text[0])