### Testing the Fine-Tuned Model 

In [38]:
import gc
import time
import torch

import os
import torch
from typing import List, Dict, Any
from datasets import load_dataset, load_from_disk, Dataset
from huggingface_hub import login
from transformers import (
    AutoProcessor,
    LlavaOnevisionForConditionalGeneration,
    BitsAndBytesConfig,
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTConfig, SFTTrainer
from PIL import Image
import io

from transformers import Qwen2VLProcessor
from qwen_vl_utils import process_vision_info

def clear_memory(
    names=("inputs", "model", "processor", "trainer", "peft_model", "bnb_config"),
):
    """Delete large notebook globals and release cached GPU memory.

    Args:
        names: Names of module-level variables to delete when they exist.
            The default covers the objects this notebook creates while
            loading and running a model.

    The function sleeps for two seconds between steps (about eight seconds
    in total). When CUDA is unavailable the CUDA calls are skipped, because
    ``torch.cuda.synchronize()`` would otherwise raise, and a short notice is
    printed in place of the memory statistics.
    """
    # Delete variables if they exist in the current global scope.
    namespace = globals()
    for name in names:
        # pop() with a default tolerates names that were never defined.
        namespace.pop(name, None)
    time.sleep(2)

    cuda_ready = torch.cuda.is_available()

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    if cuda_ready:
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    if cuda_ready:
        print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
    else:
        print("CUDA is not available; skipped GPU cache clearing.")



In [39]:
# Drop any previously created model/processor globals and cached GPU memory
# before the model is loaded in the next cell.
clear_memory()

GPU allocated memory: 0.01 GB
GPU reserved memory: 0.02 GB


In [40]:
# Settings for this evaluation run.
MODEL_ID = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
USE_LORA = True
USE_QLORA = False
MULTIPLE_IMAGES_NUM = 2  # number of page images placed in each user prompt

processor = AutoProcessor.from_pretrained(MODEL_ID)

if not (USE_QLORA or USE_LORA):
    # Neither adapter mode requested: plain bf16 load, no quantization argument.
    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
else:
    # Only QLoRA gets a 4-bit NF4 config; plain LoRA passes None (no quantization).
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    else:
        bnb_config = None

    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.52s/it]


In [41]:
# Location of the fine-tuned adapter (presumably the output of the training run; verify).
adapter_path = "llava-onevision-qwen2-7b-ov-neurips-openreview-v1"
# Load that adapter into the model created in the previous cell.
model.load_adapter(adapter_path)

In [42]:
# Read the dataset previously saved to the local "neurips_openreview_v1" directory.
dataset = load_from_disk("neurips_openreview_v1")

In [43]:
def format_data(sample, num_images=None, image_size=(336, 336)):
    """Convert one dataset record into a three-turn chat conversation.

    Args:
        sample: Mapping with an ``"image"`` sequence of encoded image bytes
            and a ``"summaries"`` sequence; the first summary becomes the
            assistant reply.
        num_images: Maximum number of leading images to include. ``None``
            (the default) uses the notebook-level ``MULTIPLE_IMAGES_NUM``.
        image_size: Size passed to ``Image.resize`` for every image.

    Returns:
        ``{"messages": [system, user, assistant]}`` where the user turn holds
        the resized images followed by the instruction text.

    Raises:
        IndexError: If ``sample["summaries"]`` is empty.
    """
    if num_images is None:
        num_images = MULTIPLE_IMAGES_NUM

    # Slice rather than index over range(num_images): a record with fewer
    # images contributes what it has instead of raising IndexError.
    image_parts = [
        {"type": "image", "image": Image.open(io.BytesIO(raw)).resize(image_size)}
        for raw in sample["image"][: max(num_images, 0)]
    ]

    return {
        "messages": [
            {
                "role": "system",
                "content": [{"type": "text", "text": "You are a professional academic paper review assistant."}],
            },
            {
                "role": "user",
                "content": [
                    *image_parts,
                    {"type": "text", "text": "Please help me on reviewing this paper by given those images"},
                ],
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": sample["summaries"][0]}],
            },
        ]
    }

# Load the saved dataset, then convert every record to chat format up front.
dataset = load_from_disk("neurips_openreview_v1")
format_dataset = list(map(format_data, dataset))

In [44]:
def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device="cuda"):
    """Generate the model's reply for one chat-formatted sample.

    Args:
        model: Object whose ``generate`` accepts the processor output as
            keyword arguments plus ``max_new_tokens``.
        processor: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        sample: Dict with a ``"messages"`` list. Only the message at index 1
            is rendered into the prompt; images are collected from the whole
            list.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``.
        device: Device the processed inputs are moved to.

    Returns:
        The decoded text of the first generated sequence with the prompt
        tokens stripped off.
    """
    conversation = sample["messages"]

    # Render only the message at index 1 (the system message at index 0 is left out).
    prompt = processor.apply_chat_template(
        conversation[1:2],
        tokenize=False,
        add_generation_prompt=True,
    )

    # Second return value (video inputs) is not used here.
    images, _ = process_vision_info(conversation)

    batch = processor(text=[prompt], images=images, return_tensors="pt")
    batch = batch.to(device, dtype=torch.bfloat16)

    sequences = model.generate(**batch, max_new_tokens=max_new_tokens)

    # generate() returns prompt + continuation; keep only what follows the prompt.
    completions = []
    for prompt_ids, full_ids in zip(batch.input_ids, sequences):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]

In [50]:
# Inspect the first formatted conversation (system, user with images, assistant).
print(format_dataset[0])

{'messages': [{'role': 'system', 'content': [{'type': 'text', 'text': 'You are a professional academic paper review assistant.'}]}, {'role': 'user', 'content': [{'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=336x336 at 0x7F837EE8BF40>}, {'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=336x336 at 0x7F837EE8B9A0>}, {'type': 'text', 'text': 'Please help me on reviewing this paper by given those images'}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': 'The authors take the generalization of the K-NN method for the multi-label classification problem, which lifts the samples to feature spaces and replaces the distance weights with more general weight functions. The distributionally robust formulation of this well-known generalization is defined and shown to be equivalent to a much simpler problem when the ambiguity sets comprise Wasserstein balls. Thanks to this equivalence, the authors show that the worst-case distributions are characterized by 

In [45]:

# Generate a review for the first sample with the adapter-loaded model.
output = generate_text_from_sample(model, processor, format_dataset[0])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [46]:
# Prediction: text generated for the first sample by the adapter-loaded model.
print(output)

The paper proposes a method for multi-class classification with only 3 labeled training samples. The method is based on the weighted k-NN algorithm, where the weights are learned. The authors show that the proposed method is equivalent to a SVM with a particular feature mapping. They also show that the feature mapping can be computed in linear time, which allows for efficient training. The authors show that the method is able to classify the 3 samples correctly on the training set, and show some experiments on real-world datasets.


## Ground-Truth Result 

In [47]:
# Ground truth: text of the assistant turn (third message) of the first sample.
print(format_dataset[0]["messages"][2]["content"][0]["text"])

The authors take the generalization of the K-NN method for the multi-label classification problem, which lifts the samples to feature spaces and replaces the distance weights with more general weight functions. The distributionally robust formulation of this well-known generalization is defined and shown to be equivalent to a much simpler problem when the ambiguity sets comprise Wasserstein balls. Thanks to this equivalence, the authors show that the worst-case distributions are characterized by the solution of a convex optimization problem. There is further a solution algorithm proposed, and thanks to this, the authors compare the performance of Wasserstein DRO weighted K-NN with benchmark algorithms on well-known classification datasets.


## Compare Fine-Tuned Model vs. Base Model + Prompting

In [48]:
# Configuration
MODEL_ID = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
# DATASET_ID = "philschmid/amazon-product-descriptions-vlm"
USE_LORA = True
USE_QLORA = False
MULTIPLE_IMAGES_NUM = 2

# Release the adapter-loaded model and processor before loading the base
# checkpoint again. Without this, the old model stays bound to `model` until
# the new one has finished loading, so two copies are resident at peak.
clear_memory()

processor = AutoProcessor.from_pretrained(MODEL_ID)

if USE_QLORA or USE_LORA:
    # Only QLoRA gets a 4-bit NF4 config; plain LoRA passes None (no quantization).
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16, # torch.float16
    ) if USE_QLORA else None

    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.bfloat16, # torch.float16,
        quantization_config=bnb_config,
    )
else:
    model = LlavaOnevisionForConditionalGeneration.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.bfloat16, # torch.float16,
    )

Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.07s/it]


In [49]:
# Run the freshly reloaded model (no adapter is loaded in this section) on the
# same sample for comparison.
# NOTE: this rebinds `output`, replacing the fine-tuned prediction stored earlier.
output = generate_text_from_sample(model, processor, format_dataset[0])
print(output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


The image you've provided appears to be a page from a research paper. However, I'm unable to provide a detailed review of the content or the quality of the research based on the images alone. If you have specific questions about the paper or need assistance with understanding certain aspects of the content, feel free to ask, and I'll do my best to help!
