# **Result**
#### A before-vs-after comparison on a set of 20 examples

In [None]:
import torch
import os, json, csv, traceback
from typing import List, Dict, Any
from PIL import Image
from transformers import AutoModelForCausalLM, AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig

### Before

In [None]:
# constants
# Model id passed to `from_pretrained` for the base LLaVA 1.5 checkpoint.
BASE_MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
# JSON file holding the evaluation samples (a single dict or a list of dicts).
ANNOTATIONS_PATH = "annotations.json"
# Default output files for the base-model run: the same rows are written
# once as CSV and once as JSON Lines.
OUTPUT_CSV = "results_base.csv"
OUTPUT_JSONL = "results_base.jsonl"

In [None]:

# Device detection + CPU-fallback helper
import torch, os
# Evaluated once, when this cell runs; the helpers defined in this cell read
# it to choose between the GPU (4-bit / fp16) and CPU (fp32) loading paths.
HAS_CUDA = torch.cuda.is_available()
print(f"CUDA available: {HAS_CUDA}")

def get_quant_config_or_none():
    """Return a 4-bit NF4 ``BitsAndBytesConfig``, or ``None`` when it cannot be used.

    ``None`` is returned when ``HAS_CUDA`` is false, or when importing
    bitsandbytes / building the config raises. Both cases print an
    ``[INFO]`` line explaining why.
    """
    if HAS_CUDA:
        try:
            # bitsandbytes is imported purely as an availability probe.
            import bitsandbytes as bnb  # noqa: F401
            from transformers import BitsAndBytesConfig

            nf4_settings = {
                "load_in_4bit": True,
                "bnb_4bit_quant_type": "nf4",
                "bnb_4bit_compute_dtype": torch.float16,
                "bnb_4bit_use_double_quant": False,
            }
            return BitsAndBytesConfig(**nf4_settings)
        except Exception as exc:
            print(f"[INFO] bitsandbytes not available or failed ({exc}). Falling back to fp16 (GPU) if available.")
            return None

    # Only attempt bitsandbytes when CUDA is present
    print("[INFO] CUDA not available — skipping bitsandbytes quantization.")
    return None

def load_base_model_and_processor_cpu_fallback(BASE_MODEL_NAME):
    """Load the processor and model for ``BASE_MODEL_NAME``, adapting to the device.

    Loading strategy, in order of preference:
      * CUDA and a quantization config is available -> 4-bit, ``device_map="auto"``
      * CUDA only -> fp16, ``device_map="auto"``
      * no CUDA -> fp32 pinned to the CPU with ``low_cpu_mem_usage=True``

    If the processor's tokenizer has no pad token, the EOS token is reused;
    when there is no EOS token either, a ``<|pad|>`` token is added and the
    model's embeddings are resized to the new vocabulary size.

    Returns:
        A ``(model, processor)`` tuple.
    """
    # This wraps the notebook's loader and chooses CPU-safe options when no GPU.
    from transformers import AutoProcessor, AutoModelForImageTextToText

    print("Loading processor...")
    processor = AutoProcessor.from_pretrained(BASE_MODEL_NAME)
    quant_cfg = get_quant_config_or_none()

    # Pick the keyword arguments for the single from_pretrained call below.
    if not HAS_CUDA:
        # CPU path: avoid fp16 and bitsandbytes; force CPU and low-ram-friendly flags
        print("[INFO] Loading model on CPU (this will be slow and may OOM on low-RAM machines).")
        load_kwargs = {
            "device_map": {"": "cpu"},
            "low_cpu_mem_usage": True,  # helps reduce peak memory
            "torch_dtype": torch.float32,
        }
    elif quant_cfg is None:
        print("[INFO] CUDA available — loading model in fp16 with device_map='auto'.")
        load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}
    else:
        print("[INFO] Using 4-bit quantization (CUDA).")
        load_kwargs = {"quantization_config": quant_cfg, "device_map": "auto"}

    model = AutoModelForImageTextToText.from_pretrained(BASE_MODEL_NAME, **load_kwargs)

    # ensure pad token
    tokenizer = getattr(processor, "tokenizer", None)
    needs_pad = tokenizer is not None and tokenizer.pad_token_id is None
    if needs_pad and tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token
    elif needs_pad:
        tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
        model.resize_token_embeddings(len(tokenizer))

    print("Model and processor loaded.")
    return model, processor


In [None]:
# --- Base-only evaluation for LLaVA (Colab T4 friendly) ---
# ---------- quant & loader (4-bit with fallback to fp16) ----------
def get_quant_config_or_none():
    """Return a 4-bit NF4 quantization config, or ``None`` to load unquantized.

    ``None`` is returned (after printing an ``[INFO]`` line) when no CUDA
    device is visible, or when importing bitsandbytes / building the config
    raises. CUDA is probed here directly so the result does not depend on
    which other notebook cells have been executed.

    Returns:
        A ``BitsAndBytesConfig`` for 4-bit NF4 loading with fp16 compute,
        or ``None``.
    """
    # Only attempt bitsandbytes when a CUDA GPU is present: merely being able
    # to import the package does not mean the 4-bit GPU path is usable.
    if not torch.cuda.is_available():
        print("[INFO] CUDA not available — skipping 4-bit quantization.")
        return None
    try:
        # Imported only to confirm the package is installed and loads cleanly.
        import bitsandbytes as _bnb  # noqa
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,  # T4: fp16 compute
            bnb_4bit_use_double_quant=False,
        )
    except Exception as e:
        # Deliberately broad: any import/setup failure means "no 4-bit".
        print(f"[INFO] 4-bit not available ({e}). Falling back to fp16.")
        return None

def ensure_pad_token(model, processor):
    """Make sure the processor's tokenizer has a pad token (mutates in place).

    Does nothing when the processor has no ``tokenizer`` attribute or a pad
    token id is already set. Otherwise the EOS token is reused as the pad
    token; if there is no EOS token id either, a ``<|pad|>`` special token is
    added and ``model``'s token embeddings are resized to ``len(tokenizer)``.
    """
    tokenizer = getattr(processor, "tokenizer", None)
    if tokenizer is None or tokenizer.pad_token_id is not None:
        return

    if tokenizer.eos_token_id is None:
        # No EOS to borrow: register a dedicated pad token and grow the
        # embedding matrix to match the enlarged vocabulary.
        tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
        model.resize_token_embeddings(len(tokenizer))
    else:
        tokenizer.pad_token = tokenizer.eos_token

def load_base_model_and_processor():
    """Load the base LLaVA model and its processor for ``BASE_MODEL_NAME``.

    Loading strategy:
      * CUDA and a quantization config is available -> 4-bit, ``device_map="auto"``
      * CUDA without a quantization config -> fp16, ``device_map="auto"``
      * no CUDA -> fp32 pinned to the CPU with ``low_cpu_mem_usage=True``
        (fp16 and bitsandbytes are avoided on CPU)

    After loading, ``ensure_pad_token`` guarantees the tokenizer has a pad token.

    Returns:
        A ``(model, processor)`` tuple.
    """
    print("Loading base LLaVA model + processor...")
    processor = AutoProcessor.from_pretrained(BASE_MODEL_NAME)

    # Probe the device here so the choice does not rely on the quantization
    # helper having checked for CUDA itself.
    has_cuda = torch.cuda.is_available()
    qcfg = get_quant_config_or_none() if has_cuda else None

    if qcfg is not None:
        print("[INFO] Using 4-bit quantization.")
        model = AutoModelForImageTextToText.from_pretrained(
            BASE_MODEL_NAME,
            quantization_config=qcfg,
            device_map="auto",
        )
    elif has_cuda:
        print("[INFO] Using fp16.")
        model = AutoModelForImageTextToText.from_pretrained(
            BASE_MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    else:
        # CPU path: avoid fp16 and bitsandbytes; force CPU and low-RAM-friendly flags.
        print("[INFO] CUDA not available — loading model on CPU in fp32 (slow; may OOM on low-RAM machines).")
        model = AutoModelForImageTextToText.from_pretrained(
            BASE_MODEL_NAME,
            device_map={"": "cpu"},
            low_cpu_mem_usage=True,  # helps reduce peak memory
            torch_dtype=torch.float32,
        )

    ensure_pad_token(model, processor)
    print("Base model + processor loaded.")
    return model, processor

# ---------- prompt & generation ----------
def get_inference_prompt(utterance: str) -> str:
    """Wrap ``utterance`` in the single-turn LLaVA 1.5 prompt.

    The result contains the ``<image>`` placeholder on the USER line and
    ends with ``ASSISTANT:`` so the model continues with its answer.
    """
    # LLaVA 1.5 conversational format
    template = "USER: <image>\n{utterance}\nASSISTANT:"
    return template.format(utterance=utterance)

def safe_pad_id(model, processor):
    """Return a pad token id suitable for ``generate``.

    Prefers the pad token id of the processor's tokenizer. When the processor
    has no tokenizer, or the tokenizer has no pad token id, falls back to
    ``model.config.eos_token_id`` (``None`` if the config lacks it).
    """
    tok = getattr(processor, "tokenizer", None)
    # Compare against None explicitly: a tokenizer supports len(), so a plain
    # truthiness test would consult its length rather than its presence.
    if tok is not None and tok.pad_token_id is not None:
        return tok.pad_token_id
    return getattr(model.config, "eos_token_id", None)

@torch.inference_mode()
def generate_response(model, processor, image_path: str, statement: str) -> str:
    """Generate the model's answer to ``statement`` about the image at ``image_path``.

    The image is loaded as RGB, combined with the LLaVA prompt built by
    ``get_inference_prompt`` and passed to ``model.generate`` with sampling
    enabled (temperature 0.6, top-p 0.9, up to 256 new tokens), so repeated
    calls may return different text.

    Returns:
        The decoded continuation only (prompt excluded), stripped of
        surrounding whitespace.

    Raises:
        Whatever ``PIL.Image.open``, the processor or ``generate`` raise
        (e.g. a missing or unreadable image file).
    """
    from PIL import Image

    # Close the file handle deterministically; convert() yields an in-memory
    # image that stays valid after the file is closed.
    with Image.open(image_path) as raw:
        img = raw.convert("RGB")

    prompt = get_inference_prompt(statement)

    # Pass text/images by name; wrap as lists for batch dimension
    inputs = processor(
        text=[prompt],
        images=[img],
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    out = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=safe_pad_id(model, processor),
    )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length instead of searching the decoded text for "ASSISTANT:", which
    # would cut at the wrong place if the statement contained that marker.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = out[0][prompt_len:]
    return processor.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


# ---------- data I/O ----------
def load_annotations(path: str) -> List[Dict[str, Any]]:
    """Read the annotation samples from the UTF-8 JSON file at ``path``.

    A top-level list is returned as-is; a top-level dict is treated as a
    single sample and wrapped in a one-element list.

    Raises:
        ValueError: if the top-level JSON value is neither a dict nor a list.
    """
    with open(path, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        return [parsed]
    raise ValueError("annotations.json must be a dict or a list of dicts.")

def maybe_fix_image_path(p: str) -> str:
    """Return a path to an existing image file for ``p``, trying common variants.

    Candidates are tried in order and the first one that is a regular file wins:
      1. ``p`` itself
      2. ``p`` with ``image_`` swapped for ``img_`` (or the reverse)
      3. ``images/<basename of p>``

    If none exists, ``p`` is returned unchanged so the caller can report the
    missing file. Empty or non-string values are also returned unchanged.
    """
    # An empty or non-string value can never name an image. Returning it
    # untouched avoids a TypeError from os.path and stops "" from resolving
    # to the "images" directory via the join below.
    if not isinstance(p, str) or not p:
        return p

    if "image_" in p:
        swapped = p.replace("image_", "img_")
    else:
        swapped = p.replace("img_", "image_")

    candidates = (p, swapped, os.path.join("images", os.path.basename(p)))
    for candidate in candidates:
        # isfile rather than exists: a directory is not a usable image path.
        if os.path.isfile(candidate):
            return candidate
    return p  # may be missing; caller will handle

# ---------- main eval ----------
def evaluate_base(ann_path=ANNOTATIONS_PATH, out_csv=OUTPUT_CSV, out_jsonl=OUTPUT_JSONL):
    """Run the base model over every annotation and save one result row per sample.

    Each row copies ``id``, ``image``, ``statement``, ``label``, ``context``
    and ``explanation`` from the sample (missing keys become ``""``) and adds
    ``generated_text`` and ``error``. Rows are written to both ``out_csv``
    (with a header) and ``out_jsonl`` (one JSON object per line).

    A failure on one sample (malformed entry, missing image, generation
    error) is recorded in that row's ``error`` column and its traceback is
    printed; the loop then continues with the next sample.

    Args:
        ann_path: Path to the annotations JSON file.
        out_csv: Destination CSV path (overwritten).
        out_jsonl: Destination JSON Lines path (overwritten).

    Raises:
        Whatever ``load_annotations`` or the model loader raise; these happen
        before any output file is opened.
    """
    # Parse the annotations before the slow model load so a bad path or
    # malformed JSON fails immediately.
    samples = load_annotations(ann_path)
    model, processor = load_base_model_and_processor()

    input_keys = ("id", "image", "statement", "label", "context", "explanation")
    fields = [*input_keys, "generated_text", "error"]
    with open(out_csv, "w", newline="", encoding="utf-8") as fcsv, open(out_jsonl, "w", encoding="utf-8") as fjl:
        writer = csv.DictWriter(fcsv, fieldnames=fields)
        writer.writeheader()

        for ex in samples:
            row = dict.fromkeys(fields, "")
            # Everything sample-specific runs inside the try so that a single
            # bad entry is reported in its row instead of aborting the run.
            try:
                if not isinstance(ex, dict):
                    raise TypeError(f"Annotation entry must be a dict, got {type(ex).__name__}")
                for key in input_keys:
                    row[key] = ex.get(key, "")

                img_path = maybe_fix_image_path(row["image"])
                if not os.path.isfile(img_path):
                    raise FileNotFoundError(f"Image not found: {img_path}")
                row["generated_text"] = generate_response(model, processor, img_path, row["statement"])
            except Exception as e:
                row["error"] = f"{type(e).__name__}: {e}"
                traceback.print_exc()

            writer.writerow(row)
            fjl.write(json.dumps(row, ensure_ascii=False) + "\n")
            # Generation dominates the runtime, so flushing per row is cheap
            # and keeps finished rows on disk if the run is interrupted.
            fcsv.flush()
            fjl.flush()

    print(f"[DONE] Wrote:\n - {out_csv}\n - {out_jsonl}")


# Entry point: run the base-model evaluation with the default paths.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__" as
# well, so executing this cell presumably starts the full evaluation — confirm.
if __name__ == "__main__":
    evaluate_base()


In [None]:
# Environment setup (notebook shell commands, not plain Python).
# NOTE(review): this cell comes after the cells that import transformers and
# bitsandbytes; presumably it has to be run first on a fresh runtime — confirm.

# 1) Clean up any wrong wheels
!pip uninstall -y bitsandbytes

# 2) Install CUDA 12-compatible wheel
# NOTE(review): relies on a third-party wheel index — confirm it is still needed.
!pip install -U bitsandbytes --extra-index-url https://jllllll.github.io/bitsandbytes-wheels/cu12

# 3) Core deps
!pip install -U "transformers>=4.44" "accelerate>=0.33" "peft>=0.12"

# (Optional) sanity check GPU
!nvidia-smi