In [None]:
# Install a pinned Ultralytics release plus supervision and roboflow into the kernel environment.
%pip install "ultralytics<=8.3.40" supervision roboflow
import ultralytics
# Print the environment summary (library/Python/torch versions, GPU, CPU/RAM/disk) shown in this cell's output.
ultralytics.checks()

Ultralytics 8.3.40 🚀 Python-3.12.12 torch-2.8.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
Setup complete ✅ (2 CPUs, 12.7 GB RAM, 39.7/112.6 GB disk)


In [None]:
from ultralytics import YOLO
from IPython.display import display, Image

In [None]:
import os
import json
import cv2
from typing import List, Dict, Tuple
from google import genai
from google.genai import types

In [None]:
!pip install roboflow

from roboflow import Roboflow
rf = Roboflow(api_key="WZ3OZfiYUcvT8zsgwg4D")
project = rf.workspace("team-rahman").project("physics_lab_apparatus-kzk1d")
version = project.version(7)
dataset = version.download("yolov11")


loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in physics_lab_apparatus-7 to yolov11:: 100%|██████████| 196477/196477 [00:12<00:00, 16226.21it/s]





Extracting Dataset Version Zip to physics_lab_apparatus-7 in yolov11:: 100%|██████████| 14434/14434 [00:01<00:00, 8873.15it/s] 


In [None]:
!pip install roboflow ultralytics torch torchvision torchaudio opencv-python pandas tqdm scikit-learn Pillow




In [35]:
import yaml, pandas as pd, os, json
from pathlib import Path
from tqdm import tqdm

# Root of the downloaded YOLO dataset and the output folder for the CSV export.
yolo_root = Path(dataset.location)
out_dir = Path("mlcls_dataset")
out_dir.mkdir(exist_ok=True)

# Locate the dataset's data.yaml; fail with an explicit message instead of an
# IndexError on an empty glob result.
data_yaml_file = next(yolo_root.glob("**/data.yaml"), None)
if data_yaml_file is None:
    raise FileNotFoundError(f"No data.yaml found under {yolo_root}")

# Read the class-name table, closing the file handle deterministically.
with open(data_yaml_file, encoding="utf-8") as yaml_fh:
    classes = yaml.safe_load(yaml_fh)["names"]

# One class name per line. Written as UTF-8 because the detection cell reads
# this file back with encoding="utf-8".
with open(out_dir/"classes.txt", "w", encoding="utf-8") as f:
    for c in classes:
        f.write(c+"\n")

def process_split(split):
    """Export one YOLO split as a multi-label CSV.

    For every file in ``<yolo_root>/<split>/images`` the matching
    ``<yolo_root>/<split>/labels/<stem>.txt`` is read (if it exists) and the
    set of class ids found in the first column is collected. The result is
    written to ``<out_dir>/<split>.csv`` with two columns:

    * ``filepath`` - path of the image as a string
    * ``labels``   - space-separated class names, ordered by class id
                     (empty string when the image has no label file / no boxes)

    Args:
        split: Name of the split directory, e.g. ``"train"``.

    Returns:
        None. Nothing is written when the split directory does not exist.
    """
    split_dir = yolo_root/split
    if not split_dir.exists():
        return
    img_dir = split_dir/"images"
    lbl_dir = split_dir/"labels"

    rows = []
    for img_path in tqdm(sorted(img_dir.glob("*.*"))):
        label_file = lbl_dir/f"{img_path.stem}.txt"
        present = set()
        if label_file.exists():
            # Context manager so the handle is closed for each of the
            # thousands of label files instead of being left to the GC.
            with open(label_file) as label_fh:
                for line in label_fh:
                    fields = line.split()
                    if not fields:
                        # Blank / whitespace-only line: nothing to parse.
                        continue
                    present.add(int(fields[0]))
        rows.append({"filepath": str(img_path), "labels": " ".join(classes[c] for c in sorted(present))})
    pd.DataFrame(rows).to_csv(out_dir/f"{split}.csv", index=False)

# Try every split name the dataset might use; process_split() returns early
# for directories that do not exist.
for split in ("train", "valid", "val", "test"):
    process_split(split)


100%|██████████| 6297/6297 [00:00<00:00, 19044.77it/s]
100%|██████████| 612/612 [00:00<00:00, 17921.50it/s]
100%|██████████| 302/302 [00:00<00:00, 18917.53it/s]


In [38]:
# ============================== #
# 🧠 PHYSISENS-VLR HYBRID (YOLO → Gemini-only)
# - YOLOv11: detection
# - Gemini 2.5 Flash: bilingual QA + semantic instrument extraction
# - Match: YOLO ↔︎ Gemini (semantic)
# - Label on boxes: "YOLO → Gemini [match: yolo/none]"
# - Color: green = matched, yellow = none
# ============================== #

import os, re, json, torch, cv2, unicodedata, string
from pathlib import Path
from typing import List, Dict, Tuple
from ultralytics import YOLO
from google import genai
from google.genai import types

# ================== PATHS ==================
YOLO_MODEL_PATH = "/content/best.pt"  # your YOLOv11 weights
TEST_IMAGE = "/content/physics_lab_apparatus-7/test/images/IMG_20250925_151735_1_jpg.rf.ebf50bf8effc630f8cef525d59a320fb.jpg"
OUTPUT_DIR = "/content/outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "crops"), exist_ok=True)

# OPTIONAL: a known list of instrument names (if you have it). If not present, it's OK.
data_dir = Path("/content/mlcls_dataset")
CLASSES_FILE = data_dir / "classes.txt"
CLASS_NAMES = []
if CLASSES_FILE.exists():
    # Read one class name per non-blank line; the context manager closes the
    # file handle instead of leaving it open until garbage collection.
    with open(CLASSES_FILE, encoding="utf-8") as classes_fh:
        CLASS_NAMES = [c.strip() for c in classes_fh if c.strip()]

# ================== PARAMETERS ==================
YOLO_CONF = 0.35  # minimum YOLO confidence for a detection to be kept
GENAI_MODEL = "gemini-2.5-flash"
THINKING_BUDGET = 30  # passed to types.ThinkingConfig(thinking_budget=...)
# If you want to require a minimum semantic score to count as "match with YOLO"
STRICT_MATCH = True
STRICT_THRESHOLD = 0.72  # tweak if needed

# ================== GOOGLE CLIENT ==================
def get_google_client() -> genai.Client:
    """Create a Gemini client from the ``GOOGLE_API_KEY`` environment variable.

    Returns:
        A ``genai.Client`` constructed with the key.

    Raises:
        RuntimeError: If ``GOOGLE_API_KEY`` is unset or empty.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if api_key:
        return genai.Client(api_key=api_key)
    raise RuntimeError("GOOGLE_API_KEY not found in environment.")

# Shared Gemini client used by get_gemini_explanation(); this line raises
# RuntimeError if GOOGLE_API_KEY is not set.
client = get_google_client()

# ================== DEVICE (for YOLO) ==================
# NOTE(review): `device` is not passed to any call in the code visible here —
# confirm whether it is still needed.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ================== LOAD YOLO ==================
yolo_model = YOLO(YOLO_MODEL_PATH)
# Class-name table: prefer the inner model's `names`, then the wrapper's `names`,
# and fall back to an empty dict when neither is available.
YOLO_NAMES = getattr(yolo_model.model, "names", None) or getattr(yolo_model, "names", None) or {}

def yolo_class_name(idx: int) -> str:
    """Translate a YOLO class index into its name.

    Looks the index up in the module-level ``YOLO_NAMES`` table, which may be
    a dict or a list. Unknown or out-of-range indices yield the placeholder
    ``"class_<idx>"``.
    """
    class_id = int(idx)
    placeholder = f"class_{class_id}"
    if isinstance(YOLO_NAMES, dict):
        return YOLO_NAMES.get(class_id, placeholder)
    in_list_range = isinstance(YOLO_NAMES, list) and 0 <= class_id < len(YOLO_NAMES)
    return YOLO_NAMES[class_id] if in_list_range else placeholder

# ================== BILINGUAL PROMPT ==================
def build_bilingual_prompt(seed_label: str) -> str:
    """Build the bilingual (English/Bangla) instruction prompt for Gemini.

    The prompt describes a fixed question/answer layout and interpolates
    ``seed_label`` into the "Instrument (...)" answer line. It is plain text
    only; leading and trailing whitespace is stripped from the result.

    Args:
        seed_label: Instrument name to embed in the prompt (the caller passes
            the YOLO class label).

    Returns:
        The complete prompt string.
    """
    return f"""
You are an educational reasoning assistant for Physics lab instruments. Follow this **strict bilingual structure** for Question + Answer: (দ্বিভাষিক প্রশ্ন ও উত্তর কাঠামো অনুসরণ করুন।)

🔹 Question Prompt (প্রশ্ন কাঠামো)
1. Instrument Detected? (যন্ত্র সনাক্ত হয়েছে কি?) – Answer “Yes” or “No” (উত্তর হবে “হ্যাঁ” অথবা “না”)
2. Which Instrument? (কোন যন্ত্রটি?) – Write the name of the detected instrument (সনাক্ত যন্ত্রের নাম লিখুন)
3. Explanation – How it works & why we use it? (ব্যাখ্যা – এটি কীভাবে কাজ করে এবং কেন ব্যবহার করা হয়?)
   - English: Write 2–5 sentences explaining how the instrument works and why it is used.
   - বাংলা: একই অর্থে ২–৫টি বাক্যে এর কাজ করার প্রক্রিয়া ও ব্যবহারের কারণ লিখুন।
4. Usage Note (ব্যবহার নির্দেশ): Write one short, practical sentence showing real-life use.
   - English: One short, practical sentence.
   - বাংলা: তার সাথে এক লাইনে বাংলা অনুবাদ।

🔸 Answer Prompt (উত্তর কাঠামো)
Instrument Detected (যন্ত্র সনাক্ত হয়েছে): Yes / No (হ্যাঁ / না)
Instrument (যন্ত্রের নাম): {seed_label}

Explanation (ব্যাখ্যা):
Each English sentence must be followed by its Bangla translation. Example:
1️⃣ It measures small lengths very precisely. ➡️ এটি খুব সূক্ষ্ম দৈর্ঘ্য পরিমাপ করতে ব্যবহৃত হয়।
2️⃣ Used in physics labs to measure wire diameters. ➡️ পদার্থবিদ্যা ল্যাবে তারের ব্যাস পরিমাপের জন্য ব্যবহৃত হয়।

Usage Note (ব্যবহার নির্দেশ):
One short, practical sentence in both English and Bangla. Example:
- Always clean before measurement. ➡️ মাপ নেওয়ার আগে সবসময় পরিষ্কার করুন।

⚙️ Default placeholders (if explanation not possible):
English: Not available
Bangla: উপলভ্য নয়
Usage Note: (Keep empty / ফাঁকা রাখুন)
""".strip()

# ================== PARSER (English/Bangla/Usage Note) ==================
def parse_gemini_text(text: str) -> Dict[str, str]:
    """Extract English text, Bangla text and a usage note from a Gemini reply.

    Two passes are made over ``text``:

    1. Labelled lines: the first line starting with ``English:``, ``Bangla:``
       or ``Usage Note:`` (case-insensitive) supplies the respective field.
    2. Line scan: collects a usage-note fallback, ``EN ➡️ BN`` sentence pairs
       and numbered/bulleted lines. These are used only for fields that
       pass 1 left empty.

    Args:
        text: Raw response text.

    Returns:
        Dict with keys ``"english"``, ``"bangla"`` and ``"usage_note"``.
        ``english`` and ``bangla`` fall back to fixed placeholder strings when
        nothing was found; ``usage_note`` may be an empty string.
    """
    out = {"english": "", "bangla": "", "usage_note": ""}

    # Pass 1: explicitly labelled lines; re.search keeps only the first match of each.
    m_en = re.search(r'^\s*English\s*:\s*(.+)$', text, flags=re.I | re.M)
    m_bn = re.search(r'^\s*Bangla\s*:\s*(.+)$', text, flags=re.I | re.M)
    m_un = re.search(r'^\s*Usage\s*Note\s*:\s*(.+)$', text, flags=re.I | re.M)
    if m_en: out["english"] = m_en.group(1).strip()
    if m_bn: out["bangla"] = m_bn.group(1).strip()
    if m_un: out["usage_note"] = m_un.group(1).strip()

    # Pass 2: line-by-line scan. Branch order matters: each line is consumed by
    # the first branch that accepts it.
    en_lines, bn_lines = [], []
    usage_line = out["usage_note"]
    for raw in text.splitlines():
        line = raw.strip()
        if not line:
            continue
        # Usage note fallback
        # Only active while no usage note is known. Accepts a "Usage note..." line
        # (text after the first colon, possibly empty) or a "-" bullet containing
        # the arrow; such a bullet is therefore NOT added to the sentence lists.
        if not usage_line and (line.lower().startswith("usage note") or ("➡️" in line and line.startswith("-"))):
            usage_line = line.split(":", 1)[-1].strip() if ":" in line.lower() else line.lstrip("-").strip()
            continue
        # Paired sentence (EN ➡️ BN)
        # Left side has its leading number/keycap/punctuation prefix removed.
        if "➡️" in line and not line.lower().startswith("usage note"):
            left, right = line.split("➡️", 1)
            left = re.sub(r"^[0-9]+[️⃣\).\s-]*", "", left.strip())
            if left: en_lines.append(left)
            if right.strip(): bn_lines.append(right.strip())
            continue
        # Likely English bullet/numbered line
        # The bullet/number prefix is kept as-is here; only a leading "english:"
        # label is stripped.
        if re.match(r"^([0-9]+[️⃣\)]|\-|\•|[0-9]+\.)", line) or line.lower().startswith("english"):
            if line.lower().startswith("english:"):
                line = line.split(":", 1)[-1].strip()
            if not line.lower().startswith(("instrument detected", "instrument (", "usage note")):
                en_lines.append(line)

    # Scan results only fill fields that the labelled pass left empty.
    if not out["english"] and en_lines:
        out["english"] = " ".join(en_lines).strip()
    if not out["bangla"] and bn_lines:
        out["bangla"] = " ".join(bn_lines).strip()
    if not out["usage_note"] and usage_line:
        out["usage_note"] = usage_line.strip()

    # Final placeholders (same wording as the prompt's default placeholders).
    if not out["english"]:
        out["english"] = "Not available"
    if not out["bangla"]:
        out["bangla"] = "উপলভ্য নয়"
    return out

# ================== GEMINI CALL (returns parsed + raw) ==================
def get_gemini_explanation(seed_label: str) -> Tuple[Dict[str, str], str]:
    """Ask Gemini for a bilingual explanation of ``seed_label``.

    Returns:
        ``(parsed, raw_text)`` where ``parsed`` is the dict produced by
        ``parse_gemini_text`` and ``raw_text`` is the stripped response text.
        On any exception the error is printed and placeholder values with an
        empty raw text are returned instead (best-effort, never raises).
    """
    prompt = build_bilingual_prompt(seed_label)
    try:
        thinking = types.ThinkingConfig(thinking_budget=THINKING_BUDGET)
        request_config = types.GenerateContentConfig(thinking_config=thinking)
        response = client.models.generate_content(
            model=GENAI_MODEL,
            contents=prompt,
            config=request_config,
        )
        # `text` may be missing or None; normalise to a stripped string.
        raw_text = (getattr(response, "text", "") or "").strip()
        parsed = parse_gemini_text(raw_text)
    except Exception as e:
        print(f"[Gemini Error] {seed_label}: {e}")
        return {"english": "Not available", "bangla": "উপলভ্য নয়", "usage_note": ""}, ""
    return parsed, raw_text

# ================== SEMANTIC MATCHING (Option B) ==================
from difflib import SequenceMatcher

# Generic filler/domain words that normalize_text() drops before matching.
_STOPWORDS = set("""
the a an of for to in with on at and or by from into using used use lab labs physics instrument device
""".split())

# Translation table turning ASCII punctuation (including "-") and a few
# typographic dashes/bullets/quotes into spaces, e.g. "Weight-carrier" -> "Weight carrier".
_PUNCT_MAP = str.maketrans({c: " " for c in string.punctuation + "–—•·“”‘’"})

def normalize_text(s: str) -> str:
    """Canonicalise free text for matching.

    Applies NFKC normalisation, lower-cases, replaces punctuation with spaces
    (via ``_PUNCT_MAP``), collapses whitespace and removes ``_STOPWORDS``.
    Returns ``""`` for empty/None input.
    """
    if not s:
        return ""
    lowered = unicodedata.normalize("NFKC", s).lower()
    spaced = lowered.translate(_PUNCT_MAP)
    collapsed = re.sub(r"\s+", " ", spaced).strip()
    kept = (word for word in collapsed.split() if word not in _STOPWORDS)
    return " ".join(kept)

def singularize(word: str) -> str:
    """Strip a plural suffix using three simple rules.

    * ``...ies`` -> ``...y``        (batteries -> battery)
    * ``...ses`` -> drop ``es``     (lenses -> lens)
    * ``...s``   -> drop ``s`` unless the word ends in ``ss``

    This is a purely suffix-based heuristic, so singular words that end in a
    single "s" are shortened as well (e.g. "lens" -> "len").
    """
    if word.endswith("ies"):
        return f"{word[:-3]}y"
    if word.endswith("ses"):
        return word[:-2]
    has_plain_s = word.endswith("s") and not word.endswith("ss")
    return word[:-1] if has_plain_s else word

def normalize_label(label: str) -> str:
    """Normalise a label with ``normalize_text`` and singularize each word."""
    words = normalize_text(label).split()
    return " ".join(map(singularize, words))

def ngrams(tokens: List[str], max_n: int = 4) -> List[str]:
    """Return all contiguous word n-grams of ``tokens`` as space-joined strings.

    N-grams are listed by increasing size (1 up to ``min(max_n, len(tokens))``)
    and, within a size, from left to right. Duplicates are kept.
    """
    count = len(tokens)
    largest = min(max_n, count)
    return [
        " ".join(tokens[start:start + size])
        for size in range(1, largest + 1)
        for start in range(count - size + 1)
    ]

def best_candidate_from_text(raw_text: str, candidates: List[str]) -> Dict[str, object]:
    """Find the candidate label that best matches some phrase in ``raw_text``.

    The text is normalised and split into word n-grams (1 to 4 words); every
    candidate is compared against every distinct n-gram with
    ``difflib.SequenceMatcher`` and the highest-scoring pair wins. Ties keep
    the earlier candidate / earlier phrase.

    Both sides go through ``normalize_label`` (normalisation + per-word
    singularisation). Previously only the candidates were singularised, so a
    label such as "Lens" (normalised to "len") could never match the text
    word "lens" exactly.

    Args:
        raw_text: Free text to search (e.g. the raw LLM reply).
        candidates: Candidate labels; empty entries are ignored.

    Returns:
        Dict with ``"candidate"`` (original candidate string, ``""`` if none),
        ``"score"`` (float in [0, 1]) and ``"matched_phrase"`` (the normalised
        text n-gram that produced the score).
    """
    best = {"candidate": "", "score": 0.0, "matched_phrase": ""}
    if not raw_text:
        return best

    # Same normalisation pipeline as the candidates below.
    tokens = normalize_label(raw_text).split()
    phrases = list(dict.fromkeys(ngrams(tokens, max_n=4)))  # dedupe, keep order

    cand_norm_map = {c: normalize_label(c) for c in candidates if c}

    for c, c_norm in cand_norm_map.items():
        if not c_norm:
            continue
        for p in phrases:
            score = SequenceMatcher(None, c_norm, p).ratio()
            # Length-mismatch penalty: 1% per character of length difference,
            # capped at 30%.
            score *= (1.0 - min(0.3, abs(len(p) - len(c_norm)) * 0.01))
            if score > best["score"]:
                best = {"candidate": c, "score": score, "matched_phrase": p}
    return best

def draw_label(img, x1, y1, text: str, color: Tuple[int, int, int]):
    """Draw ``text`` in black on a filled ``color`` background near (x1, y1).

    The label is placed just above ``y1``. When there is not enough room above
    (box close to the top edge) the label is shifted down so that both the
    background and the text stay inside the image; previously the text
    baseline could end up at y <= 0 and the label was not visible.

    Args:
        img: Image array drawn on in place by the cv2 calls.
        x1, y1: Anchor point, normally the top-left corner of the box.
        text: Label text.
        color: Fill colour of the label background.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    scale, thickness = 0.45, 1  # 👈 smaller text (was 0.55, 2)
    (tw, th), baseline = cv2.getTextSize(text, font, scale, thickness)
    # The background spans [y_text - th - 6, y_text + baseline]; keep its top at y >= 0.
    y_text = max(th + 6, y1 - 8)
    x_text = max(0, x1)
    cv2.rectangle(img, (x_text, y_text - th - 6), (x_text + tw + 6, y_text + baseline), color, -1)
    cv2.putText(img, text, (x_text + 3, y_text - 3), font, scale, (0, 0, 0), thickness, cv2.LINE_AA)


# ================== YOLO → GEMINI → SEMANTIC MATCH ==================
def detect_with_gemini(image_path: str, yolo_conf: float = YOLO_CONF) -> Dict:
    """Run YOLO on an image, explain each detection with Gemini and annotate it.

    For every kept detection the function saves a crop to
    ``<OUTPUT_DIR>/crops/obj_<i>.png``, requests a bilingual explanation seeded
    with the YOLO label, picks the best-matching label from the reply, and
    draws the box plus a "YOLO → Gemini [match: ...]" label (green when
    matched, yellow otherwise). The annotated image is written to
    ``<OUTPUT_DIR>/annotated.jpg``.

    Changes relative to the earlier version:
      * the image is read and validated before inference, so an unreadable
        path fails fast with the intended error;
      * crops are taken from the untouched image, while drawing happens on a
        copy, so crops no longer contain boxes/labels of earlier detections;
      * a successful Gemini reply is reused for further detections with the
        same YOLO label within this call (failed calls are retried).

    Args:
        image_path: Path of the image to process.
        yolo_conf: Minimum YOLO confidence for a detection to be kept.

    Returns:
        ``{"image": image_path, "detections": [...]}`` with one dict per kept
        detection (index, box, labels, scores, match flag, parsed LLM text).

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot load ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Image not found: {image_path}")

    yolo_results = yolo_model(image_path, conf=yolo_conf)
    res = yolo_results[0]

    summary: Dict = {"image": image_path, "detections": []}
    annotated_path = os.path.join(OUTPUT_DIR, "annotated.jpg")

    if not hasattr(res, "boxes") or res.boxes is None or len(res.boxes) == 0:
        print("[ℹ️] YOLO found no objects — returning empty detections.")
        cv2.imwrite(annotated_path, img)
        return summary

    # Draw on a copy; `img` stays pristine for cropping.
    annotated = img.copy()
    # YOLO label -> (parsed, raw) for replies that came back non-empty.
    gemini_cache = {}

    for i, b in enumerate(res.boxes, start=1):
        conf_det = float(b.conf.cpu().numpy()[0])
        if conf_det < yolo_conf:
            continue

        # Clamp the box to the image and skip degenerate boxes.
        x1, y1, x2, y2 = map(int, b.xyxy.cpu().numpy()[0])
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(img.shape[1], x2), min(img.shape[0], y2)
        if x2 <= x1 or y2 <= y1:
            continue

        yolo_idx = int(b.cls.cpu().numpy()[0]) if hasattr(b, "cls") else -1
        yolo_label = yolo_class_name(yolo_idx)

        # Save crop for traceability (optional)
        crop = img[y1:y2, x1:x2]
        if crop.size == 0:
            continue
        crop_path = os.path.join(OUTPUT_DIR, "crops", f"obj_{i}.png")
        cv2.imwrite(crop_path, crop)

        # Call Gemini (seed with YOLO label for pedagogy); the prompt depends
        # only on the label, so a successful reply is reused for duplicates.
        if yolo_label in gemini_cache:
            llm_parsed, llm_raw = gemini_cache[yolo_label]
        else:
            llm_parsed, llm_raw = get_gemini_explanation(yolo_label)
            if llm_raw:
                gemini_cache[yolo_label] = (llm_parsed, llm_raw)

        # Build candidate pool: YOLO label + optional known class names
        candidate_pool = [yolo_label] + (CLASS_NAMES if CLASS_NAMES else [])
        best = best_candidate_from_text(llm_raw, candidate_pool)
        gemini_label = best["candidate"]
        gemini_score = float(best["score"])
        gemini_phrase = best["matched_phrase"]

        # Match decision
        normalized_equal = normalize_label(gemini_label) == normalize_label(yolo_label) if gemini_label else False
        score_ok = (gemini_score >= STRICT_THRESHOLD) if STRICT_MATCH else True
        matched = bool(normalized_equal and score_ok)
        match_with = "yolo" if matched else "none"

        # Record JSON
        det = {
            "index": i,
            "box": [x1, y1, x2, y2],
            "yolo_label": yolo_label,
            "yolo_confidence": round(conf_det, 3),
            "gemini_label": gemini_label,
            "gemini_match_score": round(gemini_score, 3),
            "gemini_matched_phrase": gemini_phrase,
            "match_with": match_with,
            # Copy so detections sharing a cached reply do not share one dict.
            "llm": dict(llm_parsed)
        }
        summary["detections"].append(det)

        # Draw
        color = (0, 255, 0) if matched else (0, 255, 255)  # green or yellow
        cv2.rectangle(annotated, (x1, y1), (x2, y2), color, 2)
        label_text = f"{yolo_label} → {gemini_label or 'N/A'} [match: {match_with}]"
        draw_label(annotated, x1, y1, label_text, color)

    cv2.imwrite(annotated_path, annotated)
    print("[✅] Annotated image saved:", annotated_path)
    return summary

# ================== RUN ==================
if __name__ == "__main__":
    # Run the whole pipeline on the configured test image.
    summary = detect_with_gemini(TEST_IMAGE)

    # Serialise once; the same text is written to disk and echoed below.
    summary_json = json.dumps(summary, ensure_ascii=False, indent=2)

    summary_path = os.path.join(OUTPUT_DIR, "summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(summary_json)
    print(f"[💾] Saved summary JSON → {summary_path}")

    print("\n🧾 Summary preview:\n", summary_json)



image 1/1 /content/physics_lab_apparatus-7/test/images/IMG_20250925_151735_1_jpg.rf.ebf50bf8effc630f8cef525d59a320fb.jpg: 640x640 1 Helical-Extension-Spring, 1 Lens, 1 Micrometer-Screw-Gauge, 1 Vernier-Caliper, 1 Weight-carrier, 15.6ms
Speed: 2.0ms preprocess, 15.6ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)
[✅] Annotated image saved: /content/outputs/annotated.jpg
[💾] Saved summary JSON → /content/outputs/summary.json

🧾 Summary preview:
 {
  "image": "/content/physics_lab_apparatus-7/test/images/IMG_20250925_151735_1_jpg.rf.ebf50bf8effc630f8cef525d59a320fb.jpg",
  "detections": [
    {
      "index": 1,
      "box": [
        69,
        123,
        202,
        310
      ],
      "yolo_label": "Weight-carrier",
      "yolo_confidence": 0.889,
      "gemini_label": "Weight-carrier",
      "gemini_match_score": 1.0,
      "gemini_matched_phrase": "weight carrier",
      "match_with": "yolo",
      "llm": {
        "english": "A weight-carrier is a simple appar