# CLIP (Ollama) Embedding inklusive Metadaten inklusive Kurzzusammenfassung der Bilder mit QwenVl

In [29]:
# ollama pull clip
# ollama pull qwen3-vl:8b

# pip install pillow tqdm 

In [30]:
# Bibliotheken
import os
import json
import base64
import requests
from pathlib import Path
from PIL import Image
from io import BytesIO
from tqdm import tqdm

In [31]:
# Konfigurations Ordner und Ollama Clip Modell
IMAGE_ROOT = Path("theorie")  
OUTPUT_FILE = "image_embeddings.json"

CLIP_MODEL = "clip"                   # für Embeddings
VISION_MODEL = "qwen3-vl:8b"          # für Bildbeschreibung (kurzer Inhalt)

OLLAMA_URL = "http://localhost:11434/api/generate"

In [32]:
# alle jpg Bilder finden
def find_images(root: Path):
    exts = [".jpg", ".jpeg", ".png"]
    return [
        p for p in root.rglob("*")
        if p.is_file() and p.suffix.lower() in exts
    ]


In [33]:
# Bild laden & zu base64 konvertieren
def to_b64(path: Path):
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

In [34]:
# Clip Embedding berechnen 
def get_clip_embedding(b64_img: str):
    payload = {
        "model": CLIP_MODEL,
        "images": [b64_img],
        "prompt": ""
    }

    response = requests.post(OLLAMA_URL, json=payload, stream=True)
    embedding = None

    for line in response.iter_lines():
        if not line:
            continue
        j = json.loads(line.decode("utf-8"))
        if "embedding" in j:
            embedding = j["embedding"]

    return embedding


In [35]:
# Qwen3-VL (Ollama-LLM): Bildinhalt automatisch beschreiben

def describe_image_qwen(b64_img: str):
    prompt = (
        "Beschreibe dieses handschriftliche Mathematikbild sehr kurz und präzise. "
        "Gib nur eine kurze Zusammenfassung, z. B. 'Lineare Gleichung', 'Integralaufgabe', "
        "'Graph der Normalverteilung', 'Matrixmultiplikation', 'Ableitung einer Funktion'. "
        "Maximal 1–2 Sätze."
    )

    payload = {
        "model": VISION_MODEL,
        "prompt": prompt,
        "images": [b64_img]
    }

    response = requests.post(OLLAMA_URL, json=payload, stream=True)
    summary = ""

    for line in response.iter_lines():
        if not line:
            continue
        j = json.loads(line.decode("utf-8"))
        if "response" in j:
            summary += j["response"]

    return summary.strip()

### Main pipeline: Embedding mit Metadaten & LLM Zusammenfassungen

In [36]:
def main():
    images = find_images(IMAGE_ROOT)
    print("Gefundene Bilder:", len(images))

    out = []

    for img in tqdm(images, desc="Embedding"):
        try:
            b64 = to_b64(img)

            # 1. Embedding
            emb = get_clip_embedding(b64)
        

            # 2. Kurzbeschreibung (Qwen3-VL)
            summary = describe_image_qwen(b64)

            # 3. Metadaten pro Bild
            out.append({
                "embedding": emb,
                "metadata": {
                    "path": str(img),
                    "folder": str(img.parent),
                    "filename": img.name,
                    "summary": summary,
                    "type": "image"
                }
            })

        except Exception as e:
            print(f"Fehler bei {img}: {e}")

    with open(OUTPUT_FILE, "w") as f:
        json.dump(out, f, indent=2)

    print("FERTIG – gespeichert in:", OUTPUT_FILE)

if __name__ == "__main__":
    main()

Gefundene Bilder: 428


Embedding: 100%|██████████| 428/428 [00:06<00:00, 67.92it/s]

FERTIG – gespeichert in: image_embeddings.json





### Hauptprozess: Basic Modell

In [37]:
'''
def main():
    images = find_all_images(IMAGE_ROOT)
    print(f"Gefundene Bilder: {len(images)}")

    results = []

    for img_path in tqdm(images, desc="Embedding-Bilder"):
        try:
            b64 = image_to_base64(img_path)
            emb = get_clip_embedding(b64)

            # Metadaten
            metadata = {
                "path": str(img_path),
                "folder": str(img_path.parent),
                "filename": img_path.name,
                "type": "image",
                "modality": "vision",
            }

            results.append({
                "embedding": emb,
                "metadata": metadata,
            })

        except Exception as e:
            print(f"Fehler bei {img_path}: {e}")

    # Speichern
    with open(OUTPUT_FILE, "w") as f:
        json.dump(results, f, indent=2)

    print(f"\nFERTIG! Embeddings gespeichert in: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
'''

'\ndef main():\n    images = find_all_images(IMAGE_ROOT)\n    print(f"Gefundene Bilder: {len(images)}")\n\n    results = []\n\n    for img_path in tqdm(images, desc="Embedding-Bilder"):\n        try:\n            b64 = image_to_base64(img_path)\n            emb = get_clip_embedding(b64)\n\n            # Metadaten\n            metadata = {\n                "path": str(img_path),\n                "folder": str(img_path.parent),\n                "filename": img_path.name,\n                "type": "image",\n                "modality": "vision",\n            }\n\n            results.append({\n                "embedding": emb,\n                "metadata": metadata,\n            })\n\n        except Exception as e:\n            print(f"Fehler bei {img_path}: {e}")\n\n    # Speichern\n    with open(OUTPUT_FILE, "w") as f:\n        json.dump(results, f, indent=2)\n\n    print(f"\nFERTIG! Embeddings gespeichert in: {OUTPUT_FILE}")\n\n\nif __name__ == "__main__":\n    main()\n'

Vectorstore am besten Chorma wsl