# In [None]:
# 1) IMPORT  ──────────────────────────────────────────────────────
import os, re, torch, pandas as pd
from PIL import Image
from transformers import AutoModel, AutoImageProcessor
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from qdrant_client.http.exceptions import UnexpectedResponse
from dotenv import load_dotenv

# Let python-dotenv populate the process environment first, then read the
# Qdrant API key from it (the value is None when the variable is not set).
load_dotenv()
QDRANT_URL = "https://cd8db105-544d-457f-aa1a-97d4475c1f56.europe-west3-0.gcp.cloud.qdrant.io"
QDRANT_API = os.environ.get("QDRANT_API")

# 2) DINOv2  ──────────────────────────────────────────────
# Both the backbone and its image processor are loaded from the same
# checkpoint name; the model is switched to evaluation mode right away
# because it is only used to compute embeddings.
model_name = "facebook/dinov2-large"
model = AutoModel.from_pretrained(model_name)
model.eval()
processor = AutoImageProcessor.from_pretrained(model_name)

# 3) IMAGES SELECTION  ─────────────────────────────────
base_directory = "images"
# Keep only regular, non-hidden files, in sorted order. A raw os.listdir()
# also returns sub-directories and hidden entries (e.g. `.ipynb_checkpoints`,
# `.DS_Store`), which cannot be opened as images further down.
all_images = sorted(
    name
    for name in os.listdir(base_directory)
    if not name.startswith(".")
    and os.path.isfile(os.path.join(base_directory, name))
)
# Keep the "<directory>/<file>" form: these strings are stored as the
# `image_url` payload and are parsed by extract_author().
paths = [f"{base_directory}/{name}" for name in all_images]

# 4) DATAFRAME AND PAYLOAD  ───────────────────────────────────────
def extract_author(path: str) -> str:
    """Derive the author name from an image path.

    The file name is expected to look like ``<First>_<Last>_<number>.<ext>``:
    the underscore-separated parts that come before the first part containing
    a digit are joined with spaces and returned.

    Args:
        path: Path to the image. Only the final component (the file name) is
            inspected, so the path may have any number of directories, or none.

    Returns:
        The author name restricted to ASCII letters and spaces; an empty
        string when the file name starts with a part containing a digit.
    """
    # Use the file name itself rather than a fixed "/"-split index, so bare
    # file names and nested directories both work.
    file_name = os.path.basename(path)
    # Drop a trailing ".ext" so it cannot leak into the author when the file
    # name has no numeric part (e.g. "Claude_Monet.jpg").
    stem = re.sub(r"\.[A-Za-z0-9]+$", "", file_name)

    author = []
    for part in stem.split("_"):
        # The first part containing a digit marks the end of the author name.
        if re.search(r"\d", part):
            break
        author.append(part)

    # NOTE: everything except ASCII letters and spaces is removed, so hyphens
    # and accented characters are dropped from the name.
    return re.sub(r"[^a-zA-Z ]+", "", " ".join(author)).strip()

def embed_image(path: str):
    """Compute one embedding vector for the image stored at ``path``.

    The image is converted to RGB, run through the module-level ``processor``
    and ``model``, and the model's ``last_hidden_state`` is averaged over
    dimension 1 (presumably the token axis, i.e. all tokens are pooled —
    confirm against the model documentation).

    Args:
        path: Path of the image file to embed.

    Returns:
        A flattened (1-D) NumPy array holding the pooled embedding.
    """
    # Image.open() keeps the underlying file open; the context manager closes
    # it as soon as the RGB copy has been produced.
    with Image.open(path) as img:
        rgb = img.convert("RGB")

    # Put the inputs on the same device as the model so the function keeps
    # working if the model is moved off the CPU.
    inputs = processor(images=rgb, return_tensors="pt").to(model.device)

    # Gradients are never needed for embedding extraction.
    with torch.no_grad():
        emb = model(**inputs).last_hidden_state.mean(dim=1)
    return emb.cpu().numpy().flatten()

# One row per image: the path, the author parsed from the file name, and the
# embedding computed for that file. `assign` evaluates the columns in order,
# each one derived from the `image_url` column.
payloads = pd.DataFrame({"image_url": paths}).assign(
    author=lambda frame: frame["image_url"].apply(extract_author),
    embedding=lambda frame: frame["image_url"].apply(embed_image),
)

# 5) CONNECTION TO QDRANT  ───────────────────────────────────────
qclient = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API)
collection_name = "dino_embedding_collection"

# Make sure the collection exists and pick the first point id to use.
# Only the existence probe lives inside the `try`, so a 404 coming from any
# other call cannot be mistaken for "collection is missing".
try:
    qclient.get_collection(collection_name)
except UnexpectedResponse as err:
    if err.status_code != 404:
        raise
    # Collection not found: create it, sized after the first embedding
    # (positional lookup, independent of the DataFrame's index labels).
    qclient.create_collection(
        collection_name,
        vectors_config=VectorParams(
            size=payloads["embedding"].iloc[0].shape[0],
            distance=Distance.COSINE,
        ),
    )
    current_id = 0
else:
    # Existing collection: new ids continue after the current point count.
    # NOTE(review): this only yields unused ids while the stored ids are
    # exactly 0..count-1; after deletions (or ids written by another tool)
    # the upsert below would overwrite existing points — confirm this holds.
    current_id = qclient.count(collection_name, exact=True).count

# 6) POINTSTRUCT  ──────────────────────────────────────
# Upper bound on the number of points sent per request, so that a large image
# set is not serialized into one oversized upsert call.
UPSERT_BATCH_SIZE = 128

# Build the per-row payload dicts once (every column except the embedding)
# instead of dropping the column again for each row.
metadata_rows = payloads.drop(columns="embedding").to_dict(orient="records")

# One point per image; ids continue from `current_id`.
records = [
    PointStruct(
        id=current_id + offset,
        payload=row,
        vector=embedding.tolist(),
    )
    for offset, (row, embedding) in enumerate(
        zip(metadata_rows, payloads["embedding"])
    )
]

# Upload in fixed-size slices; nothing is sent when there are no records.
for start in range(0, len(records), UPSERT_BATCH_SIZE):
    qclient.upsert(
        collection_name,
        points=records[start:start + UPSERT_BATCH_SIZE],
    )