In [4]:
# car_vector_search.py
# Gereksinimler:
# pip install pandas qdrant-client sentence-transformers scikit-learn unidecode pydantic tqdm

from __future__ import annotations
import os
import re
import math
import uuid
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
import numpy as np
%pip install tqdm
from tqdm import tqdm
%pip install unidecode
from unidecode import unidecode
from pydantic import BaseModel, Field

# Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    VectorParams,
    PointStruct,
    Filter,
    FieldCondition,
    Range,
    MatchValue,
)

%pip install sentence-transformers

# -----------------------------
# 0) Embedder (Sentence-Transformers - CPU varsayılan)
# -----------------------------
from sentence_transformers import SentenceTransformer

class ST_Embedder:
    """
    Thin wrapper around a Sentence-Transformers model.

    - The model is loaded on ``force_device`` (CPU unless told otherwise).
    - Embeddings are requested with ``normalize_embeddings=True``, which
      suits cosine-distance collections.
    """
    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
                 force_device: str = "cpu"):
        self.device = force_device
        self.model_name = model_name
        self.model = SentenceTransformer(model_name, device=self.device)

    def _encode(self, texts: List[str]) -> List[List[float]]:
        """Run the underlying model on ``texts`` and return plain Python lists."""
        vectors = self.model.encode(
            texts,
            convert_to_numpy=True,
            batch_size=64,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return vectors.tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts.

        If encoding fails while the model is on a non-CPU device, the model
        is reloaded on CPU and the batch is retried once; a failure on CPU
        is re-raised unchanged.
        """
        try:
            return self._encode(texts)
        except Exception:
            if self.device == "cpu":
                raise
            # Encoding failed on the accelerator: fall back to CPU and retry.
            self.device = "cpu"
            self.model = SentenceTransformer(self.model_name, device="cpu")
            return self._encode(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        batch = self.embed_documents([text])
        return batch[0]

    def dimension(self) -> int:
        """Return the embedding size, probing with a dummy query if the model
        does not expose ``get_sentence_embedding_dimension``."""
        if not hasattr(self.model, "get_sentence_embedding_dimension"):
            probe = self.embed_query("test")
            return len(probe)
        return self.model.get_sentence_embedding_dimension()


# -----------------------------
# 1) Yardımcılar / Normalizasyon
# -----------------------------
# Canonical value -> variants. match_from_map() tests each variant as a
# substring of the ASCII-lowercased input, so every variant must be written
# in lowercase ASCII (e.g. "duz" for "Düz").
TURKISH_MAP = {
    "otomatik": ["otomatik", "auto", "dct", "edc", "e-cvt", "cvt", "tiptronic", "multitronic", "dsg"],
    # "duz" covers "Düz", the value the listing data uses for manual gearboxes.
    "manuel": ["manuel", "manual", "duz"],
    "benzin": ["benzin", "gasoline", "benzinli"],
    "dizel": ["dizel", "diesel"],
    "lpg": ["lpg", "autogas"],
    # "hibrit" is the usual Turkish spelling of "hybrid".
    "hybrid": ["hibrid", "hibrit", "hybrid"],
    "elektrik": ["elektrik", "electric", "bev", "ev"],
}

def ascii_lower(s: Any) -> str:
    """Transliterate ``s`` to ASCII, trim it and lowercase it.

    Falsy inputs (``None``, ``""``, ``0`` ...) yield the empty string.
    """
    raw = str(s or "")
    transliterated = unidecode(raw)
    return transliterated.strip().lower()

# Multiplier words accepted after a number ("1.3 milyon", "250 bin").
_TO_NUM_MULTIPLIERS = {
    "milyon": 1_000_000.0,
    "mn": 1_000_000.0,
    "m": 1_000_000.0,
    "bin": 1_000.0,
}


def _parse_localized_number(token: str) -> Optional[float]:
    """Parse a digit string whose '.'/',' may be thousands or decimal marks.

    Rules:
    - both separators present: the right-most one is the decimal mark;
    - a single ',' is a decimal mark (Turkish convention);
    - a single '.' is a thousands mark only in the ``ddd.ddd`` shape
      ("655.000"), otherwise a decimal mark ("1.3", "655000.0");
    - a repeated separator is always a thousands mark ("1.250.000").
    """
    if "." in token and "," in token:
        decimal_sep = "." if token.rfind(".") > token.rfind(",") else ","
    elif token.count(",") == 1:
        decimal_sep = ","
    elif token.count(".") == 1 and not re.fullmatch(r"[+-]?\d{1,3}\.\d{3}", token):
        decimal_sep = "."
    else:
        decimal_sep = None

    if decimal_sep is None:
        whole, frac = token, ""
    else:
        whole, _, frac = token.rpartition(decimal_sep)
    whole = whole.replace(".", "").replace(",", "")
    try:
        return float(f"{whole}.{frac}" if frac else whole)
    except ValueError:
        return None


def to_num(text: Any) -> Optional[float]:
    """Convert a price / mileage style value to a float.

    Accepts real numbers as well as strings such as ``"655.000 TL"``,
    ``"209.000 km"``, ``"1,5 milyon"`` or ``"250 bin"``.

    Returns:
        The parsed value, or ``None`` for missing markers (``None``, NaN,
        ``""``, ``"yok"``, ``"-"`` ...) and for text that is not a number.
    """
    if text is None:
        return None
    # Values that are already numeric must not go through the string path:
    # str(655000.0) is "655000.0", whose "." would be mistaken for a
    # separator.
    if isinstance(text, (int, float)) and not isinstance(text, bool):
        value = float(text)
        return None if math.isnan(value) else value

    s = str(text).strip().lower()
    if s in ("nan", "", "none", "yok", "—", "-"):
        return None
    for unit in ("tl", "₺", "km"):
        s = s.replace(unit, "")
    s = re.sub(r"\s+", "", s)

    m = re.fullmatch(r"([+-]?[\d.,]+)(milyon|mn|bin|m)?", s)
    if m is None:
        return None
    value = _parse_localized_number(m.group(1))
    if value is None:
        return None
    return value * _TO_NUM_MULTIPLIERS.get(m.group(2), 1.0)

def year4(x: Any) -> Optional[int]:
    """Return the first standalone 19xx/20xx year found in ``x``, else ``None``."""
    haystack = str(x or "")
    found = re.search(r"\b(19|20)\d{2}\b", haystack)
    if found is None:
        return None
    return int(found.group(0))

def none_if_nan(x: Any):
    """Map ``None`` and float NaN to ``None``; return anything else unchanged.

    Only ``float`` instances (including subclasses) are tested for NaN, so
    strings, ints and other objects pass straight through.
    """
    if x is None:
        return None
    if isinstance(x, float) and math.isnan(x):
        return None
    return x

def make_point_id(raw: Any):
    """
    Qdrant ID: unsigned int veya UUID string olmalı.
    """
    if raw is None:
        return str(uuid.uuid4())
    # int?
    try:
        if isinstance(raw, float):
            if math.isnan(raw):
                return str(uuid.uuid4())
            if float(raw).is_integer() and int(raw) >= 0:
                return int(raw)
            return str(uuid.uuid4())
        iv = int(raw)
        if iv >= 0:
            return iv
    except Exception:
        pass
    # UUID?
    try:
        return str(uuid.UUID(str(raw)))
    except Exception:
        return str(uuid.uuid4())

def match_from_map(value: str, mapping: Dict[str, List[str]]) -> str:
    """Return the first canonical key of ``mapping`` one of whose variants
    occurs as a substring of the ASCII-lowercased ``value``.

    When nothing matches, ``value`` is returned exactly as given.
    """
    haystack = ascii_lower(value)
    hits = (
        canon
        for canon, variants in mapping.items()
        if any(variant in haystack for variant in variants)
    )
    return next(hits, value)


# -----------------------------
# 2) DataFrame Normalize
# -----------------------------
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the listings frame with derived columns.

    Adds ``fiyat_num``, ``km_num``, ``tramer_num``, ``yil_num``,
    ``vites_std``, ``yakit_std`` and ``<col>_key`` search keys. The input
    frame is not modified.

    ``tramer`` is optional: when the column is absent it is created empty,
    so datasets without damage records can still be indexed.

    Raises:
        ValueError: if any other expected column is missing.
    """
    df = df.copy()

    required = [
        "id", "baslik", "konum", "fiyat", "aciklama", "marka", "seri", "model", "yil", "kilometre",
        "yakit_tipi", "vites_tipi", "renk", "arac_durumu", "kasa_tipi", "cekis", "motor_hacmi",
        "motor_gucu", "url",
    ]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Eksik kolon(lar): {missing}")
    if "tramer" not in df.columns:
        df["tramer"] = None

    def clean_text(x: Any) -> str:
        # Missing cells become "" rather than the literal text "nan"/"None".
        if x is None or x is pd.NA or (isinstance(x, float) and math.isnan(x)):
            return ""
        return re.sub(r"\s+", " ", str(x).strip())

    # text columns
    for col in [
        "baslik", "konum", "aciklama", "marka", "seri", "model", "yakit_tipi", "vites_tipi",
        "renk", "arac_durumu", "kasa_tipi", "cekis", "url",
    ]:
        df[col] = df[col].map(clean_text)

    # numeric columns
    df["fiyat_num"] = df["fiyat"].map(to_num)
    df["km_num"] = df["kilometre"].map(to_num)
    df["tramer_num"] = df["tramer"].map(to_num)
    df["yil_num"] = df["yil"].map(year4)

    # standardized gearbox / fuel values
    gear_map = {key: TURKISH_MAP[key] for key in ("otomatik", "manuel")}
    fuel_map = {key: TURKISH_MAP[key] for key in ("benzin", "dizel", "lpg", "hybrid", "elektrik")}

    def yakit_std(v: str) -> str:
        # The first fuel whose variants match wins; unknown fuels keep their
        # ASCII-lowercased text.
        canon = match_from_map(v, fuel_map)
        return canon if canon in fuel_map else ascii_lower(v)

    # Unknown gearbox values keep their original text.
    df["vites_std"] = df["vites_tipi"].map(lambda v: match_from_map(v, gear_map))
    df["yakit_std"] = df["yakit_tipi"].map(yakit_std)

    # search keys
    for col in ["marka", "seri", "model", "konum", "kasa_tipi", "cekis", "renk", "arac_durumu"]:
        df[col + "_key"] = df[col].map(ascii_lower)

    return df


# -----------------------------
# 3) Doc metni + payload
# -----------------------------
def build_doc_text(row: Dict[str, Any]) -> str:
    """Compose the text that is embedded and indexed for one listing.

    Layout: ``"<marka> <seri> <model> <yil> – <yakit>, <vites>, <km> km,
    <kasa>, <konum>. Fiyat: <fiyat>. <aciklama>"`` with all whitespace runs
    collapsed to single spaces.

    Missing values (``None`` / NaN) render as empty text and integral floats
    render without a trailing ``.0``, so the text never contains "nan",
    "None" or "2014.0".
    """
    import math

    def clean(value: Any) -> str:
        if value is None:
            return ""
        if isinstance(value, float):
            if math.isnan(value):
                return ""
            if value.is_integer():
                return str(int(value))
        return str(value).strip()

    marka = clean(row.get("marka"))
    seri = clean(row.get("seri"))
    model = clean(row.get("model"))
    # Prefer the parsed year; fall back to the raw column.
    yil = clean(row.get("yil_num")) or clean(row.get("yil"))
    vites = clean(row.get("vites_tipi"))
    yakit = clean(row.get("yakit_tipi"))
    km = clean(row.get("kilometre"))
    konum = clean(row.get("konum"))
    kasa = clean(row.get("kasa_tipi"))
    fiyat = clean(row.get("fiyat"))
    aciklama = clean(row.get("aciklama"))

    title = f"{marka} {seri} {model} {yil}".strip()
    bullet = f"{yakit}, {vites}, {km} km, {kasa}, {konum}".replace("  ", " ").strip(" ,")
    text = f"{title} – {bullet}. Fiyat: {fiyat}. {aciklama}"
    return " ".join(text.split())

def build_payload(r: Dict[str, Any], text: str) -> Dict[str, Any]:
    """Build the Qdrant payload for one listing row.

    Args:
        r: one row of the listings frame as a dict.
        text: the document text that was embedded for this row.

    NaN values are stored as ``None``. ``yil`` / ``kilometre`` use the parsed
    number when one exists (including a legitimate ``0``) and otherwise fall
    back to the raw column.
    """
    def field(name: str) -> Any:
        return none_if_nan(r.get(name))

    def first_present(*names: str) -> Any:
        # An explicit None test: `a or b` would discard 0 and let NaN through.
        for name in names:
            value = field(name)
            if value is not None:
                return value
        return None

    return {
        # raw fields
        "id": field("id"),
        "baslik": field("baslik"),
        "konum": field("konum"),
        "fiyat": field("fiyat"),
        "marka": field("marka"),
        "seri": field("seri"),
        "model": field("model"),
        "yil": first_present("yil_num", "yil"),
        "kilometre": first_present("km_num", "kilometre"),
        "yakit_tipi": field("yakit_tipi"),
        "vites_tipi": field("vites_tipi"),
        "renk": field("renk"),
        "arac_durumu": field("arac_durumu"),
        "kasa_tipi": field("kasa_tipi"),
        "cekis": field("cekis"),
        "motor_hacmi": field("motor_hacmi"),
        "motor_gucu": field("motor_gucu"),
        "tramer": field("tramer"),
        "url": field("url"),

        # numeric & key fields
        "fiyat_num": field("fiyat_num"),
        "km_num": field("km_num"),
        "yil_num": field("yil_num"),
        "tramer_num": field("tramer_num"),
        "marka_key": ascii_lower(field("marka")),
        "seri_key": ascii_lower(field("seri")),
        "model_key": ascii_lower(field("model")),
        "konum_key": ascii_lower(field("konum")),

        # search text
        "text": text,
    }


# -----------------------------
# 4) Qdrant collection garanti
# -----------------------------
def ensure_collection(client: QdrantClient, collection: str, dim: int, distance: Distance = Distance.COSINE):
    """Create ``collection`` if it does not exist; otherwise check its vector size.

    Raises:
        ValueError: if the collection exists and its readable vector size
            differs from ``dim``.
    """
    known = {c.name for c in client.get_collections().collections}
    if collection not in known:
        client.create_collection(
            collection_name=collection,
            vectors_config=VectorParams(size=dim, distance=distance),
        )
        return

    info = client.get_collection(collection)
    try:
        existing_dim = info.config.params.vectors.size  # type: ignore
    except Exception:
        # The size is not exposed at this path; skip the consistency check.
        existing_dim = None

    if existing_dim is None or existing_dim == dim:
        return
    raise ValueError(f"Koleksiyon '{collection}' farklı boyutta: {existing_dim} ≠ {dim}")


# -----------------------------
# 5) DF → Qdrant Upsert
# -----------------------------
def df_to_points(df: pd.DataFrame, embedder: ST_Embedder, collection: str,
                 client: QdrantClient, batch_size: int = 256):
    """Embed every row of ``df`` and upsert it into ``collection``.

    The collection is created (or its vector size checked) first. Rows are
    processed in chunks of ``batch_size``; each chunk is embedded and sent in
    one upsert call.

    Missing ``fiyat_num`` / ``km_num`` / ``yil_num`` columns are derived on a
    private copy, so the caller's DataFrame is never modified.
    """
    derived = {
        "fiyat_num": ("fiyat", to_num),
        "km_num": ("kilometre", to_num),
        "yil_num": ("yil", year4),
    }
    pending = {target: spec for target, spec in derived.items() if target not in df.columns}
    if pending:
        df = df.copy()
        for target, (source, parser) in pending.items():
            df[target] = df[source].map(parser)

    dim = embedder.dimension()
    ensure_collection(client, collection, dim)

    rows = df.to_dict(orient="records")
    for start in tqdm(range(0, len(rows), batch_size), desc="upserting"):
        chunk = rows[start:start + batch_size]
        texts = [build_doc_text(r) for r in chunk]
        vecs = embedder.embed_documents(texts)

        points = [
            PointStruct(id=make_point_id(r.get("id")), vector=v, payload=build_payload(r, t))
            for r, v, t in zip(chunk, vecs, texts)
        ]
        client.upsert(collection_name=collection, points=points)


# -----------------------------
# 6) Filtre modeli (sorgu → payload filter)
# -----------------------------
class QueryFilters(BaseModel):
    # Structured filters extracted from a user query. Every field is
    # optional; a field left as None adds no constraint.

    # Exact-match filters, compared against the payload's *_key fields.
    marka: Optional[str] = None
    seri: Optional[str] = None
    model: Optional[str] = None
    konum: Optional[str] = None
    # Inclusive bounds on the payload's fiyat_num.
    fiyat_min: Optional[float] = None
    fiyat_max: Optional[float] = None
    # Inclusive bounds on the payload's yil_num.
    yil_min: Optional[int] = None
    yil_max: Optional[int] = None

def build_qdrant_filter(f: QueryFilters) -> Optional[Filter]:
    """Translate ``f`` into a Qdrant ``Filter`` whose conditions must all hold.

    Text filters are normalised with ``ascii_lower`` - the same function that
    builds the payload's ``*_key`` fields - so a value such as "İstanbul"
    matches the stored key "istanbul".

    Returns:
        The filter, or ``None`` when ``f`` sets no constraint.
    """
    must: List[FieldCondition] = []

    def eq(field: str, val: Optional[str]):
        key = ascii_lower(val)
        if key:
            must.append(FieldCondition(key=field, match=MatchValue(value=key)))

    def rng(field: str, gte=None, lte=None):
        bounds = {}
        if gte is not None:
            bounds["gte"] = float(gte)
        if lte is not None:
            bounds["lte"] = float(lte)
        if bounds:
            must.append(FieldCondition(key=field, range=Range(**bounds)))

    eq("marka_key", f.marka)
    eq("seri_key", f.seri)
    eq("model_key", f.model)
    eq("konum_key", f.konum)
    rng("fiyat_num", gte=f.fiyat_min, lte=f.fiyat_max)
    rng("yil_num", gte=f.yil_min, lte=f.yil_max)

    return Filter(must=must) if must else None


# -----------------------------
# 7) HybridSearcher (Dense + Sparse TF-IDF + RRF)
# -----------------------------
from sklearn.feature_extraction.text import TfidfVectorizer

def _is_valid_text(s: str, min_len: int = 3) -> bool:
    return isinstance(s, str) and len(s.strip()) >= min_len

class HybridSearcher:
    """Hybrid retrieval over one Qdrant collection.

    Combines a dense vector search (run by Qdrant) with a sparse TF-IDF
    search (an in-memory index built lazily from the payloads' ``text``
    field) and merges both rankings with Reciprocal Rank Fusion.
    """

    # Id of the dummy document used when the collection has no usable text.
    _PLACEHOLDER_ID = "__placeholder__"
    # Rank assigned to a document that is absent from one of the two lists.
    _MISSING_RANK = 10**6

    def __init__(self, client: "QdrantClient", collection: str, embedder: "ST_Embedder"):
        self.client = client
        self.collection = collection
        self.embedder = embedder
        self._tfidf: Optional["TfidfVectorizer"] = None
        self._sparse = None
        self._ids: Optional[np.ndarray] = None   # dtype=object
        self._texts: Optional[List[str]] = None
        self._payloads: Dict[Any, Dict] = {}     # id -> payload (cache)

    def _ensure_sparse_index(self, max_points: int = 20000):
        """Build the TF-IDF index on first use; later calls return at once.

        Scrolls the collection page by page and stops after the page on
        which ``max_points`` texts have been collected.
        """
        if self._tfidf is not None:
            return

        texts: List[str] = []
        ids: List[Any] = []
        seen = set()
        next_offset = None

        while True:
            recs, next_offset = self.client.scroll(
                collection_name=self.collection,
                with_payload=True,
                limit=1024,
                offset=next_offset,
            )
            if not recs:
                break

            for p in recs:
                pid = p.id  # keep the original type (int or UUID string)
                if pid in seen:
                    continue
                pl = p.payload or {}
                t = pl.get("text", "")
                if _is_valid_text(t):
                    texts.append(t)
                    ids.append(pid)
                    self._payloads[pid] = pl
                    seen.add(pid)

            if next_offset is None or len(texts) >= max_points:
                break

        if not texts:
            # Index a dummy document so the vectorizer is never fitted on an
            # empty corpus; sparse_search() never returns it.
            texts = ["placeholder"]
            ids = [self._PLACEHOLDER_ID]
            self._payloads[self._PLACEHOLDER_ID] = {"text": "placeholder"}

        self._tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1, 2))
        self._sparse = self._tfidf.fit_transform(texts)
        self._ids = np.array(ids, dtype=object)
        self._texts = texts

    def dense_search(self, query: str, f: Optional["QueryFilters"], top_k: int = 50):
        """Vector search in Qdrant; returns ``[(id, score, payload), ...]``."""
        qv = self.embedder.embed_query(query)
        qf = build_qdrant_filter(f) if f is not None else None
        res = self.client.search(
            collection_name=self.collection,
            query_vector=qv,
            query_filter=qf,
            limit=top_k,
            with_payload=True,
            with_vectors=False,
        )
        return [(r.id, float(r.score), r.payload or {}) for r in res]

    @staticmethod
    def _passes_filters(pl: Dict, f: "QueryFilters") -> bool:
        """Apply ``f`` to one cached payload, mirroring build_qdrant_filter."""
        for attr in ("marka", "seri", "model", "konum"):
            # Same normalisation as the payload's *_key fields.
            wanted = ascii_lower(getattr(f, attr))
            if wanted and (pl.get(attr + "_key") or "") != wanted:
                return False
        for key, low, high in (
            ("fiyat_num", f.fiyat_min, f.fiyat_max),
            ("yil_num", f.yil_min, f.yil_max),
        ):
            value = pl.get(key)
            if low is not None and (value is None or value < low):
                return False
            if high is not None and (value is None or value > high):
                return False
        return True

    def sparse_search(self, query: str, f: Optional["QueryFilters"], top_k: int = 200):
        """TF-IDF search; returns up to ``top_k`` ``(id, score, payload)`` tuples.

        Only documents with a positive similarity are returned, and filters
        are applied before the ``top_k`` cut, so matching documents are not
        lost behind higher-scoring ones that fail the filters.
        """
        self._ensure_sparse_index()
        if top_k <= 0:
            return []

        q = self._tfidf.transform([query])
        sim = (q @ self._sparse.T).toarray().ravel()

        # Documents sharing no term with the query carry no lexical signal
        # and must not receive a rank.
        candidates = np.flatnonzero(sim > 0)
        if candidates.size == 0:
            return []
        order = candidates[np.argsort(-sim[candidates], kind="stable")]

        results = []
        for i in order:
            pid = self._ids[i]
            if isinstance(pid, str) and pid == self._PLACEHOLDER_ID:
                continue
            pl = self._payloads.get(pid, {})
            if f is not None and not self._passes_filters(pl, f):
                continue
            results.append((pid, float(sim[i]), pl))
            if len(results) >= top_k:
                break
        return results

    @staticmethod
    def rrf_merge(dense: List[Tuple[Any, float, Dict]], sparse: List[Tuple[Any, float, Dict]],
                  k: float = 60.0, top_k: int = 50):
        """Merge two result lists with Reciprocal Rank Fusion.

        Each id scores ``1/(k + rank_dense) + 1/(k + rank_sparse)``; an id
        absent from a list gets rank ``_MISSING_RANK`` there. When an id is
        in both lists the dense payload is used. Ties keep first-seen order
        (dense results first), so the output is deterministic.
        """
        def ranks(lst):
            ordered = sorted(lst, key=lambda item: -item[1])
            return {pid: rank for rank, (pid, _, _) in enumerate(ordered, start=1)}

        rd = ranks(dense)
        rs = ranks(sparse)

        # First occurrence wins, dense before sparse; dict order also fixes
        # the tie-break order below.
        payload_by_id: Dict[Any, Dict] = {}
        for pid, _, pl in list(dense) + list(sparse):
            payload_by_id.setdefault(pid, pl)

        missing = HybridSearcher._MISSING_RANK
        merged = []
        for pid, pl in payload_by_id.items():
            r1 = rd.get(pid, missing)
            r2 = rs.get(pid, missing)
            rrf = 1.0 / (k + r1) + 1.0 / (k + r2)
            merged.append((pid, rrf, pl or {}))
        merged.sort(key=lambda x: -x[1])
        return merged[:top_k]

    def search(self, query_text: str, f: Optional["QueryFilters"] = None, top_k: int = 30):
        """Run dense and sparse search and return the fused top ``top_k``."""
        dense = self.dense_search(query_text, f, top_k=top_k)
        sparse = self.sparse_search(query_text, f, top_k=top_k * 4)
        return self.rrf_merge(dense, sparse, top_k=top_k)


# -----------------------------
# 8) (Opsiyonel) Heuristik parser örneği
# -----------------------------
def heuristic_parse(user_text: str) -> QueryFilters:
    """Extract coarse filters from a free-text query with regex heuristics.

    The text is transliterated to lowercase ASCII first, so every pattern
    below is written in ASCII ("altinda", not "altında").

    Recognises:
    - a maximum price: a number followed by a limit word, or "maks(imum) N";
    - a minimum year: the first 19xx/20xx year in the text;
    - a city from a short fixed list;
    - the Opel Astra as the only marka/seri hint.
    """
    s = ascii_lower(user_text)

    # maximum price
    fiyat_max = None
    # "ust" must not be followed by "u": "2018 ustu" means "newer than 2018",
    # not a price cap.
    m = re.search(
        r"(\d[\d\.\, ]+)\s*(tl|₺|try|lira)?\s*(?:max|ust(?!u)|tavan|kadar|altinda|asmadan)",
        s,
    )
    if not m:
        m = re.search(r"maks(?:imum)?\s*(\d[\d\.\, ]+)", s)
    if m:
        fiyat_max = to_num(m.group(1))

    # minimum year: the first year mentioned
    year_match = re.search(r"\b(19|20)\d{2}\b", s)
    yil_min = int(year_match.group(0)) if year_match else None

    # location (simple city list)
    konum = None
    m = re.search(r"(istanbul|ankara|izmir|bursa|antalya|adana|konya|kayseri|kocaeli|mersin|gaziantep)", s)
    if m:
        konum = m.group(1)

    # very simple marka/seri hint (e.g. Astra)
    marka = None
    seri = None
    if "astra" in s:
        marka = "opel"
        seri = "astra"

    return QueryFilters(marka=marka, seri=seri, konum=konum, fiyat_max=fiyat_max, yil_min=yil_min)


# -----------------------------
# 9) Örnek kullanım (çalıştırmayın)
# -----------------------------
"""
# 0) Qdrant bağlantısı
client = QdrantClient(url="http://localhost:6333", prefer_grpc=False)

# 1) DataFrame yükle ve normalize et
df = pd.read_parquet("ilanlar.parquet")   # veya csv
df = normalize_df(df)

# 2) Embedder (CPU)
embedder = ST_Embedder("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", force_device="cpu")

# 3) Upsert
collection = "car_listings_st"   # 384-dim modeller için ayrı koleksiyon önerilir
df_to_points(df, embedder, collection, client, batch_size=256)

# 4) Arama
searcher = HybridSearcher(client, collection, embedder)
user_query = "İstanbul’da 1.3 milyon TL’ye kadar, 2018 üzeri otomatik benzinli Astra"
filters = heuristic_parse(user_query)  # veya LLM tabanlı parser
results = searcher.search(user_query, filters, top_k=20)

# results: [(id, score, payload), ...]
# payload['marka'], payload['model'], payload['yil_num'], payload['fiyat_num'], payload['url'] ...
"""


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.16.2-cp312-cp312-ma

  from .autonotebook import tqdm as notebook_tqdm


'\n# 0) Qdrant bağlantısı\nclient = QdrantClient(url="http://localhost:6333", prefer_grpc=False)\n\n# 1) DataFrame yükle ve normalize et\ndf = pd.read_parquet("ilanlar.parquet")   # veya csv\ndf = normalize_df(df)\n\n# 2) Embedder (CPU)\nembedder = ST_Embedder("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", force_device="cpu")\n\n# 3) Upsert\ncollection = "car_listings_st"   # 384-dim modeller için ayrı koleksiyon önerilir\ndf_to_points(df, embedder, collection, client, batch_size=256)\n\n# 4) Arama\nsearcher = HybridSearcher(client, collection, embedder)\nuser_query = "İstanbul’da 1.3 milyon TL’ye kadar, 2018 üzeri otomatik benzinli Astra"\nfilters = heuristic_parse(user_query)  # veya LLM tabanlı parser\nresults = searcher.search(user_query, filters, top_k=20)\n\n# results: [(id, score, payload), ...]\n# payload[\'marka\'], payload[\'model\'], payload[\'yil_num\'], payload[\'fiyat_num\'], payload[\'url\'] ...\n'

In [5]:
# Connect to the Qdrant instance at localhost:6333 without gRPC.
client = QdrantClient(url="http://localhost:6333", prefer_grpc=False)

In [6]:
# Uğur: load the listings DataFrame here.
df = pd.read_parquet("../data/arabam_ilanlar.parquet")

# Report the frame's shape and preview the first rows.
print("✅ Veri boyutu:", df.shape)
df.head()


✅ Veri boyutu: (65839, 20)


Unnamed: 0,id,baslik,konum,fiyat,aciklama,marka,seri,model,yil,kilometre,yakit_tipi,vites_tipi,renk,arac_durumu,kasa_tipi,cekis,motor_hacmi,motor_gucu,url,Araç_Yası
0,1,YAVUZLAR'DAN 2014 ALFA ROMEO GİULİETTA 1.6 JTD...,"Karşıyaka Mh. Kepez, Antalya",655000,\n,Alfa Romeo,Giulietta,1.6 JTD Distinctive,2014,209000,Dizel,Düz,Beyaz,İkinci El,Hatchback/5,Önden Çekiş,1598,105,https://www.arabam.com/ilan/galeriden-satilik-...,11
1,2,2006 ALFA ROMEO 156 TS,"Soğanlı Mh. Osmangazi, Bursa",414500,ES ES OTOMOTİV DEN \n\n\n \n\n\nSATILIK \n\n\n...,Alfa Romeo,156,1.6 TS Distinctive,2006,171000,Benzin,Düz,Şampanya,İkinci El,Sedan,Önden Çekiş,1600,125,https://www.arabam.com/ilan/galeriden-satilik-...,19
2,3,Sahibinden Alfa Romeo Giulietta 1.4 TB MultiAi...,"Karşıyaka Mh. Karataş, Adana",735000,Ev almayı düşündüğüm için aracımı satışa çıkar...,Alfa Romeo,Giulietta,1.4 TB MultiAir Distinctive,2011,157100,Benzin,Düz,Siyah,İkinci El,Hatchback/5,Önden Çekiş,1368,170,https://www.arabam.com/ilan/sahibinden-satilik...,14
3,4,Sahibinden Alfa Romeo Giulietta 1.4 TB Progres...,"Esenkent Mh. Esenyurt, İstanbul",900000,"-2016 NİSAN ÇIKIŞLIDIR.-ORJİNAL 90 BİN KM, SIF...",Alfa Romeo,Giulietta,1.4 TB Progression Plus,2015,90800,Benzin,Düz,Kırmızı,İkinci El,Hatchback/5,Önden Çekiş,1368,120,https://www.arabam.com/ilan/sahibinden-satilik...,10
4,5,A L F İ S T,"Deliktaş Mh. Pamukkale, Denizli",800000,,Alfa Romeo,Giulietta,1.6 JTD Distinctive,2014,200000,Dizel,Düz,Beyaz,İkinci El,Hatchback/5,Önden Çekiş,1598,105,https://www.arabam.com/ilan/sahibinden-satilik...,11


In [None]:
# Check the expected columns and add the derived numeric / key columns.
df = normalize_df(df)

In [None]:
# Load the multilingual MiniLM sentence-embedding model (CPU by default).
embedder = ST_Embedder("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [None]:
collection = "car_listings_st"   # a separate collection is recommended for 384-dim models
# Embed every listing and upsert it into the collection, 256 rows per batch.
df_to_points(df, embedder, collection, client, batch_size=256)

In [None]:
# Example query: parse filters heuristically, then run the hybrid search.
searcher = HybridSearcher(client, collection, embedder)
user_query = "İstanbul’da 1.3 milyon TL’ye kadar, 2018 üzeri otomatik benzinli Astra"
filters = heuristic_parse(user_query)  
# results is a list of (id, score, payload) tuples.
results = searcher.search(user_query, filters, top_k=20)