# Load CSV into Qdrant

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/thierrypdamiba/qdrant-etl-cookbook/blob/main/notebooks/etl/csv_to_qdrant.ipynb)

Read CSV data into a pandas DataFrame, generate embeddings for the text column with sentence-transformers, and upsert the rows into a Qdrant collection in batches.

In [None]:
!pip install -q qdrant-client sentence-transformers pandas

In [None]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from sentence_transformers import SentenceTransformer

In [None]:
# Qdrant client: ":memory:" is used here for local testing.
client = QdrantClient(location=":memory:")
# Embedding model used for both the CSV rows and the search query below.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
# Demo input: a small CSV held in a string, so the notebook needs no external file.
import io

csv_data = """text,category
Qdrant is a vector similarity search engine,database
It provides a production-ready service with a convenient API,api
Vector databases store high-dimensional embeddings,database
Sentence transformers convert text to vectors,embeddings
HNSW is an efficient approximate nearest neighbor algorithm,algorithms
"""

# Wrap the string in a file-like buffer so read_csv can parse it like a file.
csv_buffer = io.StringIO(csv_data)
df = pd.read_csv(csv_buffer)
print(f"Loaded {len(df)} rows")
df.head()

In [None]:
# Create the collection only if it is missing, so this cell can be re-run
# without failing on an "already exists" error.
if not client.collection_exists("csv_demo"):
    client.create_collection(
        collection_name="csv_demo",
        vectors_config=VectorParams(
            # Ask the model for its embedding size instead of hard-coding 384,
            # so the collection still matches if a different model is loaded.
            size=model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        ),
    )

In [None]:
# Batch embed and upsert.
# The DataFrame is walked in slices of `batch_size` rows; each slice is
# embedded in one model call and sent to Qdrant in one upsert call.
batch_size = 100
for start in range(0, len(df), batch_size):
    batch = df.iloc[start : start + batch_size]
    embeddings = model.encode(batch["text"].tolist()).tolist()
    # One plain dict per row (all CSV columns become the payload). This avoids
    # building a pandas Series for every row, as iterrows() would.
    payloads = batch.to_dict(orient="records")

    # Point ids are the row's 0-based position in `df`: enumerate starts at
    # the batch offset so ids continue across batches.
    points = [
        PointStruct(id=point_id, vector=vector, payload=payload)
        for point_id, (vector, payload) in enumerate(
            zip(embeddings, payloads), start=start
        )
    ]
    client.upsert(collection_name="csv_demo", points=points)

print(f"Loaded {len(df)} records into Qdrant")

In [None]:
# Sanity check: run a semantic query against the loaded collection.
query = "How does vector search work?"
# Embed the query with the same model as the rows and convert it to a plain list.
query_vector = model.encode(query).tolist()

# Request the three best-matching points.
top_k = 3
response = client.query_points(collection_name="csv_demo", query=query_vector, limit=top_k)
results = response.points

# Print each hit's score next to the text stored in its payload.
for r in results:
    print(f"Score: {r.score:.4f} | {r.payload['text']}")