# Semantic Code Search with Qdrant

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/thierrypdamiba/qdrant-etl-cookbook/blob/main/notebooks/etl/code_search.ipynb)

Load code structures from a JSONL file, embed them with sentence-transformers, index them in Qdrant, and then search the code by intent using natural-language queries.

In [None]:
!pip install -q qdrant-client sentence-transformers requests

In [None]:
import json
import requests
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance, PayloadSchemaType
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence-embedding model used both to index the code structures and to
# embed the natural-language queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Qdrant client created with the special ":memory:" location
# (presumably nothing is persisted once the kernel stops; confirm in the qdrant-client docs).
client = QdrantClient(":memory:")

In [None]:
# Download Qdrant's code structures dataset (JSON Lines: one JSON object per line)
url = "https://storage.googleapis.com/tutorial-attachments/code-search/structures.jsonl"

records = []
# The context manager releases the streamed connection even if parsing fails;
# the timeout keeps the cell from hanging forever on a stalled connection.
with requests.get(url, stream=True, timeout=30) as response:
    # Fail fast on HTTP errors instead of feeding an error page to json.loads.
    response.raise_for_status()
    for line in response.iter_lines():
        if line:  # skip blank lines between records
            records.append(json.loads(line))

print(f"Downloaded {len(records)} code structures")
if records:
    # Peek at the first record to show the available fields.
    sample = records[0]
    # `or 'N/A'` also covers a signature that is present but null/empty.
    sample_signature = (sample.get('signature') or 'N/A')[:80]
    print(f"Sample keys: {list(sample.keys())}")
    print(f"Sample: {sample.get('name', 'N/A')} - {sample_signature}")
else:
    print("No records were downloaded; check the dataset URL.")

In [None]:
# Derive the vector size from the loaded model rather than hard-coding it, so
# the collection stays consistent if the embedding model is swapped.
embedding_dim = model.get_sentence_embedding_dimension()

# Collection holding one vector per code structure, compared by cosine distance.
client.create_collection(
    collection_name="code_search",
    vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
)

# Keyword payload index on the "language" field
# (presumably to support filtering by language; no filter is applied in the cells shown here).
client.create_payload_index(
    collection_name="code_search",
    field_name="language",
    field_schema=PayloadSchemaType.KEYWORD,
)

In [None]:
# Embed and index the code structures in fixed-size batches.
# The text that gets embedded is name + signature + docstring, joined by spaces,
# with any missing or empty part left out.
batch_size = 100
total = 0

# Payload fields copied from each record, with the fallback used when the key is absent.
payload_defaults = {
    "name": "",
    "signature": "",
    "docstring": "",
    "language": "unknown",
}

for start in range(0, len(records), batch_size):
    chunk = records[start : start + batch_size]

    # One embedding input string per record in this batch.
    texts = [
        " ".join(
            filter(
                None,
                (rec.get("name", ""), rec.get("signature", ""), rec.get("docstring", "")),
            )
        )
        for rec in chunk
    ]

    vectors = model.encode(texts).tolist()

    points = []
    # Point ids are the record's position in `records`, so they are unique across batches.
    for point_id, (vector, rec) in enumerate(zip(vectors, chunk), start=start):
        payload = {key: rec.get(key, fallback) for key, fallback in payload_defaults.items()}
        points.append(PointStruct(id=point_id, vector=vector, payload=payload))

    client.upsert(collection_name="code_search", points=points)
    total += len(points)

print(f"Indexed {total} code structures")

In [None]:
# Search by intent: embed each natural-language query with the same model used
# for indexing and print the three best-scoring code structures.
queries = [
    "function to sort a list",
    "how to read a file",
    "parse JSON data",
    "connect to a database",
]

for query in queries:
    hits = client.query_points(
        collection_name="code_search",
        query=model.encode(query).tolist(),
        limit=3,
    ).points

    print(f"\nQuery: '{query}'")
    for hit in hits:
        payload = hit.payload
        # Truncate long signatures so each result stays on one line.
        signature_preview = payload.get('signature', 'N/A')[:80]
        print(f"  {hit.score:.4f} | {payload['name']} | {signature_preview}")