# Deduplication Agent

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/thierrypdamiba/qdrant-etl-cookbook/blob/main/notebooks/agents/dedup_agent.ipynb)

Finds near-duplicate entries in a Qdrant collection by comparing vector similarity scores against a threshold, then removes them, keeping the entry with the lower ID in each duplicate pair.

In [None]:
!pip install -q qdrant-client sentence-transformers

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, PointIdsList, VectorParams, Distance
from sentence_transformers import SentenceTransformer

In [None]:
# Qdrant client opened with the ":memory:" location, so the demo needs no
# external server (presumably nothing persists once the kernel stops).
client = QdrantClient(":memory:")
# Sentence-embedding model used by the following cells to turn each text into
# a vector via `model.encode(...)`.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Create collection with intentional duplicates.
# Guarded so that re-running this cell does not fail because the collection
# already exists.
if not client.collection_exists("with_dupes"):
    client.create_collection(
        collection_name="with_dupes",
        vectors_config=VectorParams(
            # Ask the model for its embedding size instead of hard-coding it,
            # so swapping the model above cannot silently mismatch the schema.
            size=model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        ),
    )

texts = [
    "Qdrant is a vector database",
    "Qdrant is a vector database engine",  # near-duplicate of 0
    "Python is a programming language",
    "Python is a popular programming language",  # near-duplicate of 2
    "Machine learning uses neural networks",
]

# Encode every text in one batched call rather than one model call per text.
embeddings = model.encode(texts)

# Point IDs are the list positions, so "near-duplicate of N" above refers to
# the ID stored in Qdrant.
points = [
    PointStruct(id=i, vector=embedding.tolist(), payload={"text": t})
    for i, (t, embedding) in enumerate(zip(texts, embeddings))
]
client.upsert(collection_name="with_dupes", points=points)
print(f"Loaded {len(points)} points (with intentional near-duplicates)")

In [None]:
def find_duplicates(
    collection: str,
    threshold: float = 0.95,
    *,
    batch_size: int = 100,
    max_neighbors: int = 5,
) -> list:
    """Scan a collection and return its near-duplicate pairs.

    Every point is paged out of the collection with its vector, and that
    vector is used to query the same collection for its closest neighbours.

    Args:
        collection: Name of the Qdrant collection to scan.
        threshold: Minimum similarity score a neighbour must reach to be
            reported as a duplicate (passed to Qdrant as ``score_threshold``).
        batch_size: Number of points fetched per ``scroll`` page.
        max_neighbors: Maximum number of *other* points examined per point.

    Returns:
        A list of ``(lower_id, higher_id, score)`` tuples. Each pair appears
        once, oriented so the first ID is the smaller one.
    """
    duplicates = []
    next_offset = None

    while True:
        # Payloads are never read here, so skip transferring them.
        records, next_offset = client.scroll(
            collection_name=collection,
            limit=batch_size,
            offset=next_offset,
            with_payload=False,
            with_vectors=True,
        )

        for record in records:
            # The point itself normally comes back as the best hit and would
            # occupy one result slot, so request one extra to still inspect
            # `max_neighbors` genuine neighbours.
            hits = client.query_points(
                collection_name=collection,
                query=record.vector,
                limit=max_neighbors + 1,
                score_threshold=threshold,
                with_payload=False,
            ).points

            # `hit.id > record.id` drops the self-match and also reports each
            # pair only once (from its lower-ID side).
            duplicates.extend(
                (record.id, hit.id, hit.score)
                for hit in hits
                if hit.id > record.id
            )

        # An empty page or a missing next-page offset both mean the scan is done.
        if not records or next_offset is None:
            break

    return duplicates

In [None]:
# Scan the demo collection with a looser threshold than the function default
# and list every pair that was found.
dupes = find_duplicates("with_dupes", threshold=0.90)

print(f"Found {len(dupes)} duplicate pairs:")
for pair in dupes:
    orig_id, dup_id, score = pair
    print(f"  ID {orig_id} <-> ID {dup_id} (similarity: {score:.4f})")

In [None]:
# Remove duplicates (keep the lower ID).
# Pairs are walked in ascending ID order and the higher ID is dropped only if
# the lower ID of that pair is itself being kept. This way a point is never
# deleted solely because it resembles another point that is also deleted
# (e.g. in a chain 0~1, 1~2 only point 1 is removed).
removed_ids = set()
for orig_id, dup_id, _ in sorted(dupes, key=lambda pair: pair[:2]):
    if orig_id not in removed_ids:
        removed_ids.add(dup_id)
ids_to_remove = list(removed_ids)

if ids_to_remove:
    client.delete(
        collection_name="with_dupes",
        points_selector=PointIdsList(points=ids_to_remove),
    )
    print(f"Removed {len(ids_to_remove)} duplicate points")

# Verify
# Only the first page (up to 100 points) is fetched, which is enough for this
# small demo collection.
remaining, _ = client.scroll(collection_name="with_dupes", limit=100)
print(f"Remaining points: {len(remaining)}")
for r in remaining:
    print(f"  ID {r.id}: {r.payload['text']}")