# Web Scraping to Qdrant

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/thierrypdamiba/qdrant-etl-cookbook/blob/main/notebooks/etl/web_scrape_to_qdrant.ipynb)

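Scrape web pages with BeautifulSoup, strip non-content HTML elements, truncate the extracted text to its first 2,000 characters, embed it with a sentence-transformers model, and load the result into Qdrant.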

In [None]:
!pip install -q qdrant-client sentence-transformers beautifulsoup4 requests

In [None]:
import requests
from bs4 import BeautifulSoup
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from sentence_transformers import SentenceTransformer

In [None]:
# Embedding model used for both the scraped pages and the search query.
model = SentenceTransformer("all-MiniLM-L6-v2")

# ":memory:" selects qdrant-client's in-process local mode, so no Qdrant
# server is needed and nothing is kept once the process ends.
client = QdrantClient(":memory:")

In [None]:
# Create the "web_pages" collection that the scraped pages are loaded into.
#
# - The existence check makes this cell safe to re-run: calling
#   create_collection for a collection that already exists raises.
# - The vector size is read from the embedding model instead of being
#   hard-coded, so the collection cannot drift out of sync with the model
#   if a different one is loaded above.
if not client.collection_exists("web_pages"):
    client.create_collection(
        collection_name="web_pages",
        vectors_config=VectorParams(
            size=model.get_sentence_embedding_dimension(),
            distance=Distance.COSINE,
        ),
    )

In [None]:
# Pages to scrape. Each URL becomes one point in the "web_pages" collection,
# with the URL's position in this list used as the point id.
urls = [
    "https://qdrant.tech/documentation/overview/",
]

points = []
for idx, url in enumerate(urls):
    # Only the network call lives in the try body. A timeout, connection
    # error, or HTTP 4xx/5xx response skips this URL instead of aborting the
    # whole batch or indexing an error page as if it were content.
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipped {url}: {exc}")
        continue

    soup = BeautifulSoup(resp.text, "html.parser")

    # Remove non-content elements
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    # Only the first 2000 characters of the visible text are kept; anything
    # beyond that is neither embedded nor stored in the payload.
    text = soup.get_text(separator=" ", strip=True)[:2000]
    if not text:
        # Nothing left after cleaning, so there is nothing meaningful to embed.
        print(f"Skipped {url}: no text content")
        continue

    # get_text() (rather than .string) still yields a title when <title>
    # contains nested markup; fall back to the URL when it is missing or empty.
    raw_title = soup.title.get_text(strip=True) if soup.title else ""
    title = raw_title or url
    embedding = model.encode(text).tolist()

    points.append(
        PointStruct(
            id=idx,
            vector=embedding,
            payload={"url": url, "text": text, "title": title},
        )
    )
    print(f"Scraped: {title}")

# Skip the write entirely when every URL was skipped.
if points:
    client.upsert(collection_name="web_pages", points=points)
print(f"\nLoaded {len(points)} pages")

In [None]:
# Semantic search: embed the question with the same model object used for
# the pages, then ask Qdrant for the three closest stored points.
query_vector = model.encode("What is Qdrant?").tolist()
response = client.query_points(collection_name="web_pages", query=query_vector, limit=3)
results = response.points

# For each hit, print its score and title, then a 150-character text preview.
for hit in results:
    payload = hit.payload
    preview = payload["text"][:150]
    print(f"Score: {hit.score:.4f} | {payload['title']}")
    print(f"  {preview}...")