<a href="https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/Crawl_a_Website.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q llama-index==0.14.10 llama-index-llms-openai==0.6.12 openai==2.13.0 newspaper4k==0.9.4.1 \
                lxml_html_clean==0.4.3 crawl4ai==0.7.8 jedi==0.19.2

!python -m playwright install --with-deps chromium

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.1/52.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.2/306.2 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.1/438.1 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m7.1 MB/s[0m eta [3

In [2]:
import os
import asyncio
import json
import nest_asyncio
from google.colab import userdata

# Colab compatibility
# nest_asyncio patches asyncio so that an event loop can be driven from inside
# the notebook; the crawl4ai cell further down calls run_until_complete and
# re-applies the same patch.
nest_asyncio.apply()


# Set API Keys
# Copy the OpenAI key from Colab's `userdata` store into the process environment.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [3]:
import newspaper

# Documentation pages to scrape with newspaper.
urls = [
    "https://docs.langchain.com/oss/javascript/langchain/agents",
    "https://docs.langchain.com/oss/javascript/langchain/models",
    "https://docs.langchain.com/oss/javascript/langchain/messages",
    "https://docs.langchain.com/oss/javascript/langchain/tools",
    "https://docs.langchain.com/oss/javascript/langchain/short-term-memory",
]

pages_content = []

# Download and parse every page; keep only the ones that yielded some text.
for url in urls:
    try:
        article = newspaper.Article(url)
        article.download()
        article.parse()
        if len(article.text) == 0:
            # Nothing was extracted for this page, so there is nothing to store.
            continue
        pages_content.append(dict(url=url, title=article.title, text=article.text))
    except Exception as err:
        # Best-effort scrape: report the failure and move on to the next URL.
        print(f"Error processing URL {url}: {err}")

# Preview the first scraped page (or report that nothing came back), then the page count.
print(pages_content[0] if pages_content else "No content was retrieved for any URL.")
print(len(pages_content))

{'url': 'https://docs.langchain.com/oss/javascript/langchain/agents', 'title': 'Docs by LangChain', 'text': 'import { ChatOpenAI } from "@langchain/openai"; import { createAgent, createMiddleware } from "langchain"; const basicModel = new ChatOpenAI({ model: "gpt-4o-mini" }); const advancedModel = new ChatOpenAI({ model: "gpt-4o" }); const dynamicModelSelection = createMiddleware({ name: "DynamicModelSelection", wrapModelCall: (request, handler) => { // Choose model based on conversation complexity const messageCount = request.messages.length; return handler({ ...request, model: messageCount > 10 ? advancedModel : basicModel, }); }, }); const agent = createAgent({ model: "gpt-4o-mini", // Base model (used when messageCount ≤ 10) tools, middleware: [dynamicModelSelection], });\n\nimport * as z from "zod"; import { createAgent, tool } from "langchain"; const search = tool( ({ query }) => `Results for: ${query}`, { name: "search", description: "Search for information", schema: z.object({ 

In [4]:
# Convert to Document
from llama_index.core.schema import Document

# One Document per scraped page; title and URL travel along as metadata.
documents = [
    Document(
        text=page["text"],
        metadata={field: page[field] for field in ("title", "url")},
    )
    for page in pages_content
]


## Crawl a Website

In [5]:
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

# Pages to fetch with crawl4ai.
urls_to_crawl = ["https://docs.langchain.com/oss/javascript/langchain/agents"]

# Synchronous wrapper
def crawl_sync(urls=None):
    """Crawl pages with crawl4ai and return their markdown plus basic metadata.

    Args:
        urls: Optional iterable of URLs to crawl. When omitted, the
            module-level ``urls_to_crawl`` list is used.

    Returns:
        dict: ``{"data": [...]}`` with one entry per successfully crawled page,
        shaped as ``{"text": <raw markdown>, "meta": {"url": <url>,
        "meta": {"title": <title>}}}``. Failed pages are reported on stdout
        and left out, so the list may be empty.
    """
    targets = list(urls_to_crawl if urls is None else urls)

    async def crawl_with_crawl4ai():
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            page_timeout=80000,
            word_count_threshold=50,
        )

        collected = {"data": []}

        async with AsyncWebCrawler() as crawler:
            results = await crawler.arun_many(targets, config=config)

            for result in results:
                if not result.success:
                    # Surface the failure instead of silently dropping the page.
                    print(f"Failed to crawl {result.url}: {result.error_message}")
                    continue

                raw_markdown = (result.markdown.raw_markdown or "") if result.markdown else ""

                # `metadata` can be None and the stored title can be None as well.
                title = (result.metadata or {}).get("title") or ""
                if not title:
                    # Fall back to the first markdown heading, if the page has one.
                    title = next(
                        (
                            line.strip("#").strip()
                            for line in raw_markdown.split("\n")
                            if line.startswith("#")
                        ),
                        "",
                    )

                collected["data"].append({
                    "text": raw_markdown,
                    "meta": {
                        "url": result.url,
                        "meta": {
                            "title": title
                        }
                    }
                })

        return collected

    # Handle async execution
    nest_asyncio.apply()

    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(crawl_with_crawl4ai())
    finally:
        # Always release the loop, even when the crawl raises.
        loop.close()

# Run the crawler
data_res = crawl_sync()

# Print results (same format as before)
# The crawler only returns successful pages, so the list can be empty;
# guard the preview instead of failing with an IndexError.
if data_res["data"]:
    first_page = data_res["data"][0]
    print("URL:", first_page["meta"]["url"])
    print("Title:", first_page["meta"]["meta"]["title"])
    print("Content:", first_page["text"][0:500], "...")
else:
    print("No content was retrieved for any URL.")

URL: https://docs.langchain.com/oss/javascript/langchain/agents
Title: Agents - Docs by LangChain
Content: [Skip to main content](https://docs.langchain.com/oss/javascript/langchain/agents#content-area)
[Docs by LangChain home page![light logo](https://mintcdn.com/langchain-5e9cc07a/Xbr8HuVd9jPi6qTU/images/brand/langchain-docs-teal.svg?fit=max&auto=format&n=Xbr8HuVd9jPi6qTU&q=85&s=16111530672bf976cb54ef2143478342)![dark logo](https://mintcdn.com/langchain-5e9cc07a/Xbr8HuVd9jPi6qTU/images/brand/langchain-docs-lilac.svg?fit=max&auto=format&n=Xbr8HuVd9jPi6qTU&q=85&s=b70fb1a2208670492ef94aef14b680be)](ht ...


In [6]:
from llama_index.core.schema import Document

# Rebind `documents` to the crawl4ai results, replacing the newspaper-based list
# built earlier; the nested "meta" layout mirrors what crawl_sync returns.
documents = [
    Document(
        text=entry["text"],
        metadata=dict(
            title=entry["meta"]["meta"]["title"],
            url=entry["meta"]["url"],
        ),
    )
    for entry in data_res["data"]
]

In [7]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex

# Register the chat model, embedding model and text splitter on llama-index's
# `Settings` object. The index and query engine created later receive no
# explicit model arguments, so they presumably fall back to these values
# (confirm against the llama-index docs).
Settings.llm = OpenAI(model="gpt-5-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# chunk_size / chunk_overlap units are defined by SentenceSplitter
# (presumably tokens; TODO confirm).
Settings.text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=30)

In [8]:
# Build a vector index over `documents`, then wrap it in a query engine.
# No model or splitter is passed here; presumably the values assigned to
# `Settings` are used (confirm).
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [9]:
# Ask a question against the index and print only the answer text.
# `res` is kept because the following cell reads `res.source_nodes`.
res = query_engine.query("What is a Agent?")
print(res.response)

An agent is an LLM-driven system that runs tools in a loop to achieve a goal. It executes until a stop condition is met (for example, the model emits a final output or an iteration limit is reached). The runtime is graph-based: the agent moves through nodes and edges that define processing steps (model nodes that call the model, tools nodes that run tools, middleware, etc.). The model is the agent’s reasoning engine and can be specified statically at creation (e.g., createAgent({ model: "openai:gpt-5", tools: [] })). Model identifiers use the format provider:model.


In [10]:
# List each source node attached to the response with its id, title, URL and
# score, followed by a separator line.
for src in res.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {src.metadata['title']}")
    print(f"URL\t {src.metadata['url']}")
    print(f"Score\t {src.score}")
    print("-_" * 20)

Node ID	 06c205fb-ff64-47b2-8077-4bb30046f0e8
Title	 Agents - Docs by LangChain
URL	 https://docs.langchain.com/oss/javascript/langchain/agents
Score	 0.5570819315919509
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 0eacdbac-57a4-4759-9d1b-85664c844743
Title	 Agents - Docs by LangChain
URL	 https://docs.langchain.com/oss/javascript/langchain/agents
Score	 0.5555383559335868
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
