<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/docs/examples/data_connectors/WebPageDemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Page Reader

Demonstrates our web page readers, including `SimpleWebPageReader`, `FireCrawlWebReader`, `TrafilaturaWebReader`, and `RssReader`.

If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙.

In [None]:
%pip install llama-index llama-index-readers-web

In [None]:
import logging
import sys

# Send INFO-and-above log records to stdout so they appear in the notebook
# output. basicConfig() already attaches a stdout StreamHandler to the root
# logger, so no additional handler is added here (a second stdout handler
# would print every log record twice).
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

### Using SimpleWebPageReader

In [None]:
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
from IPython.display import Markdown, display
import os

In [None]:
# NOTE: the html_to_text=True option requires html2text to be installed

In [None]:
# Download the essay and let the reader convert its HTML to text while loading.
essay_urls = ["http://paulgraham.com/worked.html"]
simple_reader = SimpleWebPageReader(html_to_text=True)
documents = simple_reader.load_data(essay_urls)

In [None]:
# Inspect the first loaded document (shown as this cell's output).
documents[0]

In [None]:
# Build a SummaryIndex over the documents loaded by SimpleWebPageReader.
index = SummaryIndex.from_documents(documents)

In [None]:
# Tip: set the logging level to DEBUG (in the logging setup cell) for more
# detailed output while the query runs.
# Create a query engine from the index with default settings and ask it a
# question about the loaded page.
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")

In [None]:
# Render the query response in bold using IPython's Markdown display.
display(Markdown(f"<b>{response}</b>"))

### Using FireCrawl Reader 🔥


Firecrawl is an API that turns entire websites into clean, LLM-accessible markdown.

Using Firecrawl to gather an entire website

In [None]:
from llama_index.readers.web import FireCrawlWebReader

In [None]:
# Use Firecrawl to crawl an entire website. "crawl" mode gathers the pages of
# the site starting from the given URL, whereas "scrape" mode fetches only the
# single page it is given.
firecrawl_reader = FireCrawlWebReader(
    # Read the key from the FIRECRAWL_API_KEY environment variable when it is
    # set; otherwise replace the placeholder with your actual API key from
    # https://www.firecrawl.dev/
    api_key=os.environ.get("FIRECRAWL_API_KEY", "<your_api_key>"),
    mode="crawl",  # Choose between "crawl" (entire website) and "scrape" (single page)
    # Optional: pass `params` (a dict) to forward extra options to the
    # Firecrawl API. It is omitted here because placeholder keys should not be
    # sent to the service.
)

# Load documents by crawling the website starting from this URL
documents = firecrawl_reader.load_data(url="http://paulgraham.com/")

In [None]:
# Build a SummaryIndex over the documents returned by the Firecrawl reader.
index = SummaryIndex.from_documents(documents)

In [None]:
# Tip: set the logging level to DEBUG (in the logging setup cell) for more
# detailed output while the query runs.
# Create a query engine from the index with default settings and ask it a
# question about the loaded content.
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")

In [None]:
# Render the query response in bold using IPython's Markdown display.
display(Markdown(f"<b>{response}</b>"))

Using Firecrawl to scrape a single page


In [None]:
# Import the reader from the public package path (the same path used earlier
# in this notebook) rather than from the internal module.
from llama_index.readers.web import FireCrawlWebReader

# Initialize the FireCrawlWebReader with your API key and desired mode
firecrawl_reader = FireCrawlWebReader(
    # Read the key from the FIRECRAWL_API_KEY environment variable when it is
    # set; otherwise replace the placeholder with your actual API key from
    # https://www.firecrawl.dev/
    api_key=os.environ.get("FIRECRAWL_API_KEY", "<your_api_key>"),
    mode="scrape",  # "scrape" fetches a single page; use "crawl" for an entire website
    # Optional: pass `params` (a dict) to forward extra options to the
    # Firecrawl API. It is omitted here because placeholder keys should not be
    # sent to the service.
)

# Load documents from a single page URL
documents = firecrawl_reader.load_data(url="http://paulgraham.com/worked.html")

: 

In [None]:
# Build a SummaryIndex over the documents scraped from the single page.
index = SummaryIndex.from_documents(documents)

In [None]:
# Tip: set the logging level to DEBUG (in the logging setup cell) for more
# detailed output while the query runs.
# Create a query engine from the index with default settings and ask it a
# question about the loaded page.
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")

In [None]:
# Render the query response in bold using IPython's Markdown display.
display(Markdown(f"<b>{response}</b>"))

### Using TrafilaturaWebReader

In [None]:
from llama_index.readers.web import TrafilaturaWebReader

ModuleNotFoundError: No module named 'llama_index.readers.web'

In [None]:
# Load the essay again, this time through TrafilaturaWebReader.
essay_urls = ["http://paulgraham.com/worked.html"]
trafilatura_reader = TrafilaturaWebReader()
documents = trafilatura_reader.load_data(essay_urls)

In [None]:
# Build a SummaryIndex over the documents loaded by TrafilaturaWebReader.
index = SummaryIndex.from_documents(documents)

In [None]:
# Tip: set the logging level to DEBUG (in the logging setup cell) for more
# detailed output while the query runs.
# Create a query engine from the index with default settings and ask it a
# question about the loaded page.
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")

In [None]:
# Render the query response in bold using IPython's Markdown display.
display(Markdown(f"<b>{response}</b>"))

### Using RssReader

In [None]:
from llama_index.core import SummaryIndex
from llama_index.readers.web import RssReader

# Load the New York Times home-page RSS feed through RssReader.
rss_feed_urls = ["https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"]
rss_reader = RssReader()
documents = rss_reader.load_data(rss_feed_urls)

# Build a SummaryIndex over whatever the feed reader returned.
index = SummaryIndex.from_documents(documents)

# Tip: set the logging level to DEBUG for more detailed output.
# Ask a question against the indexed feed using a default query engine.
query_engine = index.as_query_engine()
response = query_engine.query("What happened in the news today?")