## Install the requirements

In [1]:
!pip install -q llama-index==0.12.12 openai==1.59.6 tiktoken==0.8.0 llama-index-readers-web==0.3.4 firecrawl-py==1.10.1

# (OR) To resolve the dependency issue.
# !pip uninstall -q torch torchvision torchaudio
# !pip install -q torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu126
# !pip install -q llama-index==0.12.12 openai==1.59.6 tiktoken==0.8.0 llama-index-readers-web==0.3.4 firecrawl-py==1.10.1

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.8/454.8 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.2/80.2 kB[0m 

### SET THE ENVIRONMENT VARIABLES

In [2]:
# API credentials used by the rest of the notebook.
# Replace the "<...>" placeholders with real keys before running; do not commit real keys.
import os
# The OpenAI key is exported through the process environment
# (presumably read from there by the OpenAI-backed components used later -- confirm).
os.environ["OPENAI_API_KEY"] = "<OPENAI_API_KEY>"
# The Firecrawl key is kept in a plain variable and passed explicitly to
# FireCrawlWebReader and FirecrawlApp further down.
FIRECRAWL_API_KEY = "<FIRECRAWL_API_KEY>"

# Alternative for Google Colab: read both keys from Colab's `userdata` store instead.
# from google.colab import userdata
# os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY1')
# os.environ["FIRECRAWL_API_KEY"] = userdata.get('FIRECRAWL_API_KEY1')

# FIRECRAWL_API_KEY = userdata.get('FIRECRAWL_API_KEY1')

# SCRAPE WITH FIRECRAWL

## IMPORT THE FIRECRAWL WEBREADER

Firecrawl allows you to turn entire websites into LLM-ready markdown.

Get the API key here:
https://www.firecrawl.dev/app/api-keys

In [3]:
from llama_index.readers.web import FireCrawlWebReader

In [4]:

# Use Firecrawl in "scrape" mode to fetch a single page (not a full-site crawl).
firecrawl_reader = FireCrawlWebReader(
    api_key=FIRECRAWL_API_KEY,  # Replace with your actual API key from https://www.firecrawl.dev/
    mode="scrape",
)

# Load documents from a single page URL; the result is indexed in the next cell.
documents = firecrawl_reader.load_data(url="https://towardsai.net/")

In [5]:
# Build a vector index over the scraped page and expose it through a query engine.
from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.node_parser import SentenceSplitter

# To increase chunk size globally
# Settings.chunk_size = 2048  # or even larger like 4096
# Settings.chunk_overlap = 200


# node parser with larger chunk size only for this index
# (passed via `transformations` below instead of changing the global Settings)
node_parser = SentenceSplitter(
    chunk_size=2048,
    chunk_overlap=200,
)

# `documents` comes from the FireCrawlWebReader cell above.
index = VectorStoreIndex.from_documents(documents, transformations=[node_parser])
query_engine = index.as_query_engine()

In [6]:
# Ask a question about the scraped page and print the generated answer.
res = query_engine.query("What is towards AI aim?")

print(res.response)

print("-----------------")
# List every retrieved source node together with its metadata and score.
for src in res.source_nodes:
    meta = src.metadata
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {meta['title']}")
    print(f"URL\t {meta['sourceURL']}")
    print(f"Score\t {src.score}")
    print(f"Description\t {meta.get('description')}")
    print("-_" * 20)

Towards AI aims to be the leading AI community and content platform that makes AI accessible to all by providing high-quality publications, news, articles, and stories on AI and technology-related topics.
-----------------
Node ID	 c43b3951-f35d-4d72-8c5a-176d5d024e8c
Title	 Towards AI
URL	 https://towardsai.net/
Score	 0.8703751173554086
Description	 Towards AI is an online publication, which focuses on sharing high-quality publications, news, articles, and stories on AI and technology related topics., Louie's thoughts on the week's biggest AI developments. 
All major AI news, models, tools and papers covered. 
Read by over 130,000 AI Practitioners, Industry Professionals and Students. Click to read Towards AI Newsletter, a Substack publication.
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 9b2b25a9-fca2-4477-86e2-2421bae9336e
Title	 Towards AI
URL	 https://towardsai.net/
Score	 0.8694065341459926
Description	 Towards AI is an online publication, which focuses on sharing high-qual

# CRAWL A WEBSITE

## Load The CSV

The CSV contains a list of tools and, for each tool, the URL of the page we crawl to get information about it.

In [7]:
import csv
import io

import requests

# Google Sheets file URL (CSV export link)
url = 'https://docs.google.com/spreadsheets/d/1gHB-aQJGt9Nl3cyOP2GorAkBI_Us2AqkYnfqrmejStc/export?format=csv'

# Send a GET request to fetch the CSV file.
# The timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(url, timeout=30)

# One dict per CSV row, keyed by the header names; stays empty if the download fails.
response_list = []
# Check if the request was successful
if response.status_code == 200:
    # Decode the content to a string
    content = response.content.decode('utf-8')

    # Parse through a file-like object opened with newline='' (as the csv module
    # expects) instead of content.splitlines(): splitting the text up front drops
    # line breaks that occur inside quoted cells.
    csv_reader = csv.DictReader(io.StringIO(content, newline=''), delimiter=',')
    response_list = list(csv_reader)
else:
    print(f"Failed to retrieve the file: {response.status_code}")


In [8]:
import random

# Number of consecutive CSV rows (websites) to crawl.
NUM_WEBSITES = 10

# Pick a random window start that still leaves room for a full window.
# The upper bound is clamped at 0 so a list shorter than the window (or an empty
# list after a failed download) yields the whole list instead of making
# random.randint raise ValueError.
start_index = random.randint(0, max(0, len(response_list) - NUM_WEBSITES))
website_list = response_list[start_index:start_index + NUM_WEBSITES]

In [9]:
import pprint

# Show the selected CSV rows in a readable, pretty-printed form under a heading.
print("CSV data")
print(pprint.pformat(website_list))

CSV data
[{'': '',
  'Category': 'Computer Vision',
  'Company': '',
  'Description': 'Facial recognition and facial attribute analysis',
  'Is a direct URL company /tool website?': 'No',
  'Name': 'DeepFace',
  'Parent': '',
  'Tool Type': 'Library',
  'URL': 'https://viso.ai/computer-vision/deepface/'},
 {'': '',
  'Category': 'Computer Vision',
  'Company': '',
  'Description': 'Next generation of Detectron, implementing state-of-the-art '
                 'object detection algorithms',
  'Is a direct URL company /tool website?': 'No',
  'Name': 'Detectron2',
  'Parent': '',
  'Tool Type': 'Library',
  'URL': 'https://ai.meta.com/tools/detectron2/'},
 {'': '',
  'Category': 'Graph Neural Networks',
  'Company': '',
  'Description': 'Python package built for easy implementation of graph neural '
                 'networks',
  'Is a direct URL company /tool website?': 'Yes',
  'Name': 'DGL (Deep Graph Library)',
  'Parent': '',
  'Tool Type': 'Library',
  'URL': 'https://www.dgl.ai/'}

## Initialize Firecrawl

In [10]:
import os
from firecrawl import FirecrawlApp
# Firecrawl client used by the crawl loop below; authenticated with the key set earlier.
app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)

In [11]:
import time

# Crawl websites and handle responses
# Maps each successfully crawled URL to {"scraped_data": [...pages...], "csv_data": <CSV row>}.
url_response = {}
crawl_per_min = 1  # Max crawl per minute

# Track crawls
crawled_websites = 0  # number of sites whose crawl call succeeded
scraped_pages = 0     # total number of pages returned across all successful crawls

last_index = len(website_list) - 1

for i, website_dict in enumerate(website_list):
    url = website_dict.get('URL')
    print(f"Crawling: {url}")

    try:
        response = app.crawl_url(
            url,
            params={
                'limit': 5,  # Limit pages to scrape per site.
                'scrapeOptions': {'formats': ['markdown', 'html']}
            }
        )
        crawled_websites += 1

    except Exception as exc:
        # Best effort: report the failure and move on to the next site.
        print(f"Failed to fetch {url} -> {exc}")
        continue

    # Normalise a missing/None "data" field to an empty list so downstream cells
    # can always iterate over the pages.
    scraped_data = response.get("data") or []
    scraped_pages += len(scraped_data)

    # Store the scraped data and associated info in the response dict
    url_response[url] = {
        "scraped_data": scraped_data,
        "csv_data": website_dict
    }

    # Pause to comply with the crawl-per-minute limit (free version: 1 crawl per minute).
    # Skip the pause after the final website: there is nothing left to wait for.
    if i != last_index and (i + 1) % crawl_per_min == 0:
        print("Pausing for 1 minute to comply with crawl limit...")
        time.sleep(60)  # Pause for 1 minute after every crawl


Crawling: https://viso.ai/computer-vision/deepface/
Pausing for 1 minute to comply with crawl limit...
Crawling: https://ai.meta.com/tools/detectron2/
Failed to fetch https://ai.meta.com/tools/detectron2/ -> Unexpected error during start crawl job: Status code 403. This website is no longer supported, please reach out to help@firecrawl.com for more info on how to activate it on your account. - No additional error details provided.
Crawling: https://www.dgl.ai/
Pausing for 1 minute to comply with crawl limit...
Crawling: https://deeplearning4j.konduit.ai/
Pausing for 1 minute to comply with crawl limit...
Crawling: http://dlib.net/ml.htmll
Failed to fetch http://dlib.net/ml.htmll -> Payment Required: Failed to start crawl job. Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value. - No additional error details provided.
Crawling: https://echartsjs.com/
Pausing for 1 mi

## Create LlamaIndex documents from the scraped content

In [12]:
from llama_index.core import Document

# Build one Document per scraped page, enriched with metadata from the CSV row
# of the website the page belongs to.
documents = []

for scraped_content in url_response.values():
    csv_data = scraped_content.get("csv_data") or {}
    # A crawl may have stored no pages; treat that as an empty list.
    scraped_results = scraped_content.get("scraped_data") or []

    # Iterate the pages exactly once. (Nesting a second loop over the same list
    # would append every page len(scraped_results) times.)
    for result in scraped_results:
        markdown_content = result.get("markdown")
        if not markdown_content:
            # Nothing to index for this page.
            continue

        page_metadata = result.get("metadata") or {}
        title = page_metadata.get("title")
        url = page_metadata.get("sourceURL")
        documents.append(
            Document(
                text=markdown_content,
                metadata={
                    "title": title,
                    "url": url,
                    "description": csv_data.get("Description"),
                    "category": csv_data.get("Category")
                }
            )
        )


# Create The RAG Pipeline.

In [13]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter

# Components for the RAG pipeline over the crawled documents.
llm = OpenAI(model="gpt-4o-mini")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
# Smaller chunks than the 2048/200 splitter used for the single-page index earlier.
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=30)

In [14]:
from llama_index.core import Settings

# Register the components created above on the shared Settings object
# (presumably picked up as defaults by the index/query engine built next -- confirm).
Settings.llm = llm
Settings.embed_model = embed_model
Settings.text_splitter = text_splitter

In [15]:
from llama_index.core import VectorStoreIndex

# Rebuild `index` and `query_engine` from the crawled documents; this replaces the
# ones created earlier for the single scraped page.
# No explicit transformations are passed here (presumably the Settings configured
# above apply -- confirm).
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [16]:
from IPython.display import Markdown, display
def display_response(response):
    """Render ``response`` in the notebook output as bold Markdown.

    The value is interpolated into the markup with an f-string, so any object
    with a string representation can be passed.
    """
    bold_markup = f"<b>{response}</b>"
    display(Markdown(bold_markup))

In [19]:
# Enter your query here, it should be relevant to the crawled websites
query = "What is Deepface?"
res = query_engine.query(query)
display_response(res)

print("-----------------")
# List every retrieved source node together with its metadata and score.
for src in res.source_nodes:
    meta = src.metadata
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {meta['title']}")
    print(f"URL\t {meta['url']}")
    print(f"Score\t {src.score}")
    print(f"Description\t {meta.get('description')}")
    print(f"Category\t {meta.get('category')}")
    print("-_" * 20)

<b>DeepFace is an open-source facial recognition library that facilitates facial recognition and facial attribute analysis. It supports multiple state-of-the-art face recognition models and provides an API for running these functionalities from mobile or web clients. The library is designed to be lightweight and flexible, making it suitable for production-grade tasks. Additionally, it is continuously evolving, with planned features that include new facial attribute models and a Cloud API. DeepFace can be integrated with enterprise-grade solutions for AI vision applications, ensuring security and data privacy.</b>

-----------------
Node ID	 c16215e9-e0bf-46f5-a1dc-f709b4f4f723
Title	 DeepFace: A Popular Open Source Facial Recognition Library - viso.ai
URL	 https://viso.ai/computer-vision/deepface/
Score	 0.665231131503185
Description	 Facial recognition and facial attribute analysis
Category	 Computer Vision
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 3aa2717b-d8dc-4e93-b03b-5d2ef8621dcd
Title	 DeepFace: A Popular Open Source Facial Recognition Library - viso.ai
URL	 https://viso.ai/computer-vision/deepface/
Score	 0.6578897802867881
Description	 Facial recognition and facial attribute analysis
Category	 Computer Vision
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_


In [18]:
# Ask about a topic that is NOT covered by the crawled websites, to see how the
# pipeline answers when the retrieved context is irrelevant.
query = "What is qdrant?"
res = query_engine.query(query)
display_response(res)

print("-----------------")
# List every retrieved source node together with its metadata and score.
for src in res.source_nodes:
    meta = src.metadata
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {meta['title']}")
    print(f"URL\t {meta['url']}")
    print(f"Score\t {src.score}")
    print(f"Description\t {meta.get('description')}")
    print(f"Category\t {meta.get('category')}")
    print("-_" * 20)

<b>The provided context does not contain information about Qdrant. Therefore, I cannot provide an answer regarding it.</b>

-----------------
Node ID	 fc29c703-ac28-4cf1-8ea4-baf7ebdaf7c2
Title	 Welcome to Faiss Documentation — Faiss  documentation
URL	 https://faiss.ai/index.html
Score	 0.2283685171282458
Description	 Library for efficient similarity search and clustering of dense vectors
Category	 Similarity Search
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 73f42acd-4312-4144-a18e-5127bfd5a17a
Title	 快速入门 | Deeplearning4j
URL	 https://deeplearning4j.konduit.ai/zhong-wen-v1.0.0/kai-shi/kuai-su-ru-men
Score	 0.17316500990360198
Description	 Deep learning library for Java
Category	 Deep Learning
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
