<a href="https://colab.research.google.com/github/thisiskj/rag-chatbot/blob/main/extraction/RAG_Django.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Crawler

Install the required dependencies

In [None]:
!pip install chromadb langchain-text-splitters

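Crawl the Django documentation site, split each page's visible text into chunks, and store the chunks in a Chroma collection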

In [None]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
from urllib.parse import urlparse, urljoin
import concurrent
from concurrent.futures import ThreadPoolExecutor
import queue
import threading
import time
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb
from chromadb.config import Settings
from google.colab import userdata

# --- Crawl state shared by the worker threads ---
discovered = set()       # URLs accepted for crawling (filled by filter_links)
visited = set()          # URLs whose page has been fetched (filled by extract_links)
lock = threading.Lock()  # guards worker-thread access to the two sets above

# --- Crawl scope: only pages under https://docs.djangoproject.com/en/5.1/ ---
DOMAIN = 'docs.djangoproject.com'
PATH_PREFIX = '/en/5.1/'
BASE_URL = 'https://' + DOMAIN + PATH_PREFIX

# Pause before every HTTP request; each worker thread sleeps independently.
SLEEP_TIME = 0.25

# --- Text splitter: cuts page text into chunks of up to 1000 characters ---
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

# --- Chroma vector store ---
# Remote, token-authenticated instance; the token is read from the Colab
# secret "CHROMA_CLIENT_AUTH_CREDENTIALS".
# For a local store on Google Drive use instead:
#   chroma_client = chromadb.PersistentClient(path="/content/drive/MyDrive/Chroma_Django")
chroma_client = chromadb.HttpClient(
    host='chroma-production-e1c6.up.railway.app',
    port=443,
    ssl=True,
    settings=Settings(
        chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
        chroma_client_auth_credentials=userdata.get("CHROMA_CLIENT_AUTH_CREDENTIALS"),
        chroma_auth_token_transport_header="X-Chroma-Token",
    ),
)

# To start from an empty index, drop the collection first:
#   chroma_client.delete_collection(name="django_docs")
collection = chroma_client.create_collection(name="django_docs", get_or_create=True)


def extract_main_content(soup):
    """Return the visible text of a parsed page as a single string.

    Non-content elements (scripts, styles, header, footer, navigation,
    asides, noscript and iframes) are removed from ``soup`` *in place*
    before the text is collected, so anything the caller needs from those
    elements must be read from ``soup`` beforehand.

    Args:
        soup: A ``BeautifulSoup`` document. It is mutated by this call.

    Returns:
        The stripped, non-empty text nodes of the page joined by single
        spaces. HTML comments and text that sits directly under ``head``,
        ``title``, ``meta``, ``style``, ``script`` or the document root
        are excluded.
    """
    # Calling the soup is bs4 shorthand for find_all(); the result is a
    # list, so removing elements while looping over it is safe.
    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'noscript', 'iframe']):
        element.extract()

    hidden_parents = {'style', 'script', 'head', 'title', 'meta', '[document]'}

    def is_visible(element):
        """Tell whether a text node would be rendered to the reader."""
        if isinstance(element, Comment):
            return False
        return element.parent.name not in hidden_parents

    # find_all() is the supported spelling; the camelCase findAll alias is
    # deprecated.
    stripped = (node.strip() for node in soup.find_all(string=True) if is_visible(node))
    return ' '.join(piece for piece in stripped if piece)

def extract_links(url, timeout=30):
    """Fetch one page, index its text in Chroma and return its new links.

    The function sleeps ``SLEEP_TIME`` seconds, downloads ``url``, passes
    every ``href`` found on the page through ``filter_links`` (which also
    records the accepted links in ``discovered``), marks ``url`` as visited,
    prints a progress line, and finally stores the page's text chunks in
    ``collection``.

    Args:
        url: Absolute URL of the page to crawl.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the worker thread
            indefinitely.

    Returns:
        The list of links on the page that had not been discovered before;
        the caller is expected to crawl them.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx response.
    """
    headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    time.sleep(SLEEP_TIME)
    response = requests.get(url, headers=headers, timeout=timeout)
    # Do not index the body of an error page as if it were documentation.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect the links before extract_main_content() strips nav/header/
    # footer from the soup. href=True skips anchors that have no href at
    # all, which would otherwise show up here as None.
    links = [link.get('href') for link in soup.find_all('a', href=True)]
    discovered_links = filter_links(url, links)
    with lock:
        visited.add(url)
        # discovered can still be empty if the very first page yields no links.
        percent = int(len(visited) / len(discovered) * 100) if discovered else 100
        print(f"[{threading.get_native_id()}] Found {len(discovered_links)} from URL {url} visited {len(visited)}/{len(discovered)} sites ({percent}%). discovered_links: {discovered_links}")

    # Store the page content, one record per chunk.
    page_content = extract_main_content(soup)
    page_chunks = splitter.split_text(page_content)
    # A page without visible text yields no chunks; skip the write so that an
    # empty batch cannot fail and discard the links gathered above.
    if page_chunks:
        collection.add(
            documents=page_chunks,
            ids=[f"url|||{url}|||{i}" for i in range(len(page_chunks))],
            metadatas=[{"source": url} for _ in page_chunks],
        )

    return discovered_links

def filter_links(source_url, links):
    """Normalise the hrefs found on a page and return the ones not seen before.

    Every href is resolved against the page it was found on, stripped of its
    fragment and trailing slash, and has its ``.``/``..`` segments resolved.
    The result is kept only if it lies under ``BASE_URL`` and outside the
    release notes (``BASE_URL + 'releases'``). Both checks run on the fully
    resolved URL, so a relative link such as ``../../releases/5.1/`` cannot
    slip past them.

    Accepted URLs that are not yet in ``discovered`` are added to it (under
    ``lock``) and returned.

    Args:
        source_url: URL of the page the links were found on.
        links: Iterable of raw ``href`` values. ``None`` entries (anchors
            without an href) and hrefs that cannot be parsed are ignored.

    Returns:
        List of newly discovered, normalised URLs in page order.
    """
    # Crawled URLs are stored without their trailing slash, but relative
    # hrefs are resolved as if the source page were a directory, so the slash
    # is put back before joining.
    source_path = urlparse(source_url).path
    if not source_path.endswith('/'):
        source_path += '/'
    base = f'https://{DOMAIN}{source_path}'

    root_url = BASE_URL.rstrip('/')
    excluded_prefix = f'{BASE_URL}releases'

    candidates = []
    for link in links:
        if link is None:
            continue
        try:
            parts = urlparse(urljoin(base, link))
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); skip just this link.
            continue

        # urljoin() returns fully-qualified hrefs untouched, so resolve any
        # "." / ".." segments they may still contain.
        path = parts.path
        if path.startswith('/'):
            path = urljoin('/', path)

        candidate = parts._replace(path=path, fragment='').geturl().rstrip('/')

        if candidate.startswith(excluded_prefix):
            continue
        if candidate == root_url or candidate.startswith(BASE_URL):
            candidates.append(candidate)

    # Record the new URLs; everything returned here will be crawled.
    filtered_links = []
    with lock:
        for candidate in candidates:
            if candidate not in discovered:
                discovered.add(candidate)
                filtered_links.append(candidate)
    return filtered_links

# q holds the futures of the pages that are still being crawled, each mapped
# to the URL it is fetching (used to report which page failed).
q = {}

with ThreadPoolExecutor(max_workers=32) as executor:
    # Register the start page in the same normalised form filter_links()
    # produces (no trailing slash); otherwise the first link back to the
    # docs root would be treated as new and the root page indexed twice.
    start_url = BASE_URL.rstrip('/')
    discovered.add(start_url)
    q[executor.submit(extract_links, start_url)] = start_url

    while q:
        # Wake up as soon as any page is finished and queue its links.
        finished, _ = concurrent.futures.wait(
            q, return_when=concurrent.futures.FIRST_COMPLETED
        )
        for future in finished:
            page_url = q.pop(future)
            try:
                links = future.result()
            except Exception as e:
                # A failing page must not stop the crawl; report it and move on.
                print(f"Failed to crawl {page_url}: {e!r}")
                continue
            for link in links:
                q[executor.submit(extract_links, link)] = link

# To debug a single page, call e.g.
# extract_links('https://docs.djangoproject.com/en/5.1/topics/db/search')
print('DONE')
print(discovered)