<a href="https://colab.research.google.com/github/thisiskj/rag-chatbot/blob/main/extraction/RAG_Django.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chroma Testing

In [None]:
!pip install chromadb openai

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.8.3-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-

In [None]:
import pprint

import chromadb
from chromadb.config import Settings
from google.colab import userdata
from openai import OpenAI

# OpenAI client; the API key is read from the Colab secret store.
openai_client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

# System message sent with every chat completion request.
SYSTEM_PROMPT = """
You are a helpful assistant that has knowledge on the Django web framework
If you don't know the answer, say you don't know. Do not try to make up an answer.
"""

# User message template; `context` and `question` are filled in with str.format.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

# Connect to the hosted Chroma server over HTTPS. The auth token is read from
# the Colab secret store and sent in the X-Chroma-Token header.
chroma_client = chromadb.HttpClient(
    host='chroma-production-e1c6.up.railway.app',
    port=443,
    ssl=True,
    settings=Settings(
        chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
        chroma_client_auth_credentials=userdata.get("CHROMA_CLIENT_AUTH_CREDENTIALS"),
        chroma_auth_token_transport_header="X-Chroma-Token"
    )
)

# The collection is populated by the crawler section of this notebook.
collection = chroma_client.get_collection(name="django_docs")

# Sanity check: number of chunks currently stored in the collection.
print(collection.count())

question = "How can I define a URL path param and get the value in a view?"

# Fetch the 20 closest chunks; Chroma embeds the query text itself.
results = collection.query(query_texts=[question], n_results=20)
pprint.pp(results)

# A single query was sent, so its hits are the first entry of 'documents'.
retrieved_docs = results['documents'][0]
context = "\n".join(retrieved_docs)

# Build the chat transcript: fixed system prompt plus the context-grounded question.
user_prompt = PROMPT_TEMPLATE.format(context=context, question=question)
chat_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": user_prompt},
]

completion = openai_client.chat.completions.create(
    model="gpt-4o",
    temperature=1.0,
    messages=chat_messages,
)

def pluck(lst, key):
    """Collect the value stored under ``key`` in every dict of ``lst``.

    Args:
        lst: Iterable of dict-like objects exposing ``.get``.
        key: Key to look up in each item.

    Returns:
        A list with one entry per item, in order; items that lack ``key``
        contribute ``None``.
    """
    values = []
    for item in lst:
        values.append(item.get(key))
    return values

# Show the model's answer, then the distinct source URLs of the retrieved chunks.
answer = completion.choices[0].message.content
print(answer)

sources = set(pluck(results['metadatas'][0], 'source'))
print(sources)

6411
{'ids': [['url|||https://docs.djangoproject.com/en/5.1/ref/templates/builtins|||38',
          'url|||https://docs.djangoproject.com/en/5.1/ref/templates/builtins|||37',
          'url|||https://docs.djangoproject.com/en/5.1/ref/templates/builtins|||39',
          'url|||https://docs.djangoproject.com/en/5.1/topics/http/shortcuts|||4',
          'url|||https://docs.djangoproject.com/en/5.1/ref/class-based-views/base|||10',
          'url|||https://docs.djangoproject.com/en/5.1/ref/urls|||3',
          'url|||https://docs.djangoproject.com/en/5.1/ref/class-based-views/base|||8',
          'url|||https://docs.djangoproject.com/en/5.1/ref/urls|||1',
          'url|||https://docs.djangoproject.com/en/5.1/ref/urlresolvers|||5',
          'url|||https://docs.djangoproject.com/en/5.1/topics/http/urls|||3',
          'url|||https://docs.djangoproject.com/en/5.1/topics/http/urls|||21',
          'url|||https://docs.djangoproject.com/en/5.1/ref/urlresolvers|||4',
          'url|||https://do

In [None]:
# Print the version of the `python` executable found on the runtime's PATH.
!python --version

Python 3.11.11


# Web Crawler

In [None]:
!pip install chromadb langchain-text-splitters

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.8.4-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-

In [None]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
from urllib.parse import urlparse, urljoin
import concurrent
from concurrent.futures import ThreadPoolExecutor
import queue
import threading
import time
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb
from chromadb.config import Settings
from google.colab import userdata

# --- Shared crawl state -------------------------------------------------
discovered = set()        # URLs recorded by filter_links
visited = set()           # URLs recorded by extract_links after fetching them
lock = threading.Lock()   # guards access to `discovered` and `visited`

# --- Crawl scope: https://docs.djangoproject.com/en/5.1/ -----------------
DOMAIN = 'docs.djangoproject.com'
PATH_PREFIX = '/en/5.1/'
BASE_URL = 'https://' + DOMAIN + PATH_PREFIX

# Pause before every HTTP request, in seconds. Each worker thread sleeps on its own.
SLEEP_TIME = 0.25

# --- Text chunking ------------------------------------------------------
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=20, length_function=len, is_separator_regex=False
)

# --- Chroma DB ----------------------------------------------------------
# Local alternative:
# chroma_client = chromadb.PersistentClient(path="/content/drive/MyDrive/Chroma_Django")
chroma_settings = Settings(
    chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
    chroma_client_auth_credentials=userdata.get("CHROMA_CLIENT_AUTH_CREDENTIALS"),
    chroma_auth_token_transport_header="X-Chroma-Token",
)
chroma_client = chromadb.HttpClient(
    host='chroma-production-e1c6.up.railway.app',
    port=443,
    ssl=True,
    settings=chroma_settings,
)

# Uncomment to start from an empty collection:
# chroma_client.delete_collection(name="django_docs")
collection = chroma_client.create_collection(name="django_docs", get_or_create=True)


def extract_main_content(soup):
    """Return the visible text of a parsed page as one space-separated string.

    Note that ``soup`` is modified: script, style and page-chrome elements are
    removed from the tree, so read anything needed from those elements
    before calling this function.

    Args:
        soup: A BeautifulSoup document.

    Returns:
        The remaining text nodes, each stripped of surrounding whitespace and
        joined with single spaces. Comments, whitespace-only nodes and text
        whose parent is a non-rendered element are left out.
    """
    # Drop elements whose text is never part of the page's main content.
    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'noscript', 'iframe']):
        element.extract()

    # Text directly inside these parents is not rendered as page content.
    hidden_parents = {'style', 'script', 'head', 'title', 'meta', '[document]'}

    def is_visible(element):
        """Tell whether a text node should be kept."""
        if isinstance(element, Comment):
            return False
        return element.parent.name not in hidden_parents

    # find_all is the supported spelling; findAll is a deprecated alias.
    texts = soup.find_all(string=True)
    stripped = (t.strip() for t in texts if is_visible(t))

    # Skip nodes that were whitespace only.
    return ' '.join(s for s in stripped if s)

def extract_links(url, timeout=10):
    """Fetch one page, store its text in Chroma and return the newly found links.

    Args:
        url: Absolute URL of the page to crawl.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        The list returned by ``filter_links`` for the hrefs on this page.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status. Error pages are
            therefore never indexed.
    """
    headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    time.sleep(SLEEP_TIME)
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect hrefs before extract_main_content() strips elements from the tree.
    # href=True skips anchors that carry no href attribute at all.
    links = [anchor['href'] for anchor in soup.find_all('a', href=True)]
    discovered_links = filter_links(url, links)
    with lock:
        visited.add(url)
        total = len(discovered)
        # `discovered` is empty when the first page yields no in-scope link.
        percent = int(len(visited) / total * 100) if total else 0
        print(f"[{threading.get_native_id()}] Found {len(discovered_links)} from URL {url} visited {len(visited)}/{total} sites ({percent}%). discovered_links: {discovered_links}")

    # Split the page text into chunks and store them. Chunk ids encode the
    # page URL and the chunk index.
    page_content = extract_main_content(soup)
    page_chunks = splitter.split_text(page_content)
    if page_chunks:
        # Skipped for pages without text so collection.add never receives empty lists.
        collection.add(
            documents=page_chunks,
            ids=[f"url|||{url}|||{i}" for i in range(len(page_chunks))],
            metadatas=[{"source": url} for _ in page_chunks],
        )

    return discovered_links

def filter_links(source_url, links):
    """Normalise the hrefs found on a page and return the ones not seen before.

    Each href is resolved against ``source_url`` (treated as a directory),
    stripped of its fragment and trailing slash, and kept only when it lies
    under ``BASE_URL`` and outside the ``releases`` section. The scope checks
    run on the fully resolved URL, so ``..`` segments cannot be used to slip
    past them.

    Kept URLs are added to the shared ``discovered`` set while holding ``lock``.

    Args:
        source_url: URL of the page the links were found on.
        links: Iterable of raw ``href`` values. ``None`` and empty values are
            ignored.

    Returns:
        The normalised absolute URLs that were not in ``discovered`` before
        this call, in the order they appear in ``links``.
    """
    # Page URLs are stored without their trailing slash, so a relative href
    # has to be resolved against "<path>/" rather than against the bare path.
    source_path = urlparse(source_url).path
    base_dir = f"https://{DOMAIN}{source_path.rstrip('/')}/"
    excluded_prefix = f'{BASE_URL}releases'

    filtered_links = []
    for link in links:
        if not link:
            continue
        try:
            parsed = urlparse(link)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); skip just this link.
            continue

        # Only https links to the docs host (or links without scheme/host,
        # which inherit both) can be in scope; this also drops mailto: etc.
        if parsed.scheme not in ('', 'https') or parsed.netloc not in ('', DOMAIN):
            continue

        # Resolving only the path collapses "." and ".." for relative and
        # absolute hrefs alike; the fragment is dropped by not carrying it over.
        resolved = urlparse(urljoin(base_dir, parsed.path))
        final_url = resolved._replace(params=parsed.params, query=parsed.query).geturl()
        final_url = final_url.rstrip('/')

        if final_url.startswith(excluded_prefix):
            continue

        # The trailing slash was stripped above, so add it back for the prefix
        # test; otherwise the docs root itself would not match BASE_URL.
        if not f'{final_url}/'.startswith(BASE_URL):
            continue

        # Record the URL and hand it back to the caller for crawling.
        with lock:
            if final_url not in discovered:
                filtered_links.append(final_url)
                discovered.add(final_url)
    return filtered_links

# Crawl driver: breadth-first crawl that starts at BASE_URL and follows every
# link returned by extract_links until no work is left.
#
# `q` holds the futures that are still in flight; `future_urls` remembers which
# URL each future is crawling so that a failure can be attributed to its page.
q = []
future_urls = {}

# Register the seed in its normalised form (no trailing slash). Without this,
# links pointing back at the docs root count as new and the root page is
# crawled and indexed a second time.
with lock:
    discovered.add(BASE_URL.rstrip('/'))

with ThreadPoolExecutor(max_workers=32) as executor:
    seed_future = executor.submit(extract_links, BASE_URL)
    future_urls[seed_future] = BASE_URL
    q.append(seed_future)
    while q:
        # Futures appended to q inside this loop are picked up by the next
        # pass of the surrounding while loop.
        for future in concurrent.futures.as_completed(q):
            q.remove(future)
            crawled_url = future_urls.pop(future)
            try:
                links = future.result()
            except Exception as e:
                # Keep crawling, but say which page failed and why.
                print(f"Failed to crawl {crawled_url}: {e!r}")
                continue
            for link in links:
                new_future = executor.submit(extract_links, link)
                future_urls[new_future] = link
                q.append(new_future)

print('DONE')
print(discovered)

[1135] Found 156 from URL https://docs.djangoproject.com/en/5.1/ visited 1/156 sites (0%). discovered_links: ['https://docs.djangoproject.com/en/5.1', 'https://docs.djangoproject.com/en/5.1/faq/help', 'https://docs.djangoproject.com/en/5.1/intro/overview', 'https://docs.djangoproject.com/en/5.1/intro/install', 'https://docs.djangoproject.com/en/5.1/intro/tutorial01', 'https://docs.djangoproject.com/en/5.1/intro/tutorial02', 'https://docs.djangoproject.com/en/5.1/intro/tutorial03', 'https://docs.djangoproject.com/en/5.1/intro/tutorial04', 'https://docs.djangoproject.com/en/5.1/intro/tutorial05', 'https://docs.djangoproject.com/en/5.1/intro/tutorial06', 'https://docs.djangoproject.com/en/5.1/intro/tutorial07', 'https://docs.djangoproject.com/en/5.1/intro/tutorial08', 'https://docs.djangoproject.com/en/5.1/intro/reusable-apps', 'https://docs.djangoproject.com/en/5.1/intro/contributing', 'https://docs.djangoproject.com/en/5.1/faq', 'https://docs.djangoproject.com/en/5.1/genindex', 'https:/

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 69.1MiB/s]


[1189] Found 0 from URL https://docs.djangoproject.com/en/5.1/topics/db/queries visited 2/156 sites (1%). discovered_links: []
[1181] Found 0 from URL https://docs.djangoproject.com/en/5.1/topics visited 3/156 sites (1%). discovered_links: []
[1176] Found 0 from URL https://docs.djangoproject.com/en/5.1/faq visited 4/156 sites (2%). discovered_links: []
[1182] Found 0 from URL https://docs.djangoproject.com/en/5.1/ref visited 5/156 sites (3%). discovered_links: []
[1174] Found 0 from URL https://docs.djangoproject.com/en/5.1/intro/reusable-apps visited 6/156 sites (3%). discovered_links: []
[1191] Found 0 from URL https://docs.djangoproject.com/en/5.1/ref/models/lookups visited 7/156 sites (4%). discovered_links: []
[1175] Found 0 from URL https://docs.djangoproject.com/en/5.1/intro/contributing visited 8/156 sites (5%). discovered_links: []
[1172] Found 0 from URL https://docs.djangoproject.com/en/5.1/intro/tutorial07 visited 9/156 sites (5%). discovered_links: []
[1187] Found 0 from 

  k = self.parse_starttag(i)


[1166] Found 0 from URL https://docs.djangoproject.com/en/5.1/ref/contrib/admin/ visited 291/327 sites (88%). discovered_links: []
[1170] Found 0 from URL https://docs.djangoproject.com/en/5.1/releases/5.1 visited 292/327 sites (89%). discovered_links: []
[1169] Found 0 from URL https://docs.djangoproject.com/en/5.1/releases/5.0 visited 293/327 sites (89%). discovered_links: []
[1180] Found 0 from URL https://docs.djangoproject.com/en/5.1/releases/4.2 visited 294/327 sites (89%). discovered_links: []
[1192] Found 0 from URL https://docs.djangoproject.com/en/5.1/releases/4.0 visited 295/327 sites (90%). discovered_links: []
[1188] Found 0 from URL https://docs.djangoproject.com/en/5.1/releases/4.1 visited 296/327 sites (90%). discovered_links: []
[1176] Found 0 from URL https://docs.djangoproject.com/en/5.1/releases/3.0 visited 297/327 sites (90%). discovered_links: []
[1171] Found 0 from URL https://docs.djangoproject.com/en/5.1/releases/3.2 visited 298/327 sites (91%). discovered_link