## **Installing Required Librarires**

In [None]:
# !pip install qdrant-haystack
# !pip install fastembed
# !pip install groq

## **Imports**

In [None]:
import os
import numpy as np
import time
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from fastembed import TextEmbedding
from groq import Groq

from haystack.dataclasses.document import Document
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever

## **Scraping Documentation**

In [1]:
def get_sitemap_data(url):
    """
    Retrieves the sitemap.xml data from the given URL.

    Args:
        url (str): The base URL of the documentation website.

    Returns:
        str: The content of the sitemap.xml file.
    """

    sitemap_url = f"{url}/sitemap.xml"
    try:
        response = requests.get(sitemap_url)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching sitemap: {e}")
        return None

def extract_urls_from_sitemap(sitemap_data):
    """
    Extracts URLs from the given sitemap.xml data.

    Args:
        sitemap_data (str): The content of the sitemap.xml file.

    Returns:
        list: A list of URLs extracted from the sitemap.
    """

    soup = BeautifulSoup(sitemap_data, 'xml')
    urls = []
    for url_tag in soup.find_all('url'):
        loc_tag = url_tag.find('loc')
        if loc_tag:
            urls.append(loc_tag.text)
    return urls

def fetch_and_store_documentation(base_url):
    """
    Fetches documentation content from URLs and stores them in a dictionary.

    Args:
        base_url (str): The base URL of the documentation website.

    Returns:
        dict: A dictionary where keys are URLs and values are filtered HTML content.
    """

    sitemap_data = get_sitemap_data(base_url)
    if sitemap_data:
        urls = extract_urls_from_sitemap(sitemap_data)
        docs = {}  # Initialize an empty dictionary

        for url in urls:
            try:
                response = requests.get(url)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')
                # Filter out unwanted tags using BeautifulSoup (adjust as needed)
                for tag in ['script', 'style', 'nav', 'aside', 'footer']:
                    for element in soup.find_all(tag):
                        element.decompose()

                docs[url] = soup.get_text(separator=' ')  # Store filtered HTML content
                print(f"Fetched and stored content from: {url}")

            except requests.exceptions.RequestException as e:
                print(f"Error fetching {url}: {e}")

        return docs
    else:
        return None

In [2]:
base_url = "https://llama-cpp-python.readthedocs.io/"
documentation_data = fetch_and_store_documentation(base_url)

Fetched and stored content from: https://llama-cpp-python.readthedocs.io/en/stable/
Fetched and stored content from: https://llama-cpp-python.readthedocs.io/en/latest/


## **Sentence Tokenization**

In [3]:
for url, content in documentation_data.items():
    sentences = sent_tokenize(content)
    documentation_data[url] = sentences
    print(f"Sentences in {url}: {len(sentences)}")

Sentences in https://llama-cpp-python.readthedocs.io/en/stable/: 144
Sentences in https://llama-cpp-python.readthedocs.io/en/latest/: 144


## **Embedding Generation**

In [4]:
# Initialize the TextEmbedding model
embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir="./embeddings")

def embed_documents(documents):
    for url, sentences in documentation_data.items():
        
        embeddings = []
        for sentence in sentences:
            # Embed document using FastEmbed
            embedding = np.array(list((embedding_model.embed([sentence]))))
            
            # Append the embedding to the list of embeddings
            embeddings.append((sentence,embedding))
        
        documentation_data[url] = embeddings
        
    return documentation_data

# Perform embedding generation
documentation_data = embed_documents(documentation_data)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 110960.42it/s]


## **Creating a Vector Database using Qdrant on Haystack Framework**

In [23]:
ingestion_data = []

document_store = QdrantDocumentStore(
    ":memory:",
    index="Document",
    embedding_dim=384,
    recreate_index=True,
    hnsw_config={"m": 16, "ef_construct": 64}  # Optional
)

for url, sentences in documentation_data.items():
    # print(sentences[0][0])
    ingestion_data.append(Document(content=sentences[0][0], embedding=sentences[0][1][0], meta={"url": url}))
    try:
        document_store.write_documents(ingestion_data)
    except:
        # Duplicate document
        pass


100it [00:00, 45427.32it/s]          


In [31]:
retriever = QdrantEmbeddingRetriever(document_store=document_store)

query = "How to install Llama-cpp ?"

query_embedding = list((embedding_model.embed([query])))

In [38]:
retrieved_content = retriever.run(list(query_embedding[0]))

## **RAG with Llama 3**

In [73]:
client = Groq(
    api_key="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",
)

prompt = f"""Below is given a Documentation and answer the question asked in the end:
{retrieved_content['documents'][0].content}
\n\n\n
{query}
"""

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="llama3-70b-8192",
)

In [74]:
print(chat_completion.choices[0].message.content)

According to the documentation, the question "How to install Llama-cpp?" is not directly answered in the provided text. The text is an introduction to the Python bindings for llama.cpp, but it does not provide installation instructions.
