In [1]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [2]:
import os

# Set environment variables
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_PROJECT'] = 'cortex'

# Get keys from the environment
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

if langchain_api_key:
    os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
else:
    raise ValueError("LANGCHAIN_API_KEY is not set in the environment.")

if groq_api_key:
    os.environ['GROQ_API_KEY'] = groq_api_key
else:
    raise ValueError("GROQ_API_KEY is not set in the environment.")

PART 12 - MULTI-REPRESENTATION INDEXING

In [3]:
from langchain_community.document_loaders import WebBaseLoader

# Configure headers directly in the loader
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
}

# Load documents with headers
loader1 = WebBaseLoader(
    web_paths=("https://medium.com/@pankaj_pandey/introduction-to-retrieval-augmented-generation-rag-9209bf8a076d",),
    requests_kwargs={"headers": headers}
)
docs = loader1.load()

loader2 = WebBaseLoader(
    web_paths=("https://medium.com/humansdotai/an-introduction-to-ai-agents-e8c4afd2ee8f",),
    requests_kwargs={"headers": headers}
)
docs.extend(loader2.load())

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | ChatGroq(model="llama3-70b-8192", temperature=0)
    | StrOutputParser()
)

summaries = chain.batch(docs, {"max_concurrency": 1})

In [5]:
summaries

['The article introduces Retrieval-Augmented Generation (RAG), a framework that improves the accuracy and reliability of Large Language Models (LLMs) by incorporating factual information during response generation. RAG addresses the drawbacks of LLMs, including knowledge cutoff, inconsistency, and response hallucination.\n\nRAG consists of two phases: retrieval and content generation. In the retrieval phase, algorithms search for and retrieve relevant information from external knowledge bases. This information is then used in the generative phase, where the LLM synthesizes an answer based on both the augmented prompt and its internal representation of training data.\n\nThe benefits of RAG include:\n\n* Accuracy and fact-checking: Ensures LLM responses are based on reliable sources, allowing users to verify claims.\n* Reduced bias and hallucination: Limits LLM reliance on internal biases and prevents fabrication of information.\n* Lower cost and maintenance: Reduces the need for continu

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.documents import Document
import uuid

# Initialize HuggingFace embeddings with the updated class
model_name = "BAAI/bge-small-en-v1.5"
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True}
)

# Initialize the vectorstore for indexing child chunks with the updated class
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=hf_embeddings
)

# Initialize the storage layer for parent documents
store = InMemoryByteStore()
id_key = "doc_id"

# Create the retriever combining vectorstore and byte storage
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key
)

# Generate unique document IDs
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Create Document objects for summaries with linked metadata
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_ids[i]})
    for i, summary in enumerate(summaries)
]

# Add summary documents to the vectorstore
retriever.vectorstore.add_documents(summary_docs)

# Link parent documents to their corresponding IDs in the byte store
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [7]:
doc_ids

['108f3a67-ba87-4454-ad62-57b9a81bbf77',
 '8b6ab1e8-e3a4-4fc5-aa37-c62aa9a1e93f']

In [8]:
retriever.docstore.mget(doc_ids)

[Document(metadata={'source': 'https://medium.com/@pankaj_pandey/introduction-to-retrieval-augmented-generation-rag-9209bf8a076d', 'title': 'Introduction to Retrieval-Augmented Generation (RAG) | by Pankaj | Medium', 'description': 'The world is advancing rapidly, introducing new technologies and stacks in AI and other areas every day. Large Language Models (LLMs) are a significant innovation in this space. However, LLMs have…', 'language': 'en'}, page_content='Introduction to Retrieval-Augmented Generation (RAG) | by Pankaj | MediumOpen in appSign upSign inWriteSign upSign inMastodonIntroduction to Retrieval-Augmented Generation (RAG)PankajFollow6 min read·Dec 16, 2023--ListenShareRAG systems aim to address the drawbacks of Large Language Models by incorporating factual information during response generation, mitigating issues such as knowledge cutoff and response hallucination.Retrieval Augmented Generation (RAG)The world is advancing rapidly, introducing new technologies and stacks 

In [9]:
query = "What is agent"
sub_docs = vectorstore.similarity_search(query,k=1)
sub_docs[0]

Document(id='3c6c3615-0e8f-4dd9-adb0-4b673d28fa4b', metadata={'doc_id': '8b6ab1e8-e3a4-4fc5-aa37-c62aa9a1e93f'}, page_content='The article introduces Artificial Intelligence (AI) Agents, which are digital systems that can perform autonomously, make decisions, and interact with their environment like humans. Powered by machine learning, natural language processing, and other cutting-edge technologies, AI Agents can learn from data, adapt to new information, and execute complex functions autonomously.\n\nThe article explores the functionalities, technology, and applications of AI Agents across industries, including customer service, healthcare, finance, transportation, and more. It also discusses the ethical considerations that arise as these agents become increasingly integrated into our lives.\n\nAI Agents are designed to understand, analyze, and respond to human input, constantly evolving to enhance their capabilities. They can operate independently, driven by goals rather than specif

In [10]:
retrieved_docs = retriever.invoke(query,n_results=1)
retrieved_docs[0].page_content[0:500]

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


'An Introduction to AI Agents. Artificial Intelligence Agents are the… | by Humans.ai | humansdotai | MediumOpen in appSign upSign inWriteSign upSign inhumansdotai·Humans.ai utilizes blockchain technology to align artificial intelligence with trust, creating a human-centric approach. Our company views AI not as a tool but as a companion, envisioning a collaborative future with humanity.An Introduction to AI AgentsHumans.aiFollow7 min read·Dec 27, 2023--2ListenShareArtificial Intelligence Agents a'

PART 13 - ColBERT

RAGatouille makes it as simple to use ColBERT.

ColBERT generates a contextually influenced vector for each token in the passages.

ColBERT similarly generates vectors for each token in the query.

Then, the score of each document is the sum of the maximum similarity of each query embedding to any of the document embeddings:

In [16]:
import os
import torch  # Import PyTorch
import requests
from ragatouille import RAGPretrainedModel

# Disable tokenizers parallelism to avoid deadlock warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load the RAG pretrained model
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

# Handle cases where CUDA is not available
if not torch.cuda.is_available():
    print("Warning: CUDA is not available. Operations will be run on the CPU.")
    print("Consider installing a supported CUDA version for optimal performance.")

def get_wikipedia_page(title: str) -> str:
    """
    Retrieve the full text content of a Wikipedia page.

    :param title: str - Title of the Wikipedia page.
    :return: str - Full text content of the page as raw string, or None if page is not found.
    """
    # Wikipedia API endpoint
    URL = "https://en.wikipedia.org/w/api.php"

    # Parameters for the API request
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
    }

    # Custom User-Agent header to comply with Wikipedia's best practices
    headers = {"User-Agent": "RAGatouille_tutorial/0.0.1 (ben@clavie.eu)"}

    try:
        # Make the request
        response = requests.get(URL, params=params, headers=headers)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)

        # Parse the response
        data = response.json()
        page = next(iter(data["query"]["pages"].values()))

        # Return the page content if it exists
        return page.get("extract", "No content found for the page.")
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving Wikipedia page '{title}': {e}")
        return None

# Retrieve the full document from Wikipedia
title = "Document retrieval"
full_document = get_wikipedia_page(title)

# Check if content was successfully retrieved
if full_document:
    print(f"Successfully retrieved content for '{title}'.")
else:
    print(f"Failed to retrieve content for '{title}'. Exiting.")
    exit(1)

# Index the document using RAG
try:
    RAG.index(
        collection=[full_document],
        index_name="Doc-1",
        max_document_length=180,
        split_documents=True,
    )
    print("Document successfully indexed in RAG.")
except Exception as e:
    print(f"Error while indexing the document: {e}")

Consider installing a supported CUDA version for optimal performance.
Successfully retrieved content for 'Document retrieval'.
This is a behaviour change from RAGatouille 0.8.0 onwards.
This works fine for most users and smallish datasets, but can be considerably slower than FAISS and could cause worse results in some situations.
If you're confident with FAISS working on your machine, pass use_faiss=True to revert to the FAISS-using behaviour.
--------------------


[May 15, 11:13:20] #> Note: Output directory .ragatouille/colbert/indexes/Doc-1 already exists


[May 15, 11:13:20] #> Will delete 10 files already at .ragatouille/colbert/indexes/Doc-1 in 20 seconds...
[May 15, 11:13:43] [0] 		 #> Encoding 7 passages..


100%|██████████| 1/1 [00:00<00:00,  3.38it/s]

[May 15, 11:13:43] [0] 		 avg_doclen_est = 116.42857360839844 	 len(local_sample) = 7
[May 15, 11:13:43] [0] 		 Creating 256 partitions.
[May 15, 11:13:43] [0] 		 *Estimated* 815 embeddings.
[May 15, 11:13:43] [0] 		 #> Saving the indexing plan to .ragatouille/colbert/indexes/Doc-1/plan.json ..





used 6 iterations (0.006s) to cluster 775 items into 256 clusters
[0.036, 0.038, 0.038, 0.032, 0.034, 0.034, 0.033, 0.038, 0.029, 0.028, 0.032, 0.03, 0.029, 0.032, 0.036, 0.039, 0.03, 0.026, 0.029, 0.032, 0.029, 0.027, 0.03, 0.038, 0.039, 0.031, 0.031, 0.039, 0.03, 0.041, 0.034, 0.04, 0.03, 0.035, 0.03, 0.03, 0.034, 0.034, 0.027, 0.042, 0.027, 0.044, 0.035, 0.035, 0.034, 0.038, 0.027, 0.03, 0.025, 0.036, 0.028, 0.038, 0.028, 0.026, 0.033, 0.042, 0.04, 0.039, 0.038, 0.04, 0.033, 0.043, 0.029, 0.04, 0.04, 0.044, 0.032, 0.04, 0.033, 0.032, 0.029, 0.024, 0.041, 0.033, 0.041, 0.029, 0.038, 0.031, 0.041, 0.044, 0.037, 0.038, 0.033, 0.037, 0.038, 0.033, 0.039, 0.035, 0.033, 0.029, 0.031, 0.034, 0.032, 0.037, 0.037, 0.032, 0.036, 0.036, 0.032, 0.033, 0.029, 0.035, 0.035, 0.033, 0.036, 0.036, 0.028, 0.025, 0.035, 0.027, 0.042, 0.035, 0.034, 0.038, 0.039, 0.034, 0.031, 0.036, 0.035, 0.032, 0.03, 0.038, 0.026, 0.035, 0.028, 0.035, 0.037, 0.03]


0it [00:00, ?it/s]

[May 15, 11:13:43] [0] 		 #> Encoding 7 passages..


100%|██████████| 1/1 [00:00<00:00,  4.16it/s]
1it [00:00,  4.00it/s]
100%|██████████| 1/1 [00:00<00:00, 848.19it/s]

[May 15, 11:13:43] #> Optimizing IVF to store map from centroids to list of pids..
[May 15, 11:13:43] #> Building the emb2pid mapping..
[May 15, 11:13:43] len(emb2pid) = 815



100%|██████████| 256/256 [00:00<00:00, 86543.23it/s]

[May 15, 11:13:43] #> Saved optimized IVF to .ragatouille/colbert/indexes/Doc-1/ivf.pid.pt
Done indexing!
Document successfully indexed in RAG.





In [17]:
import torch
from ragatouille import RAGPretrainedModel

# Define the device
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the existing index **without the 'device' argument**
RAG = RAGPretrainedModel.from_index(index_path=".ragatouille/colbert/indexes/Doc-1")

# Load the index — device will be handled automatically by ragatouille internally
RAG = RAGPretrainedModel.from_index(index_path=".ragatouille/colbert/indexes/Doc-1")

# Perform search
results = RAG.search(query="What is an example for form-based indexing?", k=3)
print("Search results:", results)

# For LangChain retriever
retriever = RAG.as_langchain_retriever(k=3)
print(retriever.invoke("What is an example for form-based indexing?"))

Using device: mps


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Loading searcher for index Doc-1 for the first time... This may take a few seconds
[May 15, 11:13:56] #> Loading codec...
[May 15, 11:13:56] #> Loading IVF...
[May 15, 11:13:56] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 3216.49it/s]

[May 15, 11:13:56] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 533.56it/s]

Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . What is an example for form-based indexing?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([ 101,    1, 2054, 2003, 2019, 2742, 2005, 2433, 1011, 2241, 5950, 2075,
        1029,  102,  103,  103,  103,  103,  103,  103,  103,  103,  103,  103,
         103,  103,  103,  103,  103,  103,  103,  103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

Search results: [{'content': '== Variations ==\nThere are two main classes of indexing schemata for document retrieval systems: form based (or word based), and content based indexing. The document classification scheme (or indexing algorithm) in use determines the nature of the document retrieval system.\n\n\n=== Form based ===\nForm based document retrieval addresses the exact syntactic properties of a text, comparable to substring




In [18]:
retriever = RAG.as_langchain_retriever(k=3)
retriever.invoke("What is an example for form based indexing?")

[Document(metadata={}, page_content='== Variations ==\nThere are two main classes of indexing schemata for document retrieval systems: form based (or word based), and content based indexing. The document classification scheme (or indexing algorithm) in use determines the nature of the document retrieval system.\n\n\n=== Form based ===\nForm based document retrieval addresses the exact syntactic properties of a text, comparable to substring matching in string searches. The text is generally unstructured and not necessarily in a natural language, the system could for example be used to process large sets of chemical representations in molecular biology. A suffix tree algorithm is an example for form based indexing.'),
 Document(metadata={}, page_content='== Example: PubMed ==\nThe PubMed form interface features the "related articles" search which works through a comparison of words from the documents\' title, abstract, and MeSH terms using a word-weighted algorithm.\n\n\n== See also ==\n