In [3]:
# The aim of this script is to provide a simple pipeline for understanding the RAG concept.
# Uses OpenAI's gpt-4o-mini LLM

# Upgrade pip to latest version
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain-core langchain-openai langchain-experimental beautifulsoup4 langchain-community langchain chromadb beautifulsoup4

# Install compatible versions of langchain-core and langchain-openai
%pip install langchain-core

# Provides integration between LangChain and OpenAI’s language models
%pip install langchain-openai

# Offers additional capabilities and tools beyond the core LangChain library that are not yet fully stable or production-ready, 
# but are still available for experimentation andexploration
%pip install langchain-experimental

# Community-driven package for the LangChain library, which is an open source framework for building applications with LLMs
%pip install langchain-community

# Core LangChain library itself, provides a framework and a set of abstractions for building applications with LLMs
%pip install langchain

# High-performance embedding/vector database designed for efficient similarity search and retrieval
%pip install chromadb

# Python library used for parsing HTML and XML documents
%pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.
Found existing installation: langchain-core 0.3.15
Uninstalling langchain-core-0.3.15:
  Successfully uninstalled langchain-core-0.3.15
Found existing installation: langchain-openai 0.2.5
Uninstalling langchain-openai-0.2.5:
  Successfully uninstalled langchain-openai-0.2.5
Found existing installation: langchain-experimental 0.3.3
Uninstalling langchain-experimental-0.3.3:
  Successfully uninstalled langchain-experimental-0.3.3
Found existing installation: beautifulsoup4 4.12.3
Uninstalling beautifulsoup4-4.12.3:
  Successfully uninstalled beautifulsoup4-4.12.3
Found existing installation: langchain-community 0.3.5
Uninstalling langchain-community-0.3.5:
  Successfully uninstalled langchain-community-0.3.5
Found existing installation: langchain 0.3.7
Uninstalling langchain-0.3.7:
  Successfully uninstalled langchain-0.3.7
[0mNote: you may need to restart the kernel to use updated packages.
Collecting langchain-core
  Us

In [None]:
# Once the install cell has finished, restart the kernel so that the freshly
# installed package versions are the ones that get imported below.
import IPython
IPython.get_ipython().kernel.do_shutdown(True)

In [1]:
# Standard-library access to operating-system facilities, e.g. reading and
# writing environment variables
import os

# Define the USER_AGENT environment variable up front to avoid a warning later on
os.environ["USER_AGENT"] = "RAGUserAgent"

In [2]:
# The WebBaseLoader class is a document loader that can fetch and load web pages as documents
from langchain_community.document_loaders import WebBaseLoader

# Beautiful Soup 4 is a library for web scraping and parsing HTML or XML documents
import bs4

# Provides an interface to interact with OpenAI’s language models and APIs
import openai

# Specific implementations of language models and embeddings that use OpenAI’s models that work directly with LangChain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Hub component provides access to various pre-built components and utilities for working with language models
from langchain import hub

# Parses the output generated by the language model and extracts the relevant information. 
# In this case, it assumes that the language model’s output is a string and returns it as-is.
from langchain_core.output_parsers import StrOutputParser

# This component passes through the question or query without any modifications. 
# It allows the question to be used as-is in the subsequent steps of the chain
from langchain_core.runnables import RunnablePassthrough

# chromadb is the Python package for Chroma, a high-performance embedding/vector database
# designed for efficient similarity search and retrieval.
import chromadb

# Provides an interface to interact with the Chroma vector database using LangChain
from langchain_community.vectorstores import Chroma

# A text splitter is typically a function that we use to split the text into small chunks based on a specified chunk size and overlap. 
# This splitter is called SemanticChunker, an experimental text-splitting utility provided by the Langchain_experimental library. 
# The main purpose of SemanticChunker is to break down long text into more manageable pieces 
# while preserving the semantic coherence and context of each chunk.
from langchain_experimental.text_splitter import SemanticChunker

In [3]:
# OpenAI Setup
# The same API key is used for the `gpt-4o-mini` model and the OpenAI embedding service.
# Fail fast with an actionable message when the key is missing or empty, instead of
# surfacing a bare KeyError here or an authentication error in a later cell.
if not os.environ.get("OPENAI_API_KEY"):
    raise KeyError(
        "OPENAI_API_KEY is not set; export it in the environment before starting the notebook"
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:
## INDEXING ##
# Next few steps represent the indexing stage, where we obtain our target data, pre-process it, and
# vectorize it. These steps are often done offline, meaning they are done to prepare the application for
# usage later. But in some cases, it may make sense to do this all in real time, such as in rapidly changing
# data environments where the data that is used is relatively small. The overall steps are:

# 1. Web loading and crawling.
# 2. Splitting the data into digestible chunks for the Chroma DB vectorizing algorithm.
# 3. Embedding and indexing those chunks.
# 4. Adding those chunks and embeddings to the Chroma DB vector store.

In [6]:
# Load Documents
# Only the HTML elements carrying one of these CSS classes are handed to the parser.
post_strainer = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)

loader = WebBaseLoader(
    # Tuple with the URLs of the web pages to load
    web_paths=("https://kbourne.github.io/chapter1.html",),
    # Keyword arguments forwarded to the BeautifulSoup parser
    bs_kwargs={"parse_only": post_strainer},
)
docs = loader.load()

# What the loader does when `load()` is called:
#   I.   Issues HTTP requests to the given URLs to fetch the web pages.
#   II.  Parses the returned HTML with BeautifulSoup, restricted to the elements
#        selected through the parse_only argument.
#   III. Extracts the relevant text content from the parsed elements.
#   IV.  Builds one Document object per web page, holding the extracted text plus
#        metadata such as the source URL.

In [15]:
# Splitting
# SemanticChunker breaks long text into more manageable pieces while preserving the
# semantic coherence and context of each chunk. Other text splitters are not
# context-aware and cut at arbitrarily defined sizes; SemanticChunker accounts for
# context rather than just the length of the chunks.

# Embedding model used here for chunking (and again later for indexing).
# Note: the `text-embedding-3` class of models lets you choose the size of the
# returned embeddings, e.g. by passing dimensions=1024 (not used here).
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

text_splitter = SemanticChunker(embeddings)
splits = text_splitter.split_documents(docs)

In [16]:
# Embed
# The same OpenAI embeddings used in the splitting step are used here: our OpenAI key
# is used to send the chunks to the OpenAI API, which converts them into embeddings
# and returns them in their mathematical (vector) form.

# Chroma.from_documents builds a Chroma vector store from the chunks produced by the
# split step:
#   documents - the list of split documents (`splits`) from the previous cell
#   embedding - the OpenAIEmbeddings instance used to embed those documents
# Under the hood it:
#   1. Iterates over each Document object in `splits`.
#   2. Uses the provided OpenAIEmbeddings instance to generate an embedding vector
#      for each Document.
#   3. Stores the document text together with its embedding vector in the Chroma
#      vector database.
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

# The retriever offers a convenient interface for running similarity searches against
# the vector database and getting back the relevant documents.
retriever = vectorstore.as_retriever()

In [17]:
#### RETRIEVAL and GENERATION ####

# Here's what we will do in the following steps:
# 1. Take in a user query.
# 2. Vectorize that user query.
# 3. Perform a similarity search of the vector store to find the closest vectors to the user query vector, as well as their associated
# content.
# 4. Pass the retrieved content into a prompt template, a process known as hydrating.
# 5. Pass that hydrated prompt to the LLM.
# 6. Once you receive a response from the LLM, present it to the user.

In [20]:
# Prompt
# A LangSmith warning may show up here; it can be ignored, since LangSmith is not
# needed for this coding exercise.
# The prompt template is a key part of the RAG pipeline: it defines how we communicate
# with the LLM to receive the response we are seeking.
prompt = hub.pull("jclemens24/rag-prompt")

# Show the template's type followed by its contents, one per line
print(type(prompt), prompt, sep="\n")



<class 'langchain_core.prompts.chat.ChatPromptTemplate'>
input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'jclemens24', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '1a1f3ccb9a5a92363310e3b130843dfb2540239366ebe712ddd94982acc06734'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [19]:
# Later, we will add the question and context variables to hydrate the prompt, but starting with this format
# optimizes it to work better for RAG applications.

# "You are an assistant for question-answering tasks. Use the following pieces of 
# retrieved-context to answer the question. 
# If you don't know the answer, just say that you don't know.
# Question: {question}
# Context: {context}
# Answer:"

In [9]:
# Post-processing

# The join method is called on the \n\n string to concatenate page_content of each document with two
# newline characters between each document’s content.

# The purpose of this function is to format the output of the retriever into the string format that it will 
# need to be in for the next step in the chain, after the retriever step.
def format_docs(docs):
    """Concatenate the text of the given documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined together, with a blank line
        (two newline characters) between consecutive documents.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

In [10]:
# LLM
# ChatOpenAI (from the langchain_openai module) serves as the interface to OpenAI's
# language models.
#
# About `temperature`: it directly affects the variability and randomness of the
# generated responses. A lower value (close to 0) produces more deterministic and
# focused outputs, which is ideal for tasks requiring factual accuracy, such as
# summarization or translation.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
)

In [11]:
# Chain it all together with LangChain
#
# Q) How does the retriever receive the input question?
# A) The first step of the chain is a dict. LangChain runs every value of that dict
#    with the same chain input (the question) and collects the results under the
#    corresponding keys. The question therefore travels along two paths:
#      - "context":  question -> retriever (vectorizes the question and runs a vector
#                    similarity search against the documents stored in Chroma)
#                    -> format_docs (turns the returned Document objects into one string)
#      - "question": RunnablePassthrough() hands the question through unchanged
context_and_question = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Remaining steps, in order:
#   prompt            - hydrated with the "context" and "question" values
#   llm               - receives the hydrated prompt and produces an answer
#   StrOutputParser() - keeps only the answer text as a string, dropping the other
#                       fields of the LLM response that we do not care about here
rag_chain = context_and_question | prompt | llm | StrOutputParser()

In [12]:
# Question - run the chain
# The chain takes the question as a plain string. Invoking it retrieves the matching
# context, hydrates the prompt, queries the LLM and returns the answer as a string.
question = "What are the advantages of using RAG?"
rag_chain.invoke(question)

"The advantages of using Retrieval-Augmented Generation (RAG) include:\n\n1. **Improved Accuracy and Relevance**: RAG enhances the accuracy and relevance of responses generated by large language models (LLMs) by incorporating specific, real-time information from databases or datasets, ensuring outputs are based on both the model's pre-existing knowledge and the most current data.\n\n2. **Customization and Flexibility**: RAG allows for tailored responses based on domain-specific needs by integrating a company's internal databases into the response generation process, creating personalized experiences and outputs that meet unique business requirements.\n\n3. **Expanding Model Knowledge Beyond Training Data**: RAG enables models to access and utilize information that was not included in their initial training sets, effectively expanding the model's knowledge base without the need for retraining, making LLMs more versatile and adaptable to new domains or rapidly evolving topics."