In [35]:
"""
   The aim of this script is to integrate a simple UI into RAG pipeline. The UI used in this script is Gradio.
   Gradio: Python library used to create user-friendly web interfaces for machine learning models, APIs, or any Python function.
"""

# Upgrade pip to latest version
%pip install --upgrade pip

# Uninstall conflicting packages
%pip uninstall -y langchain-core langchain-openai langchain-experimental beautifulsoup4 langchain-community langchain chromadb beautifulsoup4 gradio

    
# Install compatible versions of langchain-core and langchain-openai
%pip install langchain-core

# Provides integration between LangChain and OpenAI’s language models
%pip install langchain-openai

# Offers additional capabilities and tools beyond the core LangChain library that are not yet fully stable or production-ready, 
# but are still available for experimentation andexploration
%pip install langchain-experimental

# Community-driven package for the LangChain library, which is an open source framework for building applications with LLMs
%pip install langchain-community

# Core LangChain library itself, provides a framework and a set of abstractions for building applications with LLMs
%pip install langchain

# High-performance embedding/vector database designed for efficient similarity search and retrieval
%pip install chromadb

# Python library used for parsing HTML and XML documents
%pip install beautifulsoup4

# Installing Gradio for UI functionality
%pip install gradio

# uvloop is an event loop implementation that can improve asyncio performance. 
# However, it causes compatibility issues with other libraries (e.g., Gradio or Jupyter). 
%pip uninstall uvloop -y

# Upgrading jupyter and ipywidgets ensures you have the latest features and bug fixes
%pip install --upgrade jupyter ipywidgets
    


Note: you may need to restart the kernel to use updated packages.
Found existing installation: langchain-core 0.3.28
Uninstalling langchain-core-0.3.28:
  Successfully uninstalled langchain-core-0.3.28
Found existing installation: langchain-openai 0.2.14
Uninstalling langchain-openai-0.2.14:
  Successfully uninstalled langchain-openai-0.2.14
Found existing installation: langchain-experimental 0.3.4
Uninstalling langchain-experimental-0.3.4:
  Successfully uninstalled langchain-experimental-0.3.4
Found existing installation: beautifulsoup4 4.12.3
Uninstalling beautifulsoup4-4.12.3:
  Successfully uninstalled beautifulsoup4-4.12.3
Found existing installation: langchain-community 0.3.13
Uninstalling langchain-community-0.3.13:
  Successfully uninstalled langchain-community-0.3.13
Found existing installation: langchain 0.3.13
Uninstalling langchain-0.3.13:
  Successfully uninstalled langchain-0.3.13
Found existing installation: chromadb 0.5.23
Uninstalling chromadb-0.5.23:
  Successfully u

In [36]:
# Restart kernel after install operations are done, so the freshly installed
# packages are the ones that get imported by the cells below.
# NOTE(review): this cell restarts the kernel it is running in, so a "Run All" will
# presumably be interrupted here; continue from the next cell once the kernel is back.
import IPython
# Handle to the running IPython shell
app = IPython.get_ipython()
# The True argument requests a restart (the recorded output shows 'restart': True)
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [1]:
# Provides the necessary functionality to interact with the operating system, such as retrieving an env variable
import os

# Declaring a new OS environment variable to avoid a warning.
# This cell runs before the library imports in the next cell, so the variable is already
# set when they execute.
os.environ['USER_AGENT'] = 'RAGUserAgent'

In [2]:
# The WebBaseLoader class is a document loader that can fetch and load web pages as documents
from langchain_community.document_loaders import WebBaseLoader

# Beautiful Soup 4 is a library for web scraping and parsing HTML or XML documents
import bs4

# Provides an interface to interact with OpenAI’s language models and APIs
import openai

# Specific implementations of language models and embeddings that use OpenAI’s models that work directly with LangChain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Hub component provides access to various pre-built components and utilities for working with language models
from langchain import hub

# Parses the output generated by the language model and extracts the relevant information. 
# In this case, it assumes that the language model’s output is a string and returns it as-is.
from langchain_core.output_parsers import StrOutputParser

# This component passes through the question or query without any modifications. 
# It allows the question to be used as-is in the subsequent steps of the chain
from langchain_core.runnables import RunnablePassthrough

# Chromadb imports the Chroma DB vector store, a high-performance embedding/vector database 
# designed for efficient similarity search and retrieval.
import chromadb

# Provides an interface to interact with the Chroma vector database using LangChain
from langchain_community.vectorstores import Chroma

# A text splitter is typically a function that we use to split the text into small chunks based on a specified chunk size and overlap. 
# This splitter is called SemanticChunker, an experimental text-splitting utility provided by the Langchain_experimental library. 
# The main purpose of SemanticChunker is to break down long text into more manageable pieces 
# while preserving the semantic coherence and context of each chunk.
from langchain_experimental.text_splitter import SemanticChunker

# Introduces the concept of running the retriever and question in parallel. 
# This can improve performance by allowing the retriever to fetch the context while the question is being processed simultaneously.
from langchain_core.runnables import RunnableParallel

# Allows to define and create our own custom prompt templates
from langchain_core.prompts import PromptTemplate

# Python's standard library module for writing asynchronous programs. It allows for concurrent execution of tasks using coroutines.
import asyncio

# Modifies the asyncio event loop to allow nested event loops. 
# This is particularly useful when running asynchronous code in environments like Jupyter notebooks, where an event loop might already be running.
import nest_asyncio

# Sets the event loop policy to asyncio's default one. The install cell removes uvloop
# because of its compatibility issues with Gradio/Jupyter, and this makes the choice explicit.
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())

# Modifies the event loop so that it can be reused or nested; a Jupyter kernel may
# already have an event loop running when asynchronous code is started.
nest_asyncio.apply()

# Imports Gradio, a library for creating interactive user interfaces for machine learning models 
import gradio as gr

In [3]:
# OpenAI Setup
# We will use the following API key for the `gpt-4o-mini` model and
# the OpenAI embedding service.
# The key is read from the OPENAI_API_KEY environment variable (never hardcoded);
# a KeyError is raised here if that variable is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [4]:
## INDEXING ##
# Next few steps represent the indexing stage, where we obtain our target data, pre-process it, and
# vectorize it. These steps are often done offline, meaning they are done to prepare the application for
# usage later. But in some cases, it may make sense to do this all in real time, such as in rapidly changing
# data environments where the data that is used is relatively small. The overall steps are:

# 1. Web loading and crawling.
# 2. Splitting the data into digestible chunks for the Chroma DB vectorizing algorithm.
# 3. Embedding and indexing those chunks.
# 4. Adding those chunks and embeddings to the Chroma DB vector store.

In [5]:
# Load Documents
# Restrict HTML parsing to the elements that carry one of these CSS classes.
content_filter = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)

loader = WebBaseLoader(
    # A tuple containing the URLs of the web pages to be loaded.
    web_paths=("https://kbourne.github.io/chapter1.html",),
    # Keyword arguments to be passed to the BeautifulSoup parser.
    bs_kwargs={"parse_only": content_filter},
)

# Fetch the page(s) and turn them into Document objects.
docs = loader.load()

# I. Makes HTTP requests to the specified URLs to fetch the web pages.
# II. Parses the HTML content of the web pages using BeautifulSoup, considering only the elements specified by
# the parse_only parameter.
# III. Extracts the relevant text content from the parsed HTML elements.
# IV. Creates Document objects for each web page that contain the extracted text content, along with metadata such as the source URL.

In [6]:
# Splitting
# SemanticChunker focuses on breaking down long text into more manageable pieces while preserving
# the semantic coherence and context of each chunk.
# There are other text splitters that are not context-aware and do the splitting by arbitrarily
# defined sizes, but SemanticChunker focuses on accounting for context
# rather than just the arbitrary length of our chunks.

# Embedding model; the same instance is passed to the splitter below and to the
# vector store created later in the notebook.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    # With the `text-embedding-3` class
    # of models, you can specify the size
    # of the embeddings you want returned.
    # dimensions=1024
)

# The splitter receives the embedding model and is applied to the loaded documents;
# `splits` holds the resulting chunks.
text_splitter = SemanticChunker(embeddings)
splits = text_splitter.split_documents(docs)

In [7]:
# Embed
# We are using OpenAI embeddings here, just as in the splitting step,
# which will use our OpenAI key to send our chunks of data to the OpenAI API, convert them into embeddings,
# and then send them back in their mathematical form.

# First, we create the Chroma vector store with the Chroma.from_documents method, which builds
# the store from the documents produced by the split operation above.
# NOTE(review): no collection name or persist directory is passed, so the library defaults are used.
# Re-running this cell in the same kernel may add the same chunks a second time — confirm and,
# if so, restart the kernel before re-indexing.
vectorstore = Chroma.from_documents(
    documents=splits, # The list of split documents (splits) obtained from the previous code snippet 
    embedding=embeddings # An instance of the OpenAIEmbeddings class, which is used to generate embeddings for the documents
)

# What happens under the hood is:
# 1. It iterates over each Document object in the splits list.
# 2. For each Document object, it uses the provided OpenAIEmbeddings instance to generate an embedding vector.
# 3. It stores the document text and its corresponding embedding vector in the Chroma vector database.


# The retriever is an object that provides a convenient interface for performing similarity searches 
# and retrieving the relevant documents from the vector database based on those searches.
# It is created with default settings (no search arguments are passed).
retriever = vectorstore.as_retriever()

In [8]:
#### RETRIEVAL and GENERATION ####

# Here's what we will do in the following steps:
# 1. Take in a user query.
# 2. Vectorize that user query.
# 3. Perform a similarity search of the vector store to find the closest vectors to the user query vector, as well as their associated
# content.
# 4. Pass the retrieved content into a prompt template, a process known as hydrating.
# 5. Pass that hydrated prompt to the LLM.
# 6. Once you receive a response from the LLM, present it to the user.

In [9]:
# Prompt - ignore LangSmith warning, you will not need langsmith for this coding exercise
# Pulls the prompt published as "jclemens24/rag-prompt" from the LangChain Hub.
# NOTE(review): the chain later pipes a dict with "context" and "question" keys into this
# prompt — confirm those match the template's input variables.
prompt = hub.pull("jclemens24/rag-prompt")

# The prompt template is a key part of the RAG pipeline as it represents how you communicate with the
# LLM to receive the response you are seeking



In [10]:
# Relevance check prompt
# Asks the LLM to grade, on a scale from 1 to 5, how relevant the retrieved context is to the
# question and to reply with the number only. The template has two placeholders,
# {question} and {retrieved_context}, which are filled in when the chain formats it.
relevance_prompt_template = PromptTemplate.from_template(
    """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
)

In [11]:
# Post-processing

# Formats the output of the retriever into the string format that the next step
# in the chain, after the retriever step, needs.
def format_docs(docs):
    """Concatenate the text of the retrieved documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` of every document, in order, with two newline
        characters between consecutive documents. An empty iterable yields "".
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [12]:
# Receives the output of the LLM for the relevance prompt, which should be the relevancy
# of the question to the given context on a scale from 1 to 5, and parses it as a float.
def extract_score(llm_relevance_score) -> float:
    """Parse the relevance score returned by the LLM.

    The reply is expected to be a bare number, but models sometimes wrap it
    (for example "Relevance Score: 4" or "4/5"). When the whole reply is not a
    number, the first standalone number found in it is used instead.

    Args:
        llm_relevance_score: Raw reply of the relevance check. Usually a string;
            other values are converted with ``str()`` and ``None`` counts as
            "no score".

    Returns:
        The score as a float, or 0.0 when no finite number can be extracted.
        Non-finite values ("nan", "inf", overflowing literals) are rejected:
        ``float()`` accepts them, and ``nan < 4`` is False, so they would
        otherwise pass a ``score < 4`` rejection check.
    """
    # Local imports keep this cell runnable without touching the imports cell.
    import math
    import re

    if llm_relevance_score is None:
        return 0.0

    text = str(llm_relevance_score).strip()
    try:
        score = float(text)
    except ValueError:
        # Fall back to the first number that is not glued to a word or to another
        # number (the lookbehind skips the "2" in "v2" and the "5" in "0.5").
        match = re.search(r"(?<![\w.])[-+]?\d+(?:\.\d+)?", text)
        if match is None:
            return 0.0  # Returning 0.0 as a float instead of an integer
        score = float(match.group())

    return score if math.isfinite(score) else 0.0
In [13]:
# Chooses the final answer from the LLM response based on the relevance score.
# The relevance of the question is checked as a security measure to block prompt injections.
def conditional_answer(llm_response) -> str:
    """Return the generated answer only when the relevance score is high enough.

    Args:
        llm_response: Mapping with a "relevance_score" entry (raw relevance
            reply, parsed via ``extract_score``) and an "answer" entry.

    Returns:
        "I do not know." when the parsed score is below 4, otherwise the value
        stored under "answer".
    """
    parsed_score = extract_score(llm_response["relevance_score"])
    too_low = parsed_score < 4
    if not too_low:
        return llm_response["answer"]
    return "I do not know."

In [14]:
# LLM
# Creates an instance of the ChatOpenAI class from the langchain_openai module,
# which serves as an interface to OpenAI’s language models.
# The same instance is used later for both the relevance check and the answer generation;
# temperature=0 is explained in the note below.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


# The temperature parameter in Large Language Models (LLMs) 
# directly affects the variability and randomness of generated responses. 
# A lower LLM temperature value (close to 0) produces more deterministic and focused outputs, 
# ideal for tasks requiring factual accuracy, such as summarization or translation.

In [15]:
# We use RunnableParallel because we need two different outputs from the LLM for the same input:
# Relevance Score: the LLM checks whether the retrieved context is relevant to the question asked by the user.
# Answer: the LLM gives an answer based on the context it receives.
#
# Input: a dict with "question" and "context" (the retrieved documents).
# Output: a dict with "relevance_score", "answer" and "final_answer".

rag_chain_from_docs = (
    # Step 1: replace the list of retrieved documents under "context" with one formatted string.
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    # Step 2: feed the same {"question", "context"} dict to both branches.
    | RunnableParallel(
        # Branch A: fill in the relevance prompt and have the LLM grade the context.
        {"relevance_score": (
            RunnablePassthrough()
            | (lambda x: relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
            | llm
            | StrOutputParser()
        # Branch B: hydrate the RAG prompt and have the LLM generate the answer.
        ), "answer": (
            RunnablePassthrough()
            | prompt
            | llm
            | StrOutputParser()
        )}
    )
    # Step 3: add "final_answer", computed by conditional_answer from the two branch outputs.
    | RunnablePassthrough().assign(final_answer=conditional_answer)
)

In [18]:
# Entry point of the pipeline: takes the user question as input.
# The retriever fills "context" with the retrieved documents while the question is passed
# through unchanged under "question"; rag_chain_from_docs then adds its result under "answer".
# The retrieved documents stay available under "context", which is how the UI function
# can list the sources.
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [24]:
# Gradio Interface

# Function to be run once submit button is pressed on Gradio UI
def process_question(question):
    """Run the RAG chain for one question and return the values shown in the UI.

    Args:
        question: Text entered in the input box.

    Returns:
        A 3-tuple ``(relevance_score, final_answer, source_list)``. Each value is
        mapped, in order, to the output fields defined in ``gr.Interface``.
        ``source_list`` is a comma-separated string of the distinct sources of
        the retrieved documents. For blank input the chain is not invoked and
        ``("", "Please enter a question.", "")`` is returned.
    """
    # Skip the retrieval/LLM round trip entirely when nothing was typed.
    if not question or not question.strip():
        return "", "Please enter a question.", ""

    result = rag_chain_with_source.invoke(question)

    # "result" is a dictionary, and within that, the "answer" key is also a dictionary
    chain_output = result['answer']
    relevance_score = chain_output['relevance_score']
    final_answer = chain_output['final_answer']

    # Several retrieved chunks can share one source, so list each source once, in
    # retrieval order (dict keys keep insertion order). Chunks without a "source"
    # entry are skipped instead of raising KeyError.
    unique_sources = dict.fromkeys(
        str(doc.metadata['source'])
        for doc in result['context']
        if 'source' in doc.metadata
    )
    source_list = ", ".join(unique_sources)

    # We return three values, and each of them will be mapped to output fields defined in gr.Interface() function
    return relevance_score, final_answer, source_list

# Input field configuration (for user query), pre-filled with an example question
question_input = gr.Textbox(
    label="Enter your question",
    value="What are the advantages of using RAG?",
)

# Output fields configuration: one text box per value returned by process_question,
# in the same order as the returned tuple
result_outputs = [
    gr.Textbox(label=caption)
    for caption in ("Relevance Score", "Final Answer", "Sources")
]

# Wire the function, its input and its outputs into a single Gradio interface
demo = gr.Interface(
    fn=process_question,
    inputs=question_input,
    outputs=result_outputs,
    title="RAG Question Answering",
    description="Enter a question and get the relevance score, final answer, and sources from RAG.",
)

In [None]:
# debug = True -> enables verbose logging during the app's execution
# share = True -> creates sharable public URL
# NOTE(review): the public URL is presumably reachable by anyone who has the link, and every
# question submitted through it runs the chain with your OpenAI key — consider enabling auth.
# After launch, press square icon to stop hosting
# For the jupyter notebook to show the UI, first delete the output if it already exists, and then run the cell again
demo.launch(share = True, debug = True)

# For enabling simple authentication (read the credentials from the environment instead of hardcoding them)
# demo.launch(share = True, debug = True, auth=(os.environ["GRADIO_USER"], os.environ["GRADIO_PASSWORD"]))

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://f0fc1e4a52e7d25786.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
