# Demonstration of Retrieval Augmented Generation with LangChain

## Install required libraries

In [1]:
!pip install openai langchain azure-search-documents==11.4.0b8 azure-identity pypdf tiktoken



## Import libraries

In [2]:
import openai
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

## Configure OpenAI settings

In [3]:
os.environ["OPENAI_API_TYPE"] = "openai"
os.environ["OPENAI_API_BASE"] = "https://api.openai.com/v1"
os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"
os.environ["OPENAI_API_VERSION"] = "2020-11-07"
model: str = "text-embedding-ada-002"

## Configure vector store settings

In [4]:
vector_store_address: str = "YOUR_AZURE_COGNITIVE_SEARCH_ENDPOINT"
vector_store_password: str = "YOUR_AZURE_COGNITIVE_SEARCH_API_KEY"

## Create a new index in the vector store

In [16]:
from azure.search.documents.indexes.models import (
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField
)

embeddings: OpenAIEmbeddings = OpenAIEmbeddings(deployment=model, chunk_size=1)
embedding_function = embeddings.embed_query

fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_configuration="default",
    ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    )
]

index_name: str = "rag-demo-kroger"

vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embedding_function,
    fields=fields
)

## Download some documents

In [21]:
!wget https://ir.kroger.com/files/doc_financials/2023/q1/10qq223.pdf -P "docs/"
!wget https://ir.kroger.com/files/doc_financials/2023/q1/10qq123.pdf -P "docs/"
!wget https://ir.kroger.com/files/doc_financials/2022/q4/10k22.pdf -P "docs/"
!wget https://ir.kroger.com/files/doc_financials/2022/q3/10qq322.pdf -P "docs/"

--2023-10-09 00:45:10--  https://ir.kroger.com/files/doc_financials/2023/q1/10qq223.pdf
Resolving ir.kroger.com (ir.kroger.com)... 162.159.129.11, 162.159.130.11, 2606:4700:7::a29f:820b, ...
Connecting to ir.kroger.com (ir.kroger.com)|162.159.129.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 368222 (360K) [application/pdf]
Saving to: ‘docs/10qq223.pdf’


2023-10-09 00:45:10 (28.4 MB/s) - ‘docs/10qq223.pdf’ saved [368222/368222]

--2023-10-09 00:45:10--  https://ir.kroger.com/files/doc_financials/2023/q1/10qq123.pdf
Resolving ir.kroger.com (ir.kroger.com)... 162.159.130.11, 162.159.129.11, 2606:4700:7::a29f:820b, ...
Connecting to ir.kroger.com (ir.kroger.com)|162.159.130.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 335276 (327K) [application/pdf]
Saving to: ‘docs/10qq123.pdf’


2023-10-09 00:45:11 (29.5 MB/s) - ‘docs/10qq123.pdf’ saved [335276/335276]

--2023-10-09 00:45:11--  https://ir.kroger.com/files/doc_financials/2022

## Define function to estimate number of tokens from documents

In [5]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

## Load documents and split them into chunks

In [6]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import TokenTextSplitter

loader = PyPDFDirectoryLoader("docs/")

documents = loader.load()
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

## Calculate total number of tokens and estimate cost of embedding

In [7]:
total_tokens = 0
token_encoding = "cl100k_base"

for doc in docs:
    total_tokens += num_tokens_from_string(doc.page_content, token_encoding)

print(f"Total number of document chunks = {len(docs)}")
print(f"Total tokens = {total_tokens}")
print(f"Estimated cost of generating embeddings = ${total_tokens/1000.0*.0001:.4f}")

Total number of document chunks = 444
Total tokens = 164959
Estimated cost of generating embeddings = $0.0165


## Add documents and embeddings to vector store

In [8]:
vector_store.add_documents(documents=docs)

['NzU5OWE5OWEtODA2NS00YWNhLTk4ZGEtNmNlOTk3YTk3ZWFl',
 'M2E0ZmM2OWMtNDdiMy00NWViLTgxZDItYTQxOTZiYzQwNTBh',
 'NGFlMjc0MjEtNTc0Yy00MTgyLThiYmYtZTI0NTY3MDk1NTBk',
 'YjdjYjQ5YTItM2FmNS00ZWQ3LWI0MDYtMzlhNjQ0MGI1NGU4',
 'MTY3ODBlYzQtMTkwOC00NWFmLWE3NDctNzUzOWY3YTNlYThl',
 'YzA5MmRjZjUtYmEyZC00Yzg4LThjMzMtY2EzN2RmNzFkZTIz',
 'MWY5Y2ViMTUtMTQzZS00MGZkLTkwYzctYjI2OWVmNmFhMTEz',
 'M2JlYTFlOGEtMmE1ZC00NzgwLWIwYmQtM2NiZmNhMjljZDQ5',
 'YTI5NzQ0MjItMDE0OC00ZTM2LThkYzEtOGNkODcwYWQ0MjVk',
 'MzA4NjJkOWItNGRjOC00OTUxLTlmOWQtMzAxZTQ3MzQ2MWYx',
 'Zjg2YmQyMzctNmFiYS00ZDgzLWE3NWUtNzZkNmYwMjYzYThk',
 'YTA5NjNlMzYtN2NmZC00ODNhLWFmOWUtMWE1MDNjMmNjODZk',
 'ZmQ3MWYxZmUtOTAxNi00ZjFkLWExNGQtZjM5YjUyY2M3MDc5',
 'NmRmM2Y5NjktNGU2ZS00NDQxLWEwNjMtYThiN2NiODA5Mjkz',
 'M2MwNjE0ZTQtMGU1OC00YjY4LTgwOGMtMzdkN2Y1MzMzZWUx',
 'NGJiYTRhZWQtMmY1MC00NDg2LThjNzUtODMxNGYxNWRkNjY0',
 'Y2I1NjQ3OTAtYWQ0OS00ZTdlLWI0ZjUtN2JmMmE2YTY5YjBm',
 'Njc4NmZjZWYtNmExYi00OTk2LWJmNjgtN2M2YzY2YmQ0ZTM3',
 'OWMxY2JmZDAtMWU5Ny00ODY5LTkxNTAtNjQyY2Q0ZTg2

## Perform a vector similarity search

In [6]:
docs = vector_store.similarity_search(
    query="What is the latest status of merger with Albertsons?",
    k=3,
    search_type="similarity",
)

In [7]:
print(docs[1].page_content)

16On September 8, 2023, the Company and Albertsons announced they entered into a definitive agreement with C&S for the
sale of 413 stores, as well as the QFC, Mariano’s and Carrs brand names, eight distribution centers, two offices and certain
other assets in connection with the proposed merger.  In addition, Kroger will divest the Debi Lilly Design, Primo Taglio,
Open Nature, ReadyMeals and Waterfront Bistro private label brands.  All fuel centers and pharmacies associated with the 
divested stores will remain with the stores and continue to operate.  The stores will be divested by the Company following the 
closing of the proposed merger with Albertsons.  The definitive purchase agreement has customary representations and
warranties and covenants of a transaction of its type. The transaction is subject to fulfillment of customary closing conditions, 
including clearance by the Federal Trade Commission (“FTC”) and the completion of the proposed merger.  C&S will pay the 
Company all-c

## Initialize a retriever from the vector store

In [8]:
retriever = vector_store.as_retriever(search_kwargs={'k': 3})

## Create a custom prompt

In [9]:
template = """You are an AI system well-versed in finance designed to answer questions from users in a designated context. 
When presented with a question, you must reply with accuracy to inquiries using only information provided in that same context. 
Please reference the source document at the end of your answer when possible.
If there is ever a situation where you are unsure of the potential answers, simply respond with "I don't know".

Context:
{context}

Question: {question}

Answer:"""

rag_prompt_custom = PromptTemplate.from_template(template=template)

## Initialize LLM and RAG chain

In [10]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} 
    | rag_prompt_custom
    | llm
)

## Ask some questions

In [11]:
response = rag_chain.invoke("Are there any problems regarding the merger with Albertsons?")

In [12]:
response.content

"Yes, there are potential risks and problems associated with the merger with Albertsons. These risks include the expected timing and likelihood of completion of the proposed transaction, potential divestitures that may be imposed by regulators, the occurrence of events that could lead to the termination of the merger agreement, legal proceedings that may be instituted against the parties, the disruption of current plans and operations, the ability to identify and recognize anticipated benefits, the amount of costs and expenses related to the transaction, the potential effect on relationships with associates, suppliers, and competitors, and the potential effect on the market price of securities. These risks are outlined in the risk factors section of the company's documents. (Source: docs/10qq322.pdf, page 32; docs/10k22.pdf, page 12)"

In [13]:
response = rag_chain.invoke("How much money was returned to shareholders as dividend in the first two quarters of 2023?")

In [14]:
response.content

'$376 million was returned to shareholders as dividends in the first two quarters of 2023. (Source: docs/10qq223.pdf, page 20)'

In [15]:
response = rag_chain.invoke("What is the gross margin rate for the second quarter of 2023?")

In [16]:
response.content

'The gross margin rate for the second quarter of 2023 is 21.79%. [Source: docs/10qq223.pdf, page 26]'

In [19]:
response = rag_chain.invoke("What is the value creation model of Kroger and how will that be achieved?")

In [20]:
response.content

"The value creation model of Kroger is based on its omnichannel food retail business, which is built on strategic assets such as its stores, digital ecosystem, Our Brands, and data. By combining these assets with a go-to-market strategy, Kroger aims to deliver a compelling value proposition for its customers. Kroger focuses on building long-term customer loyalty through initiatives such as Fresh, Our Brands, Personalization, and a seamless shopping experience. This approach drives sustainable sales growth in its retail supermarket business, including fuel and health and wellness. The data and traffic generated from these efforts enable Kroger's fast-growing, high operating margin alternative profit businesses. Kroger is evolving from primarily a food retailer into a more diverse, food-first business that is expected to consistently deliver net earnings growth in the future. This will be achieved by growing identical sales without fuel and making strategic investments in customers, asso

In [17]:
response = rag_chain.invoke("What is alzheimer's disease?")

In [18]:
response.content

"I don't know."