## RAG Demo Using Offline Docs

### Import the packages

In [9]:
import warnings
warnings.filterwarnings('ignore')

import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv

# Load variables from the local "env_vars.env" file (located via find_dotenv),
# then hand the OpenAI key to the client library.
# NOTE: do not print the key here; cell output is saved inside the notebook.
env_file = find_dotenv("env_vars.env")
_ = load_dotenv(env_file)
openai.api_key = os.environ["OPENAI_API_KEY"]  # raises KeyError when the variable is unset

### Load the documents

In [4]:
from langchain.document_loaders import PyPDFLoader

# One loader per source PDF; extend this list to index more files.
# NOTE(review): the filename looks doubled ("...pdfInview_June_2023.pdf") -
# confirm it matches the file on disk before changing it.
loaders = [PyPDFLoader("../docs/Inview_June_2023.pdfInview_June_2023.pdf")]

# Flatten everything the loaders return into a single list.
docs = [doc for loader in loaders for doc in loader.load()]

print(len(docs))

6


In [5]:
# Split into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, kept together so they are easy to tune.
splitter_options = {"chunk_size": 1500, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents into overlapping chunks and report how many we got.
splits = text_splitter.split_documents(docs)
print(len(splits))

24


### Store "chunks" as vectors

In [12]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

import shutil

# Directory where the Chroma database files are written.
persist_directory = '../docs/chroma-test/'

# Start from an empty store: delete any database files left by a previous run.
# shutil.rmtree is used instead of a `!rm -rf <path>` shell escape so the cell
# does not depend on an `rm` binary being available and always targets
# `persist_directory` rather than a second hard-coded copy of the path.
# ignore_errors=True means a missing directory is not an error.
shutil.rmtree(persist_directory, ignore_errors=True)

# Embedding model used to turn text chunks into vectors.
embeddings = OpenAIEmbeddings()

In [14]:
# Build the Chroma store from the chunks, using the embedding model and the
# persistence directory configured in the previous cell.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embeddings,
    documents=splits,
)

# Sanity check: number of entries now in the collection (read through the
# private `_collection` attribute).
stored_chunks = vectordb._collection.count()
print(stored_chunks)

24


### Semantic search

In [15]:
# Query the vector store directly (no LLM involved) and request 3 results.
# NOTE: this rebinds `docs`, which until now held the loaded PDF documents;
# re-run the loading cell before re-running the splitter cell.
question = "What industry sectors are recommended for stocks?"

docs = vectordb.similarity_search(question, k=3)
print(len(docs))

3


In [16]:
# Inspect the top-ranked result: raw text of the first search hit
# (shown as a string repr, so line breaks appear as \n).
docs[0].page_content

'versus the\nbenchmark\nEquity Sector Views\nUK\nIndustrials is the largest sector overweight within UK stocks, \ntaking advantage of the de-rating seen across the sector to \npick up quality companies. We favour more internationally \nexposed companies in the sector over those more reliant on \ndomestic UK business. \nWe have continued to add to utilities to boost defensive \nholdings in anticipation of a further weakening in the \nmacroeconomic outlook for the UK. Regulatory uncertainty \nhas reduced in recent months with clarity provided on \nwindfall taxes, earnings resilience remains attractive in an \ninflationary environment, renewable transition programs are \nbeing accelerated and peaking bond yields should prove \nsupportive for the sector.The consumer staples sector has demonstrated resilient \nearnings through this period of high inflation as it has been \nable to price ahead of rising costs in raw materials and labour \nwhile also keeping volumes stable. However, recent ea

In [19]:
# Print the metadata of every search hit to see where each chunk came from.
for d in docs:
    print(d.metadata)

{'page': 3, 'source': '../docs/Inview_June_2023.pdfInview_June_2023.pdf'}
{'page': 4, 'source': '../docs/Inview_June_2023.pdfInview_June_2023.pdf'}
{'page': 1, 'source': '../docs/Inview_June_2023.pdfInview_June_2023.pdf'}


In [21]:
def pretty_print_docs(docs):
    """Print the text of each document under a numbered heading.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.

    Output goes to stdout. Each entry is rendered as ``Document N:`` (N starts
    at 1), a blank line, then the document text; consecutive entries are
    separated by a line of 100 dashes. Returns None.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))
    
# Print every search hit, not just the first one.
pretty_print_docs(docs)

Document 1:

versus the
benchmark
Equity Sector Views
UK
Industrials is the largest sector overweight within UK stocks, 
taking advantage of the de-rating seen across the sector to 
pick up quality companies. We favour more internationally 
exposed companies in the sector over those more reliant on 
domestic UK business. 
We have continued to add to utilities to boost defensive 
holdings in anticipation of a further weakening in the 
macroeconomic outlook for the UK. Regulatory uncertainty 
has reduced in recent months with clarity provided on 
windfall taxes, earnings resilience remains attractive in an 
inflationary environment, renewable transition programs are 
being accelerated and peaking bond yields should prove 
supportive for the sector.The consumer staples sector has demonstrated resilient 
earnings through this period of high inflation as it has been 
able to price ahead of rising costs in raw materials and labour 
while also keeping volumes stable. However, recent earnings 

In [17]:
# Persist the vector DB for RAG.
# Presumably this writes the collection to `persist_directory` (passed when the
# store was created) so it can be reloaded later - confirm against the Chroma docs.
vectordb.persist()

In [18]:
# TODO: Experiment with other embeddings (such as HF, sentence_transformers)

### Retrieval

In [23]:
from langchain.chat_models import ChatOpenAI

# Configure the chat model used for answering (no request is sent in this cell's
# visible code; the model is first used by the chain cells that follow).
# temperature=0 is passed presumably to make answers as repeatable as possible.
llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=llm_name, temperature=0)

In [59]:
# Create QA chain and prompt template
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Prompt: answer only from the supplied context, admit when the answer is
# unknown, and keep the reply to three sentences at most.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Retriever: similarity search over the vector store, 3 results per query.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

# Assemble the QA chain; it is actually run in the cells that follow.
# return_source_documents=True keeps the retrieved chunks in the result.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs=dict(prompt=QA_CHAIN_PROMPT),
    return_source_documents=True,
)

In [60]:
# Alternative questions to try (uncomment one to replace the active question):
# question = "What is the main topic of this document?"
# question = "What industry sectors are recommended for stocks?"
question = "What is the house view between stocks and bonds?"

# The chain is called with the question under the "query" key; the answer
# text is read back from the "result" key.
result = qa_chain({"query": question})
result["result"]

'The house view maintains a slight overweight to equities and a small overweight to fixed income. They may look to reduce exposure to equities once consensus on the asset class turns more bullish.'

In [52]:
# result["source_documents"][2]

In [53]:
# Second query against the same chain, this time about currencies.
question = "What does the document suggest about currencies?"

result = qa_chain({"query": question})
result["result"]

'The document suggests that the US dollar is expected to weaken in the second half of the year, while interest rates in Europe and the UK are not expected to roll over until early 2024. Additionally, the document indicates an increasing exposure to emerging market local currency debt and a slight reduction in emerging market hard currency debt.'

In [58]:
# pretty_print_docs(result["source_documents"])

### Alternative: Compression retrieval

In [66]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Base retriever: the same vector store as before, 3 results per query.
base_retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# Compressor built from the chat model; presumably it trims each retrieved
# chunk down to the parts relevant to the query (see the LangChain docs).
compressor = LLMChainExtractor.from_llm(llm)

# Wrap the vector-store retriever so its results pass through the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=base_retriever,
    base_compressor=compressor,
)

In [67]:
# Retrieve through the compression retriever and print the documents it returns.
question = "What does the document suggest about bonds?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Document 1:

exposure to high yield bonds should be reduced in favour of investment grade corporate bonds.
----------------------------------------------------------------------------------------------------
Document 2:

increased bond yields and the nearing of the end of monetary policy tightening make longer-dated government bonds attractive, including local currency emerging market debt.
----------------------------------------------------------------------------------------------------
Document 3:

No changes were made to our broad asset allocation positioning, reflective of the overall uncertain environment.


In [68]:
# Rebuild the QA chain on top of the compression retriever (same LLM and prompt).
# NOTE: this rebinds `qa_chain`; re-running an earlier question cell after this
# one will go through the compression retriever instead of the plain
# similarity retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=compression_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

# Ask the same bonds question as in the retrieval-only cell, now with an LLM answer.
question = "What does the document suggest about bonds?"
result = qa_chain({"query": question})
result["result"]

'The document suggests reducing exposure to high yield bonds in favor of investment grade corporate bonds, finding longer-dated government bonds and local currency emerging market debt attractive, and maintaining the current asset allocation due to uncertainty.'

### Create a chatbot!!

In [70]:
import panel as pn  # GUI
pn.extension()
