#### Retrieval is the centerpiece of the Retrieval-Augmented Generation (RAG) flow

In [1]:
import os
# Read the API key from the environment so it is never hardcoded; raises KeyError if the variable is unset.
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

In [2]:
from langchain.vectorstores import Chroma
# Relative path handed to Chroma as persist_directory when the store is opened.
persist_directory = "./vector_database/chroma/"

In [3]:
from langchain.embeddings import GooglePalmEmbeddings
# Embedding model shared by both vector stores and the SVM retriever in this notebook.
embedding = GooglePalmEmbeddings(google_api_key=GOOGLE_API_KEY)

In [4]:
# Open the Chroma store at persist_directory, using `embedding` as its embedding function.
vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory)

# Report how many entries the collection holds (_collection is a private attribute).
print(vectordb._collection.count())

209


In [5]:
# Three sample sentences: the first two overlap heavily, the third covers a different fact.
texts = [
    "The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).",
    "A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.",
    "A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.",
]

In [6]:
# Build a small vector store from the sample sentences (no persist_directory is given).
smalldb = Chroma.from_texts(texts=texts, embedding=embedding)

In [7]:
# Expect one entry per sentence in `texts`.
print(smalldb._collection.count())

3


In [8]:
# Query used for both the similarity search and the MMR search on smalldb.
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [9]:
# Plain similarity search: return the two sentences closest to the question.
smalldb.similarity_search(question, k=2)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).')]

#### MMR search: Maximal Marginal Relevance
- Retrieves chunks that are both relevant to the query and distinct from one another, so the results cover diverse information.

In [10]:
# MMR search: ask for k=2 results chosen from fetch_k=3 candidates.
smalldb.max_marginal_relevance_search(question, k=2, fetch_k=3)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.')]

In [11]:
# Repeat the comparison on the persisted store, starting with plain similarity search.
question = "what did they say about matlab?"
docs_ss = vectordb.similarity_search(question, k=3)

In [12]:
# Preview the first 100 characters of the top similarity hit.
docs_ss[0].page_content[:100]

'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people '

In [13]:
# Preview the second hit. In the recorded output it matches the first hit's preview,
# so the store appears to contain duplicate chunks.
docs_ss[1].page_content[:100]

'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people '

In [14]:
# Same question through MMR, to compare against the similarity-search hits.
docs_mmr = vectordb.max_marginal_relevance_search(question, k=3)

In [15]:
# Preview the first 100 characters of the top MMR hit.
docs_mmr[0].page_content[:100]

'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people '

In [16]:
# Preview the second MMR hit; in the recorded output it differs from the first hit.
docs_mmr[1].page_content[:100]

'simplicity of this algorithm will let us come back and use it as a building block. Okay? \nBut that’s'

In [17]:
# Question that names a specific lecture, used to demonstrate metadata filtering.
question = "what did they say about regression in the third lecture?"

##### Manually pass the "third lecture" constraint as a metadata filter

In [18]:
# Restrict the search to chunks whose `source` metadata is the Lecture03 PDF.
docs = vectordb.similarity_search(question, k=3, filter={"source": "./docs/MachineLearning-Lecture03.pdf"})

In [19]:
# Show the metadata of each hit to check which source and page it came from.
for hit in docs:
    print(hit.metadata)

{'page': 0, 'source': './docs/MachineLearning-Lecture03.pdf'}
{'page': 14, 'source': './docs/MachineLearning-Lecture03.pdf'}
{'page': 0, 'source': './docs/MachineLearning-Lecture03.pdf'}


#### Infer the metadata from the query itself
- Use an LLM with SelfQueryRetriever to turn the question into a metadata filter plus a query string for the vector search

In [20]:
from langchain.llms import GooglePalm
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [21]:
# Describes the metadata fields passed to SelfQueryRetriever.from_llm.
# The descriptions are plain text, so the allowed `source` values must be quoted
# consistently (the original was missing the closing quote after Lecture02.pdf).
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should be one of './docs/MachineLearning-Lecture01.pdf', './docs/MachineLearning-Lecture02.pdf', or './docs/MachineLearning-Lecture03.pdf'",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]

In [22]:
# Short description of what the stored documents contain, passed to the retriever.
document_content_description = "Lecture notes"

# temperature=0 is requested for the LLM used by the self-query retriever.
llm = GooglePalm(temperature=0, google_api_key=GOOGLE_API_KEY)

retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectordb,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [23]:
# Same question as the manual-filter example; no explicit filter is supplied this time.
question = "what did they say about regression in the third lecture?"

In [24]:
# Run the self-query retriever on the question.
docs = retriever.get_relevant_documents(question)

In [25]:
# Show each hit's metadata to check which source the retriever selected.
for hit in docs:
    print(hit.metadata)

{'page': 10, 'source': './docs/MachineLearning-Lecture03.pdf'}
{'page': 10, 'source': './docs/MachineLearning-Lecture03.pdf'}
{'page': 10, 'source': './docs/MachineLearning-Lecture03.pdf'}


In [26]:
# Display the first returned Document in full.
docs[0]

Document(page_content='answer. You predict that if X is to the right of, sort of, the mid-point here then Y is one \nand then next to the left of that mid-point then Y is zero.  \nSo some people actually do this. Apply linear  regression to classi fication problems and \nsometimes it’ll work okay, but in general it’s actually a pretty bad idea to apply linear \nregression to classification problems like thes e and here’s why. Let’s say I change my \ntraining set by giving you just one more tr aining example all the way up there, right? \nImagine if given this training set is actually  still entirely obvious  what the relationship \nbetween X and Y is, right? It’s ju st – take this value as greate r than Y is one and it’s less \nthen Y is zero. By giving you this additiona l training example it really shouldn’t change \nanything. I mean, I didn’t really convey much  new information. There’s no surprise that \nthis corresponds to Y equals one. But if you now  fit linear regression to thi

#### Contextual compression

- Information most relevant to a query may be buried in a document with a lot of irrelevant text.
- Passing that full document through your application can lead to more expensive LLM calls and poorer responses. Contextual compression is meant to fix this.

In [27]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [28]:
def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Entries are separated by a line of 100 dashes. Each item in ``docs``
    must expose a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))

In [29]:
# LLM (temperature=0 requested) that the extractor uses to compress retrieved documents.
llm = GooglePalm(google_api_key=GOOGLE_API_KEY, temperature=0)
compressor = LLMChainExtractor.from_llm(llm=llm)

In [30]:
# Wrap the store's default retriever so every hit is passed through the compressor.
compressor_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(),
    base_compressor=compressor,
)

In [31]:
# Question used to demonstrate contextual compression.
question = "what did they say about matlab?"

In [32]:
# Retrieve documents through the compression retriever built on the default base retriever.
compressed_docs = compressor_retriever.get_relevant_documents(question)

In [33]:
# In the recorded output, Documents 1 and 2 are identical — presumably the base
# retriever returned duplicate chunks, which compression does not remove.
pretty_print_docs(compressed_docs)

Document 1:

MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to 
plot data. And it's sort of an extremely easy to  learn tool to use for implementing a lot of 
learning algorithms.
----------------------------------------------------------------------------------------------------
Document 2:

MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to 
plot data. And it's sort of an extremely easy to  learn tool to use for implementing a lot of 
learning algorithms.
----------------------------------------------------------------------------------------------------
Document 3:

So for those of you that don't know MATLAB yet, I hope you do learn it. It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you tha

#### Combining MMR with Contextual Compression

In [34]:
# Same compressor, but the base retriever now searches with MMR.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(search_type="mmr"),
    base_compressor=compressor,
)

In [35]:
# Same question as the previous compression example, now through the MMR-based retriever.
question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(question)

In [36]:
# With MMR as the base search, the recorded output contains a single extracted passage.
pretty_print_docs(compressed_docs)

Document 1:

MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to 
plot data. And it's sort of an extremely easy to  learn tool to use for implementing a lot of 
learning algorithms.


#### Note

A vector database is not the only tool for retrieving documents.
The LangChain retriever abstraction also covers other retrieval methods, such as TF-IDF or SVM, though the results may vary.

In [37]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [38]:
# Load the first lecture PDF and merge the text of all its pages into one string.
loader = PyPDFLoader("./docs/MachineLearning-Lecture01.pdf")
pages = loader.load()
all_page_text = [page.page_content for page in pages]
joined_page_text = " ".join(all_page_text)

# Split the merged text into chunks of at most 1500 characters with 150 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
splits = text_splitter.split_text(joined_page_text)

In [39]:
# Build both alternative retrievers over the same text chunks.
# The SVM retriever is given the embedding model; the TF-IDF retriever is built from the texts alone.
svm_retriever = SVMRetriever.from_texts(splits, embedding)
tfidf_retriever = TFIDFRetriever.from_texts(splits)

In [40]:
# Ask the SVM retriever and display its first result.
question = "What are major topics for this class?"
docs_svm = svm_retriever.get_relevant_documents(question)
docs_svm[0]

Document(page_content="let me just check what questions you have righ t now. So if there are no questions, I'll just \nclose with two reminders, which are after class today or as you start to talk with other \npeople in this class, I just encourage you again to start to form project partners, to try to \nfind project partners to do your project with. And also, this is a good time to start forming \nstudy groups, so either talk to your friends  or post in the newsgroup, but we just \nencourage you to try to star t to do both of those today, okay? Form study groups, and try \nto find two other project partners.  \nSo thank you. I'm looking forward to teaching this class, and I'll see you in a couple of \ndays.   [End of Audio]  \nDuration: 69 minutes")

In [41]:
# Ask the TF-IDF retriever the same question and display its first result.
question = "What are major topics for this class?"
docs_tfidf = tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]

Document(page_content="into four major sections. We're gonna talk about four major topics in this class, the first \nof which is supervised learning. So le t me give you an example of that.  \nSo suppose you collect a data set of housing prices. And one of the TAs, Dan Ramage, \nactually collected a data set for me last week to use in the example later. But suppose that \nyou go to collect statistics about how much hous es cost in a certain geographic area. And \nDan, the TA, collected data from housing pr ices in Portland, Oregon. So what you can do \nis let's say plot the square footage of the house against the list price of  the house, right, so \nyou collect data on a bunch of houses. And let' s say you get a data set like this with \nhouses of different sizes that are li sted for different amounts of money.  \nNow, let's say that I'm trying to sell a hous e in the same area as Portland, Oregon as \nwhere the data comes from. Let's say I have a hou se that's this size in square foo