In [1]:
import google.generativeai as genai
import pathlib
import textwrap

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
GOOGLE_API_KEY = 'AIzaSyC3qifQ3POSG_F5Mg43f-iqNdEHFPvwesQ'
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-2.0-flash-exp')

In [3]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key="AIzaSyC3qifQ3POSG_F5Mg43f-iqNdEHFPvwesQ"
)


In [4]:
from langchain.vectorstores import Chroma
persist_directory = 'docs/chroma/'

In [6]:
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings
)

  vectordb = Chroma(


In [7]:
print(vectordb._collection.count())

684


In [None]:
texts = [
    """The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).""",
    """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.""",
    """A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.""",
]

In [14]:
smalldb = Chroma.from_texts(texts,embedding=embeddings)

In [15]:
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [16]:
smalldb.similarity_search(question, k=2)

[Document(metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.')]

### Addressing Diversity: Maximum marginal relevance

Last class we introduced one problem: how to enforce diversity in the search results.
 
`Maximum marginal relevance` strives to achieve both relevance to the query *and diversity* among the results.

In [17]:
smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)

[Document(metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(metadata={}, page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).')]

### Addressing Specificity: working with metadata

In last lecture, we showed that a question about the third lecture can include results from other lectures as well.

To address this, many vectorstores support operations on `metadata`.

`metadata` provides context for each embedded chunk.

In [19]:
question = "what did they say about regression in the third lecture?"

In [21]:
docs = vectordb.similarity_search(
    question,
    k=3,
    filter={"source":"./MachineLearning-Lecture01.pdf"}
)

In [22]:
for d in docs:
    print(d.metadata)

{'creator': 'PScript5.dll Version 5.2.2', 'total_pages': 22, 'moddate': '2008-07-11T11:25:23-07:00', 'creationdate': '2008-07-11T11:25:23-07:00', 'page_label': '7', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'title': '', 'page': 6, 'author': '', 'source': './MachineLearning-Lecture01.pdf'}
{'page': 6, 'author': '', 'page_label': '7', 'moddate': '2008-07-11T11:25:23-07:00', 'source': './MachineLearning-Lecture01.pdf', 'total_pages': 22, 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creationdate': '2008-07-11T11:25:23-07:00', 'title': '', 'creator': 'PScript5.dll Version 5.2.2'}
{'creator': 'PScript5.dll Version 5.2.2', 'source': './MachineLearning-Lecture01.pdf', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'total_pages': 22, 'page_label': '7', 'moddate': '2008-07-11T11:25:23-07:00', 'creationdate': '2008-07-11T11:25:23-07:00', 'title': '', 'page': 6, 'author': ''}


## Other types of retrieval

It's worth noting that vectordb as not the only kind of tool to retrieve documents. 

The `LangChain` retriever abstraction includes other ways to retrieve documents, such as TF-IDF or SVM.

In [23]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [24]:
# Load PDF
loader = PyPDFLoader("./MachineLearning-Lecture01.pdf")
pages = loader.load()
all_page_text=[p.page_content for p in pages]
joined_page_text=" ".join(all_page_text)

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)
splits = text_splitter.split_text(joined_page_text)


In [27]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp313-cp313-win_amd64.whl (10.7 MB)
   ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
   --- ------------------------------------ 1.0/10.7 MB 6.4 MB/s eta 0:00:02
   ---- ----------------------------------- 1.3/10.7 MB 3.4 MB/s eta 0:00:03
   -------- ------------------------------- 2.4/10.7 MB 3.9 MB/s eta 0:00:03
   ---------- ----------------------------- 2.9/10.7 MB 3.3 MB/s eta 0:00:03
   ------------- -------------------------- 3.7/10.7 MB 3.5 MB/s eta 0:00:02
   --------------- ----------------

In [30]:
# Retrieve
svm_retriever = SVMRetriever.from_texts(splits,embeddings)
tfidf_retriever = TFIDFRetriever.from_texts(splits)

In [29]:
question = "what did they say about matlab?"
docs_tfidf=tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]

  docs_tfidf=tfidf_retriever.get_relevant_documents(question)


Document(metadata={}, page_content="yourselves. You can also come and talk to me or the TAs if you want to brainstorm ideas \nwith us.  \nOkay. So one more organizational question. I'm curious, how many of you know \nMATLAB? Wow, cool, quite a lot. Okay. So as part of the — act ually how many of you \nknow Octave or have used Octave? Oh, okay, much smaller number.  \nSo as part of this class, especially in the homeworks, we'll ask you to implement a few \nprograms, a few machine learning algorithms as part of the homeworks. And most of those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people call it a free version of MATLAB, which it sort of is, sort of isn't.  \nSo I guess for those of you that haven't seen MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to \nwrite codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it's s