# Conclusion

In [1]:
import tomli, os
# Read the Streamlit secrets file (opened in binary mode for tomli.load) and
# publish the OpenAI key through the process environment.
with open("../.streamlit/secrets.toml", "rb") as secrets_file:
    secrets = tomli.load(secrets_file)

os.environ["OPENAI_API_KEY"] = secrets["OPENAI_API_KEY"]

## Question answering on chapters

In [None]:
# Chapter titles in book order; a title's position (counting from 1) is its chapter number.
chapter_titles = [
    'Intro',
    'The ChatGPT API',
    'Chaining & Summarization',
    'Vector search & Question Answering',
    'Agents & Tools',
    'Speech-to-Text & Text-to-Speech',
    'Vision',
    'DALL-E',
    'Conclusion',
    'Appendix',
]

# Map each chapter key ('chap1' ... 'chap10') to the file name of that chapter's PDF.
pdf = {
    f'chap{number}': f'Chap {number} - {title}.pdf'
    for number, title in enumerate(chapter_titles, start=1)
}

In [None]:
# Merge all chapters into one big PDF
from pypdf import PdfWriter

# Each chapter PDF is read from ../<chapter key>/<file name>;
# entries whose file name is empty are skipped.
pdfs = [f'../{key}/{file_name}' for key, file_name in pdf.items() if file_name]

merger = PdfWriter()
# The loop variable must not be called `pdf`: that would overwrite the chapter
# dictionary read just above, and re-running this cell would then fail.
for chapter_path in pdfs:
    merger.append(chapter_path)
merger.write('book.pdf')
merger.close()

## Simple vector search over numpy arrays

In [33]:
from pypdf import PdfReader

# Open the merged book and pull the text out of every page.
pdf_reader = PdfReader("book.pdf")

# Pages that yield no text are dropped, so a position in `pages` matches the
# PDF page position only as long as nothing was dropped before it.
page_texts = (pdf_page.extract_text() for pdf_page in pdf_reader.pages)
pages = [page_text for page_text in page_texts if page_text]

print(f"Extracted {len(pages)} pages from the PDF.")


Extracted 140 pages from the PDF.


In [38]:
import numpy as np
# Persist the extracted page texts next to the notebook
np.save(file='pages.npy', arr=pages)

In [12]:
import openai
def get_embedding(text, model="text-embedding-3-small"):
    """Return the OpenAI embedding vector for ``text``.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = text.replace("\n", " ")
    response = openai.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

In [None]:
# `pages` is the list of text chunks extracted from the document.
# Embed them one request at a time; the result keeps the order of `pages`,
# so embeddings[i] is the vector for pages[i].
embeddings = [get_embedding(page_text) for page_text in pages]

In [None]:
import numpy as np
# Stack the per-page embedding lists into one numpy array ...
embeddings_array = np.array(embeddings)

# ... and write it to disk so later sections can reload it without new API calls
np.save(file='embeddings.npy', arr=embeddings_array)

In [None]:
# Embed the search question through the same helper (and default model) as the pages
query = "What are vector databases"
query_embedding = get_embedding(text=query)
query_embedding

[-0.004794903565198183,
 0.05458392575383186,
 0.01885128952562809,
 -0.012544726021587849,
 -0.028400056064128876,
 -0.010629501193761826,
 0.00451787980273366,
 0.03127289563417435,
 -0.027100440114736557,
 0.03266827389597893,
 0.0018092039972543716,
 0.0044768392108380795,
 0.05086291581392288,
 -0.04396810382604599,
 0.022148214280605316,
 0.008488552644848824,
 0.00016726159083191305,
 0.021819889545440674,
 -0.038359228521585464,
 -0.009275163523852825,
 0.033188119530677795,
 0.012154840864241123,
 0.017195988446474075,
 -0.030807768926024437,
 -0.012989331968128681,
 0.00573541596531868,
 0.0015869010239839554,
 0.055131129920482635,
 0.045007798820734024,
 -0.04651261866092682,
 -0.009446165524423122,
 -0.03540430963039398,
 0.014555713161826134,
 0.010670541785657406,
 0.0026898656506091356,
 0.03901587799191475,
 0.014651474542915821,
 -0.011723915114998817,
 0.016908705234527588,
 0.015403884463012218,
 0.04391338303685188,
 0.012565246783196926,
 0.012893571518361568,
 0.

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector (any sequence accepted by ``np.dot``).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero norm
        the ratio is undefined, so ``0.0`` is returned instead of the ``nan``
        (plus RuntimeWarning) a plain division would give; a ``nan`` score
        would make any later sort over the scores unreliable.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# A small hand-written corpus used to try the similarity search on a toy example.
# Each sentence is written once and embedded once, so the stored text and the
# embedded text cannot drift apart.
sample_texts = [
    "Vector databases are specialized database systems designed to store and query high-dimensional vectors efficiently.",
    "Embeddings are numerical representations of data that capture semantic meaning.",
    "Vector search algorithms find the most similar vectors to a query vector.",
    "Traditional databases store structured data in tables with rows and columns.",
    "Machine learning models often use vector representations for various tasks.",
]
# List of (text, embedding) tuples
documents = [(text, get_embedding(text)) for text in sample_texts]

# Calculate similarities between the query embedding and every document
similarities = [(doc[0], cosine_similarity(query_embedding, doc[1])) for doc in documents]

# Sort by similarity (descending order) and get top 3
top_3 = sorted(similarities, key=lambda x: x[1], reverse=True)[:3]

# Print results; the header is built from `query` so it always names the text
# that was actually embedded
print(f"Top 3 most relevant documents for the query '{query}':")
for i, (doc, similarity) in enumerate(top_3, 1):
    print(f"{i}. Similarity: {similarity:.4f}")
    print(f"   Document: {doc}\n")

Top 3 most relevant documents for the query 'What are vector databases':
1. Similarity: 0.7385
   Document: Vector databases are specialized database systems designed to store and query high-dimensional vectors efficiently.

2. Similarity: 0.4550
   Document: Vector search algorithms find the most similar vectors to a query vector.

3. Similarity: 0.4508
   Document: Machine learning models often use vector representations for various tasks.



In [None]:
# Score each stored embedding against the query embedding, keeping the order of `embeddings`
similarities = [
    cosine_similarity(query_embedding, np.array(page_embedding))
    for page_embedding in embeddings
]
similarities

[0.16215622036322738,
 0.17145976988775258,
 0.15923143441854304,
 0.13022300221615743,
 0.12165349097525548,
 0.04502390106713089,
 0.13079414002672296,
 0.1498863486413091,
 0.11602750428076539,
 0.053510664524484934,
 0.119763994430735,
 0.08304524190681606,
 0.056212604046999556,
 0.07577709264421677,
 0.06504991808611697,
 0.08207590147271444,
 0.04733788594631328,
 0.027382450792499465,
 0.060581859369666084,
 0.10185981398758036,
 0.15574117554246242,
 0.10880576987309161,
 0.14232661193508547,
 0.12305350116010491,
 0.2239687789353034,
 0.09286205444792774,
 0.11013406307523325,
 0.12130004576485753,
 0.14338805992743436,
 0.14146749032126446,
 0.02212598904722134,
 0.11721511943481404,
 0.12193345548012653,
 0.33780084365861857,
 0.1552791666652065,
 0.04333084949771576,
 0.08575623575992562,
 0.2665729627618522,
 0.43644907859135296,
 0.23764732675936032,
 0.202680399921547,
 0.3535982836226771,
 0.10966594589725924,
 0.1425760042735195,
 0.41379377110015053,
 0.1399540348576

In [None]:
# Pair every score with its position, order the pairs best-first,
# and keep only the positions of the three best
ranked = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)
top_3_indices = [position for position, _ in ranked[:3]]
top_3_indices


[70, 38, 47]

In [None]:
# Report the three best-matching pages together with their scores
print(f"Top 3 most relevant documents for the query '{query}':")
for rank, page_idx in enumerate(top_3_indices, start=1):
    score = similarities[page_idx]
    preview = pages[page_idx][:100]  # only the first 100 characters of the page
    print(f"{rank}. Similarity: {score:.4f}")
    print(f"   PDF Page: {page_idx}")
    print(f"   Document: {preview}...\n")

Top 3 most relevant documents for the query 'What are vector databases':
1. Similarity: 0.4999
   PDF Page: 70
   Document: 22 
  
messages = list(client.beta.threads.messages.list( thread_id =thread.id, 
run_id=run.id))  
 ...

2. Similarity: 0.4364
   PDF Page: 38
   Document: 6 
 All those limitations motivate the need for another class of search, called vector search.  
4.2...

3. Similarity: 0.4305
   PDF Page: 47
   Document: 15 
 Finally, after trying several vector databases, you can build a production system with Pinecone...



## FAISS

In [None]:
import faiss

In [None]:
import numpy as np
# Step 1: Reload the saved embeddings from disk as a numpy array
embeddings = np.load(file='embeddings.npy')  # (140 vectors, each 1536-dimensional)

In [None]:
# Step 2: Create a FAISS index (FlatL2 = search by Euclidean / L2 distance)
# FAISS operates on contiguous float32 matrices, so convert explicitly instead of
# relying on whatever dtype was stored in the .npy file.
vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
d = vectors.shape[1]  # Dimensionality read from the data (1536 for the embeddings saved above)
# Other index types exist, e.g. IndexFlatIP (inner product), which matches
# cosine similarity only when the vectors are normalised to unit length.
index = faiss.IndexFlatL2(d)

# Step 3: Add the embeddings to the index
index.add(vectors)  # The index now contains one vector per row of `embeddings`


In [None]:
# Step 4: Search the index with the embedding of a new question
question = "What are agents"
# One row per query, in float32 as FAISS expects. The array gets its own name
# so the text variable `query` used by other cells is not replaced by an array.
query_vector = np.array([get_embedding(question)], dtype=np.float32)
k = 5  # Number of nearest neighbors to search for
distances, indices = index.search(query_vector, k)

# Display results
print("Nearest neighbors (indices):", indices)
print("Distances to neighbors:", distances)

Nearest neighbors (indices): [[49 50 54 51 58]]
Distances to neighbors: [[1.0562747 1.2381678 1.3184009 1.3842411 1.4155457]]


In [None]:
# Show the first 100 characters of each page returned for the (single) query row
for page_idx in indices[0].tolist():
    print(f'Page {page_idx}:')
    print(pages[page_idx][:100])

Page 49:
1 
 5. Agents & Tools  
Large Language Models are one (big) step in the direction of what is called 
Page 50:
2 
 Here is a simple representation of what is happening here:  
 
5.1.1.  Smith : my pedagogical  a
Page 54:
6 
 What is the difference between an Agent and a Pipeline in Haystack?  A Pipeline is a one -pass s
Page 51:
3 
 Answer the user request given the following information retrieved from an 
internet search:  
  
Page 58:
10 
 5.3. OpenAI Assistants  
In November 2023  at their first DevDay , OpenAI introduced the notion


## LangChain + ChromaDB

In [4]:
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# Chat model handed to the RetrievalQA chain as its `llm`
chat = ChatOpenAI(model_name='gpt-4o-mini')

In [None]:
# NOTE: from here on `pages` and `embeddings` are rebound. Earlier cells used them
# for the plain-text page list and the numpy embedding matrix; below they are
# the LangChain split documents and the OpenAI embedding client.
embeddings = OpenAIEmbeddings()

# Load the merged book and split it into LangChain documents
loader = PyPDFLoader('book.pdf')
pages = loader.load_and_split()

In [5]:
# Create the vector store to use as the index, stored on disk under ./chroma
# NOTE(review): re-running this cell against an existing ./chroma directory may
# add the same documents a second time -- confirm before re-executing.
db = Chroma.from_documents(pages, embeddings, persist_directory="./chroma")

In [7]:
# Question to answer from the book
query = "What are the vector databases mentioned in the book?"

# Retriever that returns the 3 most similar chunks from the Chroma store
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Question-answering chain: retrieved chunks are passed to the chat model,
# and the source documents are returned alongside the answer
qa = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

result = qa.invoke({"query": query})
result

{'query': 'What are the vector databases mentioned in the book?',
 'result': 'The vector databases mentioned in the book are Chroma, Pinecone, Weaviate, Faiss, Qdrant, and MongoDB.',
 'source_documents': [Document(page_content='15 \n Finally, after trying several vector databases, you can build a production system with Pinecone like this:  \n \nAs you will see in a future chapter, this application21 can be nicely architectured with plugins, that clearly \ndefine the API with two main endpoints: upsert  (to update or insert the vector database), or query  that \nwill convert the prompt into an embedding and perform a vector search to find the N (= 5) closest ones.  \n4.3. Application: Question answering on Book  \nThis is what the app will look like, a simple text entry and a button to trigger the workflow. The answer \nwill be written in the body, with sources from the document corpus. Check out the code under \nchap4/qa_app.py  \n \n \n21 https://github.com/pinecone -io/examples/blob/

## LangChain + FAISS

In [3]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

In [5]:
# Embedding client used to vectorise the documents for the FAISS store
embeddings = OpenAIEmbeddings()

# Load the merged book again and split it into LangChain documents
loader = PyPDFLoader('book.pdf')
pages = loader.load_and_split()

In [6]:
# Build a LangChain FAISS vector store from the split documents and the embedding client.
# NOTE: this rebinds `index`, which the earlier "FAISS" section used for the raw faiss index.
index = FAISS.from_documents(pages, embeddings)

In [7]:
# Ask the FAISS store for the 3 documents most similar to the question
question = "What is an agent?"
docs = index.similarity_search(question, k=3)

In [9]:
# One line per retrieved document: "<page from metadata>: <first 100 characters>"
for hit in docs:
    page_label = str(hit.metadata["page"]) + ":"
    print(page_label, hit.page_content[:100])

49: 1 
 5. Agents & Tools  
Large Language Models are one (big) step in the direction of what is called 
54: 6 
 What is the difference between an Agent and a Pipeline in Haystack?  A Pipeline is a one -pass s
50: 2 
 Here is a simple representation of what is happening here:  
 
5.1.1.  Smith : my pedagogical  a


In [11]:
# Save the LangChain FAISS store locally, passing "faiss" as the target
# (presumably a folder path -- confirm against the LangChain documentation)
index.save_local("faiss")