In [None]:
!pip install openai
!pip install langchain
!pip install -U langchain-community
!pip install pypdf

In [2]:
import os
import openai
import sys

In [None]:
from langchain.document_loaders import PyPDFLoader

# Source PDFs, one loader per file (kept in this order).
pdf_paths = (
    "/content/One Up On Wall Street by Peter Lynch ( PDFDrive ).pdf",
    "/content/The Intelligent Investor ( PDFDrive ).pdf",
)
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Accumulate whatever each loader returns into one flat list.
# A TypeError from a loader is reported and that file is skipped; any other
# exception still propagates.
docs = []
for loader in loaders:
    try:
        loaded = loader.load()
        docs.extend(loaded)
    except TypeError as e:
        print(f"Error loading document: {e}")

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configured with chunk_size 1500 and chunk_overlap 150.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [9]:
# Split the loaded documents in `docs` into chunks with the configured splitter.
splits = text_splitter.split_documents(docs)

In [10]:
# Number of chunks produced by the splitter (last expression, so the cell displays it).
len(splits)

519

### Embeddings

In [None]:
!pip install tiktoken

In [47]:
import getpass
import os
import numpy as np

# Prompt for the key only when it is not already set, so re-running this cell
# (or a full "Run All" in a pre-configured environment) does not block on input.
# getpass reads the key without echoing it into the notebook output.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key: ··········


In [48]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding client built with default constructor arguments.
# NOTE(review): presumably reads OPENAI_API_KEY from the environment — confirm.
embedding = OpenAIEmbeddings()

In [49]:
# Three probe sentences used to compare embeddings pairwise.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [None]:
# One embed_query call per sentence, issued in sentence order.
embedding1, embedding2, embedding3 = [
    embedding.embed_query(text) for text in (sentence1, sentence2, sentence3)
]

In [None]:
# Calculate the pairwise similarity between the sentence embeddings.
# The higher the dot product, the more similar the two embeddings are.
# A notebook cell only displays its last expression, so the three scores are
# collected into one dict; three bare expressions would show only the last one.
similarities = {
    "sentence1 vs sentence2": np.dot(embedding1, embedding2),
    "sentence1 vs sentence3": np.dot(embedding1, embedding3),
    "sentence2 vs sentence3": np.dot(embedding2, embedding3),
}
similarities

### Vectorstores

In [51]:
!pip install chromadb

Successfully installed asgiref-3.8.1 backoff-2.2.1 bcrypt-4.2.0 chroma-hnswlib-0.7.6 chromadb-0.5.7 coloredlogs-15.0.1 deprecated-1.2.14 durationpy-0.7 fastapi-0.115.0 httptools-0.6.1 humanfriendly-10.0 importlib-metadata-8.4.0 kubernetes-31.0.0 mmh3-5.0.0 monotonic-1.6 onnxruntime-1.19.2 opentelemetry-api-1.27.0 opentelemetry-exporter-otlp-proto-common-1.27.0 opentelemetry-exporter-otlp-proto-grpc-1.27.0 opentelemetry-instrumentation-0.48b0 opentelemetry-instrumentation-asgi-0.48b0 opentelemetry-instrumentation-fastapi-0.48b0 opentelemetry-proto-1.27.0 opentelemetry-sdk-1.27.0 opentelemetry-semantic-conventions-0.48b0 opentelemetry-util-http-0.48b0 overrides-7.7.0 posthog-3.6.6 pypika-0.48.9 starlette-0.38.5 uvicorn-0.30.6 uvloop-0.20.0 watchfiles-0.24.0 websockets-13.0.1


In [52]:
from langchain.vectorstores import Chroma

In [53]:
# Directory handed to Chroma as its persist_directory. The path is relative, so
# it resolves against the kernel's current working directory.
persist_directory = "content/chroma/"

In [54]:
!rm -rf ./docs/chroma  # remove old database files if any

In [None]:
# Build the Chroma vector store from the chunks in `splits`, using `embedding`
# to vectorise them and `persist_directory` as the on-disk location.
# The result must be bound to `vectordb`: that is the name the cells that
# count and query the store refer to.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Print the count reported by the vector store's underlying collection.
# NOTE(review): `_collection` is a private attribute — confirm there is no public equivalent.
print(vectordb._collection.count())

### Similarity Search

In [None]:
# Query text used for the similarity search.
question = "is there an email i can ask for help"

In [None]:
# k=3: return the 3 chunks most similar to the question.
# NOTE(review): this rebinds `docs`, which earlier held the loaded PDF documents;
# re-running the splitting cell after this one would operate on the search results.
docs = vectordb.similarity_search(question,k=3)

In [None]:
# Number of search results returned (k=3 was requested).
len(docs)

In [None]:
# Text of the first search result.
docs[0].page_content

In [None]:
# Print the metadata attached to each retrieved chunk.
# NOTE(review): presumably the source path and page number set by the PDF loader — confirm.
for doc in docs:
    print(doc.metadata)