In [1]:
!pip install -U --quiet langchain
!pip install -U --quiet pypdf
!pip install --upgrade --quiet  sentence_transformers > /dev/null
!pip install -U --quiet chromadb

In [2]:
import nltk
nltk.download('punkt')
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import NLTKTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

import os
import shutil

# On-disk location of the Chroma collection built later in the notebook.
persist_directory = './vector_db/chroma/'

# Start every run from an empty store so that re-running the notebook does
# not append the same chunks a second time. ignore_errors mirrors `rm -f`:
# a directory that does not exist yet is not an error.
shutil.rmtree(persist_directory, ignore_errors=True)

# `!set TOKENIZERS_PARALLELISM = true` executed in a throwaway subshell, so
# the variable never reached this kernel. Setting it through os.environ makes
# it visible to libraries loaded in this process.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

[nltk_data] Downloading package punkt to /Users/venkat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Read the source PDF into LangChain documents; the search output further
# down shows each document carrying 'page' and 'source' metadata.
pdf_path = "Machine_learning.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load()

# Sentence-transformers model used to embed both the chunks and the queries.
embedding_model_name = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [4]:
# Split the loaded pages into chunks with the NLTK-based splitter. The
# splitter reports (see the cell output) every chunk that ends up longer
# than the requested chunk_size.
text_splitter = NLTKTextSplitter(chunk_size=200)
page_chunks = text_splitter.split_documents(pages)

# Embed the chunks and store them in a Chroma collection under
# persist_directory.
vectordb = Chroma.from_documents(
    page_chunks,
    embedding=embeddings,
    persist_directory=persist_directory,
)

Created a chunk of size 262, which is longer than the specified 200
Created a chunk of size 234, which is longer than the specified 200
Created a chunk of size 300, which is longer than the specified 200
Created a chunk of size 201, which is longer than the specified 200
Created a chunk of size 312, which is longer than the specified 200
Created a chunk of size 251, which is longer than the specified 200
Created a chunk of size 529, which is longer than the specified 200
Created a chunk of size 203, which is longer than the specified 200
Created a chunk of size 258, which is longer than the specified 200
Created a chunk of size 244, which is longer than the specified 200
Created a chunk of size 353, which is longer than the specified 200
Created a chunk of size 440, which is longer than the specified 200
Created a chunk of size 245, which is longer than the specified 200
Created a chunk of size 289, which is longer than the specified 200
Created a chunk of size 246, which is longer tha

In [11]:
def show_results(title, results):
    """Print a header containing the number of hits, then each hit.

    title   -- label for the search strategy, shown in the header line.
    results -- sequence of documents returned by the vector store.
    """
    print(f"\n{title} (", len(results), "results )\n-- ")
    for doc in results:
        print(doc)


# Despite its name, `vector` holds the natural-language query text; the name
# is kept because later cells refer to it.
vector = "what is the link between computational statistics and machine learning?"

TOP_K = 3          # number of documents to return from each search
MMR_FETCH_K = 20   # size of the candidate pool handed to MMR

# Plain nearest-neighbour search.
search_results = vectordb.similarity_search(vector, k=TOP_K)
show_results("Similarity search", search_results)

# MMR picks k results out of fetch_k candidates, trading relevance against
# diversity. With fetch_k == k (the previous setting) there is nothing to
# choose between, so it could only reorder the same hits as the plain
# similarity search. The pool must be larger than k.
search_results = vectordb.max_marginal_relevance_search(
    vector, k=TOP_K, fetch_k=MMR_FETCH_K
)
show_results("Max Marginal relevance search", search_results)

# NOTE(review): recent Chroma releases are reported to write to
# persist_directory automatically, which would make this call redundant -
# confirm against the installed version before removing it.
vectordb.persist()


Similarity search ( 3 results )
-- 
page_content='Machine learning and statistics are closely related fields in terms of methods , but distinct in their principal\ngoal: statistics draws popul ation inferences from a sample, while machine learning finds generalizable\npredictive patterns.' metadata={'page': 2, 'source': 'Machine_learning.pdf'}
page_content="Although  not all machine learning is statistically based, computational statistics is an\nimportant source of the field's methods ." metadata={'page': 0, 'source': 'Machine_learning.pdf'}
page_content='Some statisticians have adopted methods  from machine learning, leading to a combined field that they call\nstatistical learning.\n\n[31]Generalization\nStatistics\nStatistical Physics' metadata={'page': 2, 'source': 'Machine_learning.pdf'}

Max Marginal relevance search ( 3 results )
-- 
page_content='Machine learning and statistics are closely related fields in terms of methods , but distinct in their principal\ngoal: statistics d

In [12]:
# Metadata filter naming a source that the result metadata shown above never
# contains; the search is therefore expected to return nothing.
source_filter = {"source": "not_valid_source.pdf"}
search_results = vectordb.similarity_search(vector, k=3, filter=source_filter)

print ("No of results: ", len(search_results))

No of results:  0
