In [1]:
# Metadata can act as a link between documents:
# when a large document is split into smaller ones, the metadata tells which pieces are related.

# Metadata can also store information about a document's source, such as the URL or file path it was retrieved from.

from langchain_core.documents import Document
# Each entry pairs a sentence about a pet with the source tag that is stored
# in that document's metadata.
_pet_facts = [
    ("Dogs are great companions, known for their loyalty and friendliness.", "mammal-pets-doc"),
    ("Cats are independent pets that often enjoy their own space.", "mammal-pets-doc"),
    ("Goldfish are popular pets for beginners, requiring relatively simple care.", "fish-pets-doc"),
    ("Parrots are intelligent birds capable of mimicking human speech.", "bird-pets-doc"),
    ("Rabbits are social animals that need plenty of space to hop around.", "mammal-pets-doc"),
]

# Build one Document per entry, keeping the order of the list above.
documents = [
    Document(page_content=text, metadata={"source": source})
    for text, source in _pet_facts
]

In [None]:
# Vector stores
# A vector store uses a similarity metric over embeddings to find related data.
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Name of the sentence-transformers checkpoint used to compute the embeddings.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
embeddings_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [7]:
# Index the documents in Chroma using `embeddings_model` for the vectors.
# Stable, position-based ids are supplied so that re-running this cell in the
# same kernel writes to the same entries instead of appending a second copy of
# every document to the collection (which would surface as duplicate search hits).
vectorstore = Chroma.from_documents(
    documents,
    embedding=embeddings_model,
    ids=[f"pet-doc-{position}" for position in range(len(documents))],
)

In [8]:
# Fetch the stored documents that are closest in meaning to the query text.
vectorstore.similarity_search(query='cat')

[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Parrots are intelligent birds capable of mimicking human speech.', metadata={'source': 'bird-pets-doc'}),
 Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'})]

In [9]:
# Same search, but every hit comes back as a (Document, score) pair.
# The score is a distance: the best match carries the smallest value.
vectorstore.similarity_search_with_score(query='cat')

[(Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
  1.025897741317749),
 (Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
  1.434983491897583),
 (Document(page_content='Parrots are intelligent birds capable of mimicking human speech.', metadata={'source': 'bird-pets-doc'}),
  1.677903175354004),
 (Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
  1.7785110473632812)]

In [12]:
# Search by vector:
# embed the query text ourselves, then hand the raw embedding to the store.
test_vector = embeddings_model.embed_query(text='fish')
vectorstore.similarity_search_by_vector(embedding=test_vector)

[Document(page_content='Goldfish are popular pets for beginners, requiring relatively simple care.', metadata={'source': 'fish-pets-doc'}),
 Document(page_content='Parrots are intelligent birds capable of mimicking human speech.', metadata={'source': 'bird-pets-doc'}),
 Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'})]

In [14]:
# Retrievers
# A VectorStore object is not a subclass of Runnable, so it cannot be used in a chain directly.
# LangChain retrievers are Runnables,
# and we can also create our own Runnable around a vector-store method.
from langchain_core.runnables import RunnableLambda

# Wrap the search method and bind k=1 so each query returns only its top result.
custom_retivers = RunnableLambda(vectorstore.similarity_search).bind(k=1)

# A notebook renders only the final expression of a cell, so the single-query
# result is shown explicitly; `display` is the helper the IPython kernel makes
# available inside every cell.
display(custom_retivers.invoke('cat'))
custom_retivers.batch(['cat', 'fish'])

[[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'})],
 [Document(page_content='Goldfish are popular pets for beginners, requiring relatively simple care.', metadata={'source': 'fish-pets-doc'})]]

In [15]:
# Alternative: ask the vector store itself for a retriever,
# configured for similarity search that returns the single best match per query.
retriver = vectorstore.as_retriever(search_type='similarity', search_kwargs=dict(k=1))
retriver.batch(['cat', 'fish'])

[[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'})],
 [Document(page_content='Goldfish are popular pets for beginners, requiring relatively simple care.', metadata={'source': 'fish-pets-doc'})]]

In [16]:
# Use with a RAG application
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail early with a readable message when the key is absent. The previous
# `os.environ[...] = os.getenv(...)` round-trip raised an opaque TypeError in
# that case and changed nothing when the key was present.
if not os.getenv('GROQ_API_KEY'):
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to your environment or to a .env file "
        "before creating the Groq chat model."
    )

# NOTE(review): confirm this model id is still served by Groq; hosted model ids get retired over time.
llm = ChatGroq(model='llama3-8b-8192')

In [17]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prompt text with two placeholders: the user's question and the retrieved context.
message = """
Answer this questions using the provided context only 
{question}
context:{context}
"""

prompt = ChatPromptTemplate.from_messages([('human', message)])

# The incoming string feeds both branches: the retriever turns it into context,
# while the passthrough forwards it unchanged as the question.
chain_inputs = {'context': retriver, 'question': RunnablePassthrough()}
reg_chain = chain_inputs | prompt | llm

In [20]:
# Run the full chain on one question and print only the text of the model's reply.
question = 'dogs are known for which thing?'
response = reg_chain.invoke(question)
print(response.content)

According to the context, dogs are known for their loyalty and friendliness.
