In [10]:
# %pip install langchain_chroma
# %pip install langchain_openai
# %pip install langchain_google_genai
# %pip install langchain_huggingface
# %pip install langchain_community
# %pip install FlagEmbedding
# !pip install python-dotenv



In [2]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.docstore.document import Document

## Load the files and index them in a vector database


In [3]:
# PyPDFDirectoryLoader loads all PDFs in a directory.
docs = []
for loaded_page in PyPDFDirectoryLoader("/content/data").load():
    # Carry over only the source and page fields of the loader metadata.
    page_meta = {key: loaded_page.metadata[key] for key in ("source", "page")}
    # Remove newlines and extra spaces by re-joining the whitespace-split tokens.
    flat_text = " ".join(loaded_page.page_content.split())
    docs.append(Document(page_content=flat_text, metadata=page_meta))

In [4]:
from pprint import pprint
# Inspect the second loaded document to sanity-check the cleaned text and metadata.
sample_doc = docs[1]
pprint(sample_doc)

Document(metadata={'source': '/content/data/mistral_paper.pdf', 'page': 1}, page_content='Mistral 7B is released under the Apache 2.0 license. This release is accompanied by a reference implementation1 facilitating easy deployment either locally or on cloud platforms such as AWS, GCP, or Azure using the vLLM [17] inference server and SkyPilot 2. Integration with Hugging Face 3 is also streamlined for easier integration. Moreover, Mistral 7B is crafted for ease of fine-tuning across a myriad of tasks. As a demonstration of its adaptability and superior performance, we present a chat model fine-tuned from Mistral 7B that significantly outperforms the Llama 2 13B – Chat model. Mistral 7B takes a significant step in balancing the goals of getting high performance while keeping large language models efficient. Through our work, our aim is to help the community create more affordable, efficient, and high-performing language models that can be used in a wide range of real-world applications. 

In [12]:
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
#from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os
from langchain_chroma import Chroma
from FlagEmbedding import BGEM3FlagModel
# The splitting and chunking strategy
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

# Read key/value pairs from the local .env file into the process environment.
load_dotenv(".env")

# Copy each key to its target environment variable (target -> source name).
# os.environ only accepts str values, so assigning the None returned by
# os.getenv for a missing key raises TypeError; unset keys are skipped and
# reported instead of aborting the whole cell.
_env_key_sources = {
    "OPENAI_API_KEY": "OPENAI_API_KEY",
    "GOOGLE_API_KEY": "GOOGLE_API_KEY",
    "HUGGINGFACEHUB_API_TOKEN": "HUGGINGFACE_TOKEN",
}
for _target_var, _source_var in _env_key_sources.items():
    _key_value = os.getenv(_source_var)
    if _key_value is None:
        print(f"Warning: {_source_var} is not set; {_target_var} was left unchanged.")
    else:
        os.environ[_target_var] = _key_value

In [8]:
# Splitter shared by every vector store: chunk_size 1000 with an overlap of 200.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

In [15]:
# Instantiate the embedding models that will be compared.

# OpenAI embedding models (large and small variants).
open_ai_large = OpenAIEmbeddings(model="text-embedding-3-large")
open_ai_small = OpenAIEmbeddings(model="text-embedding-3-small")

# Hugging Face sentence-transformers model, configured for CPU and with
# embedding normalization switched off.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)
hf = HuggingFaceEmbeddings(model_name=model_name,
                           model_kwargs=model_kwargs,
                           encode_kwargs=encode_kwargs)


In [16]:
# Embedding models under comparison, keyed by the label that is later used in
# the collection and directory names (the bge model is currently left out).
_embeddings_by_name = {
    "open_ai_large": open_ai_large,
    "open_ai_small": open_ai_small,
    "hf": hf,
}
embedding_names = list(_embeddings_by_name)
embedding_list = list(_embeddings_by_name.values())

In [17]:
# Split every cleaned page into chunks using the shared splitter.
chunked_docs = splitter.split_documents(documents=docs)

In [None]:

# chroma_db = Chroma.from_documents(documents=chunked_docs,
#                                   collection_name=f'capstone_db_bge',
#                                   embedding=bge,
#                                   collection_metadata={"hnsw:space": "euclidean"},
#                                   persist_directory=f"./capstone_db_bge")

In [None]:
# Build one persisted Chroma collection per embedding model so the same chunks
# can be retrieved with each model and the results compared.
#
# Explicit, position-based ids make this cell safe to re-run: when no ids are
# given the store assigns fresh random ids, so every re-run would add the same
# chunks again and fill the persisted collection with duplicates.
chunk_ids = [f"chunk-{position}" for position in range(len(chunked_docs))]

for embedding, name in zip(embedding_list, embedding_names):
    print(f"Using embedding: {name}")
    # Create (or update) the Chroma vector store for this embedding model.
    chroma_db = Chroma.from_documents(documents=chunked_docs,
                                      ids=chunk_ids,
                                      collection_name=f'capstone_db_{name}',
                                      embedding=embedding,
                                      collection_metadata={"hnsw:space": "cosine"},
                                      persist_directory=f"./capstone_db_{name}")
    print(f"Chroma DB created for embedding: {name}")


Using embedding: open_ai_large
Chroma DB created for embedding: open_ai_large
Using embedding: open_ai_small
Chroma DB created for embedding: open_ai_small
Using embedding: hf


In [None]:

# chroma_db = Chroma.from_documents(documents=chunked_docs,
#                                   collection_name='capstone_db',
#                                   embedding=openaiembedding,
#                                   # need to set the distance function to cosine else it uses euclidean by default
#                                   # check https://docs.trychroma.com/guides#changing-the-distance-function
#                                   collection_metadata={"hnsw:space": "cosine"},
#                                   persist_directory="./capstone_db")

In [None]:
# Load each persisted collection from disk and run the same probe query
# through it, collecting the hits per embedding name for comparison.
model_name = []         # embedding names processed so far
retriever_details = []  # retrieved documents, parallel to model_name
combined_list = {}      # embedding name -> retrieved documents (a dict, despite the name)

for embedding, name in zip(embedding_list, embedding_names):
    # The directory and collection name must match the ones used when the store
    # was built (./capstone_db_{name}); opening a different directory yields an
    # empty collection rather than an error.
    chroma_db = Chroma(persist_directory=f"./capstone_db_{name}",
                       collection_name=f'capstone_db_{name}',
                       embedding_function=embedding)
    similarity_retriever = chroma_db.as_retriever(search_type="similarity_score_threshold",
                                                  search_kwargs={"k": 5, "score_threshold": 0.3})
    model_name.append(name)
    print(f"Model name: {model_name}")
    # k is already fixed through search_kwargs above, so it is not repeated here.
    similarity = similarity_retriever.invoke("What is attention?")
    retriever_details.append(similarity)
    print(f"Retriever details: {retriever_details}")
    # Record this model's hits directly instead of rebuilding the whole mapping.
    combined_list[name] = similarity

# NOTE: after the loop, similarity_retriever stays bound to the retriever of the
# last entry in embedding_list; later cells that use it query that store.

NameError: name 'embedding_list' is not defined

In [None]:
# combined_list is keyed by the labels in embedding_names (e.g. "open_ai_large"),
# not by the underlying model id, so look the large OpenAI model up by its label.
combined_list['open_ai_large'][0].page_content

'i. 3.2 Attention An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum 3'

In [None]:
# Print every retrieved chunk grouped by embedding model, followed by the
# source file and page it came from.
divider = "-" * 80
for model, hits in combined_list.items():
    print(f"Model: {model}")
    for hit in hits:
        hit_meta = hit.metadata
        print(f"Document: {hit.page_content}")
        print(f"Source: {hit_meta['source']}")
        print(f"Page: {hit_meta['page']}")
        print(divider)

Model: text-embedding-3-large
Document: i. 3.2 Attention An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum 3
Source: data/attention_paper.pdf
Page: 2
--------------------------------------------------------------------------------
Document: we have a theoretical attention span of approximately 131K tokens. In practice, for a sequence length of 16K andW = 4096, changes made to FlashAttention [ 11] and xFormers [ 18] yield a 2x speed improvement over a vanilla attention baseline. Rolling Buffer Cache. A fixed attention span means that we can limit our cache size using a rolling buffer cache. The cache has a fixed size of W, and the keys and values for the timestep i are stored in position i mod W of the cache. As a result, when the position i is larger than W, past values in the cache are overwritten, and the size of the cache stops increa

### From the retrieval results above, we can infer that text-embedding-3-large ranks the most relevant passage at the top.

In [None]:
# Re-run the probe query with the retriever left over from the comparison loop;
# the returned documents are displayed as the cell output.
similarity_retriever.invoke(
    "What is attention?",
    k=5,
)

[Document(id='e213dc17-555a-4842-a9a2-d85ad16472d4', metadata={'page': 2, 'source': 'data/attention_paper.pdf'}, page_content='i. 3.2 Attention An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum 3'),
 Document(id='f6ea3a5c-8027-4cad-8ec5-652746c7edd3', metadata={'page': 12, 'source': 'data/attention_paper.pdf'}, page_content='Attention Visualizations Input-Input Layer5 It is in this spirit that a majority of American governments have passed new laws since 2009 making the registration or voting process more difficult . <EOS> <pad> <pad> <pad> <pad> <pad> <pad> It is in this spirit that a majority of American governments have passed new laws since 2009 making the registration or voting process more difficult . <EOS> <pad> <pad> <pad> <pad> <pad> <pad> Figure 3: An example of the attention mechanism following long-distance dependencies in th

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Instruction text for the question-answering chain. {question} and {context}
# are the template placeholders; the model is told to answer only from the
# supplied context and to list the top 3 context documents.
prompt = """You are an assistant for question-answering tasks.
            Use the following pieces of retrieved context to answer the question.
            If the answer is not present in the context, just say that you don't know.
            Keep the answer to the point. also show the top 3 context documents of the answer.

            Question:
            {question}

            Context:
            {context}

            Answer:
         """

# Build the chat prompt template from the raw instruction string.
prompt_template = ChatPromptTemplate.from_template(prompt)

In [None]:
from langchain_openai import ChatOpenAI

# Chat model that writes the final answer, with temperature set to 0.
chatgpt = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
)


In [None]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Join the page_content of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The contents in their original order, separated by a blank line;
        an empty string when ``docs`` is empty.
    """
    page_texts = [item.page_content for item in docs]
    blank_line = "\n\n"
    return blank_line.join(page_texts)

# RAG pipeline: the incoming question is passed through unchanged and, in
# parallel, sent to the retriever whose hits are flattened by format_docs;
# both values fill the prompt, which is then sent to the chat model.
# NOTE(review): similarity_retriever is whichever retriever the comparison loop
# assigned last — confirm that this is the embedding model intended here.
retrieve_context = similarity_retriever | format_docs
chain_inputs = {
    "context": retrieve_context,
    "question": RunnablePassthrough(),
}
qa_rag_chain = chain_inputs | prompt_template | chatgpt

In [None]:
from IPython.display import display, Markdown
# Ask a sample question and render the model's reply as Markdown.
query = "What is gemini"
result = qa_rag_chain.invoke(query)
answer_markdown = Markdown(result.content)
display(answer_markdown)


Gemini is a family of highly capable multimodal models that includes evaluations at both the model and product levels, focusing on safety mitigations and user experience. It addresses critical policy areas such as hate speech and dangerous content, and adopts a user-centric approach to maximize diversity across various topics and user journeys.

Top 3 context documents:
1. Gemini: A Family of Highly Capable Multimodal Models 7.4.2. Gemini Advanced
2. Gemini: A Family of Highly Capable Multimodal Models 7.4.2. Gemini Advanced
3. Gemini: A Family of Highly Capable Multimodal Models 7.4.2. Gemini Advanced