In [1]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.docstore.document import Document

## Load the files and index them in a vector database


In [2]:
docs= []
for doc in PyPDFDirectoryLoader("data").load():
   
 
    metadata ={
        "source": doc.metadata["source"],
        "page": doc.metadata["page"],
        
    }
    doc_text = ' '.join(doc.page_content.split())
    # Remove newlines and extra spaces
    docs.append(Document(page_content=doc_text,
                                   metadata=metadata))

# PyPDFDirectoryLoader loads all PDFs in a directory

In [3]:
from pprint import pprint
pprint(docs[1])

Document(metadata={'source': 'data/mistral_paper.pdf', 'page': 1}, page_content='Mistral 7B is released under the Apache 2.0 license. This release is accompanied by a reference implementation1 facilitating easy deployment either locally or on cloud platforms such as AWS, GCP, or Azure using the vLLM [17] inference server and SkyPilot 2. Integration with Hugging Face 3 is also streamlined for easier integration. Moreover, Mistral 7B is crafted for ease of fine-tuning across a myriad of tasks. As a demonstration of its adaptability and superior performance, we present a chat model fine-tuned from Mistral 7B that significantly outperforms the Llama 2 13B – Chat model. Mistral 7B takes a significant step in balancing the goals of getting high performance while keeping large language models efficient. Through our work, our aim is to help the community create more affordable, efficient, and high-performing language models that can be used in a wide range of real-world applications. 2 Archite

In [2]:
from langchain_openai import OpenAIEmbeddings
#from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os
from langchain_chroma import Chroma

# The splitting and chunking strategy
from langchain.text_splitter import RecursiveCharacterTextSplitter

load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [4]:
openaiembedding = OpenAIEmbeddings(model="text-embedding-3-small")

In [6]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [7]:
chunked_docs = splitter.split_documents(docs)

In [8]:
pprint(chunked_docs[:3])

[Document(metadata={'source': 'data/mistral_paper.pdf', 'page': 0}, page_content='Mistral 7B Albert Q. Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, Lélio Renard Lavaud, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed Abstract We introduce Mistral 7B, a 7–billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms the best open 13B model (Llama 2) across all evaluated benchmarks, and the best released 34B model (Llama 1) in reasoning, mathematics, and code generation. Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SW A) to effectively handle sequences of arbitrary length with a reduced inference cost. We also provide a model fine-tuned to follow instructions, Mistral 7B – Instruct, that

In [None]:

chroma_db = Chroma.from_documents(documents=chunked_docs,
                                  collection_name='capstone_db',
                                  embedding=openaiembedding,
                                  # need to set the distance function to cosine else it uses euclidean by default
                                  # check https://docs.trychroma.com/guides#changing-the-distance-function
                                  collection_metadata={"hnsw:space": "cosine"},
                                  persist_directory="./capstone_db")

In [5]:
# load from disk
chroma_db = Chroma(persist_directory="./capstone_db",
                   collection_name='capstone_db',
                   embedding_function=openaiembedding)

In [18]:
similarity_retriever = chroma_db.as_retriever(search_type="similarity_score_threshold",
                                              search_kwargs={"k": 5, "score_threshold": 0.3})

In [19]:
similarity_retriever.invoke("What is attention?", k=5)

[Document(id='7748bdf8-a49b-4cf3-88a0-f23681410bd4', metadata={'page': 2, 'source': 'data/attention_paper.pdf'}, page_content='i. 3.2 Attention An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum 3'),
 Document(id='aa31e5d0-bc89-4720-84ca-a2356944cae4', metadata={'page': 2, 'source': 'data/attention_paper.pdf'}, page_content='i. 3.2 Attention An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum 3'),
 Document(id='e904b965-55e8-4b73-9398-b67be569ed1b', metadata={'page': 2, 'source': 'data/attention_paper.pdf'}, page_content='i. 3.2 Attention An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vecto

In [20]:
from langchain_core.prompts import ChatPromptTemplate

prompt = """You are an assistant for question-answering tasks.
            Use the following pieces of retrieved context to answer the question.
            If the answer is not present in the context, just say that you don't know.
            Keep the answer to the point. also show the top 3 context documents of the answer.

            Question:
            {question}

            Context:
            {context}

            Answer:
         """

prompt_template = ChatPromptTemplate.from_template(prompt)

In [21]:
from langchain_openai import ChatOpenAI

chatgpt = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


In [22]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

qa_rag_chain = (
    {
        "context": (similarity_retriever
                      |
                    format_docs),
        "question": RunnablePassthrough()
    }
       |
    prompt_template
      |
    chatgpt
)

In [23]:
from IPython.display import display, Markdown
query = "What is gemini"
result = qa_rag_chain.invoke(query)
display(Markdown(result.content))


Gemini is a family of highly capable multimodal models that includes evaluations at both the model and product levels, focusing on safety mitigations and user experience. It addresses critical policy areas such as hate speech and dangerous content, and adopts a user-centric approach to maximize diversity across various topics and user journeys.

Top 3 context documents:
1. Gemini: A Family of Highly Capable Multimodal Models 7.4.2. Gemini Advanced
2. Gemini: A Family of Highly Capable Multimodal Models 7.4.2. Gemini Advanced
3. Gemini: A Family of Highly Capable Multimodal Models 7.4.2. Gemini Advanced

## Experiment with different embedding models 

In [None]:
openaiembedding = OpenAIEmbeddings(model="text-embedding-3-small")