In [1]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.docstore.document import Document

In [2]:
docs= []
for doc in PyPDFDirectoryLoader("data").load():
   
 
    metadata ={
        "source": doc.metadata["source"],
        "page": doc.metadata["page"],
        
    }
    doc_text = ' '.join(doc.page_content.split())
    # Remove newlines and extra spaces
    docs.append(Document(page_content=doc_text,
                                   metadata=metadata))

# PyPDFDirectoryLoader loads all PDFs in a directory

In [3]:
from pprint import pprint
pprint(docs[1])

Document(metadata={'source': 'data/mistral_paper.pdf', 'page': 1}, page_content='Mistral 7B is released under the Apache 2.0 license. This release is accompanied by a reference implementation1 facilitating easy deployment either locally or on cloud platforms such as AWS, GCP, or Azure using the vLLM [17] inference server and SkyPilot 2. Integration with Hugging Face 3 is also streamlined for easier integration. Moreover, Mistral 7B is crafted for ease of fine-tuning across a myriad of tasks. As a demonstration of its adaptability and superior performance, we present a chat model fine-tuned from Mistral 7B that significantly outperforms the Llama 2 13B – Chat model. Mistral 7B takes a significant step in balancing the goals of getting high performance while keeping large language models efficient. Through our work, our aim is to help the community create more affordable, efficient, and high-performing language models that can be used in a wide range of real-world applications. 2 Archite

In [4]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os
from langchain_chroma import Chroma

# The splitting and chunking strategy
from langchain.text_splitter import RecursiveCharacterTextSplitter

load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [5]:
openaiembedding = OpenAIEmbeddings(model="text-embedding-3-small")

  openaiembedding = OpenAIEmbeddings(model="text-embedding-3-small")


In [6]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [7]:
chunked_docs = splitter.split_documents(docs)

In [8]:
pprint(chunked_docs[:3])

[Document(metadata={'source': 'data/mistral_paper.pdf', 'page': 0}, page_content='Mistral 7B Albert Q. Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, Lélio Renard Lavaud, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed Abstract We introduce Mistral 7B, a 7–billion-parameter language model engineered for superior performance and efficiency. Mistral 7B outperforms the best open 13B model (Llama 2) across all evaluated benchmarks, and the best released 34B model (Llama 1) in reasoning, mathematics, and code generation. Our model leverages grouped-query attention (GQA) for faster inference, coupled with sliding window attention (SW A) to effectively handle sequences of arbitrary length with a reduced inference cost. We also provide a model fine-tuned to follow instructions, Mistral 7B – Instruct, that

In [9]:
# create vector DB of docs and embeddings - takes 1 min on Colab
chroma_db = Chroma.from_documents(documents=chunked_docs,
                                  collection_name='capstone_db',
                                  embedding=openaiembedding,
                                  # need to set the distance function to cosine else it uses euclidean by default
                                  # check https://docs.trychroma.com/guides#changing-the-distance-function
                                  collection_metadata={"hnsw:space": "cosine"},
                                  persist_directory="./capstone_db")

In [12]:
chroma_db.similarity_search("What is mistral?", k=3)

[Document(id='ac4d011f-a2b6-463e-9326-a4bfa56a4125', metadata={'page': 1, 'source': 'data/mistral_paper.pdf'}, page_content='Mistral 7B is released under the Apache 2.0 license. This release is accompanied by a reference implementation1 facilitating easy deployment either locally or on cloud platforms such as AWS, GCP, or Azure using the vLLM [17] inference server and SkyPilot 2. Integration with Hugging Face 3 is also streamlined for easier integration. Moreover, Mistral 7B is crafted for ease of fine-tuning across a myriad of tasks. As a demonstration of its adaptability and superior performance, we present a chat model fine-tuned from Mistral 7B that significantly outperforms the Llama 2 13B – Chat model. Mistral 7B takes a significant step in balancing the goals of getting high performance while keeping large language models efficient. Through our work, our aim is to help the community create more affordable, efficient, and high-performing language models that can be used in a wide

In [13]:
similarity_retriever = chroma_db.as_retriever(search_type="similarity_score_threshold",
                                              search_kwargs={"k": 5, "score_threshold": 0.2})

In [18]:
similarity_retriever.invoke("What is attention?", k=5)

[Document(id='7748bdf8-a49b-4cf3-88a0-f23681410bd4', metadata={'page': 2, 'source': 'data/attention_paper.pdf'}, page_content='i. 3.2 Attention An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum 3'),
 Document(id='8d9e1617-7e2d-4b38-a91d-ffb04846fcc9', metadata={'page': 12, 'source': 'data/attention_paper.pdf'}, page_content='Attention Visualizations Input-Input Layer5 It is in this spirit that a majority of American governments have passed new laws since 2009 making the registration or voting process more difficult . <EOS> <pad> <pad> <pad> <pad> <pad> <pad> It is in this spirit that a majority of American governments have passed new laws since 2009 making the registration or voting process more difficult . <EOS> <pad> <pad> <pad> <pad> <pad> <pad> Figure 3: An example of the attention mechanism following long-distance dependencies in th

In [19]:
from langchain_core.prompts import ChatPromptTemplate

prompt = """You are an assistant for question-answering tasks.
            Use the following pieces of retrieved context to answer the question.
            If the answer is not present in the context, just say that you don't know.
            Keep the answer to the point.

            Question:
            {question}

            Context:
            {context}

            Answer:
         """

prompt_template = ChatPromptTemplate.from_template(prompt)

In [24]:
from langchain_openai import ChatOpenAI

chatgpt = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


In [25]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

qa_rag_chain = (
    {
        "context": (similarity_retriever
                      |
                    format_docs),
        "question": RunnablePassthrough()
    }
       |
    prompt_template
      |
    chatgpt
)

In [27]:
from IPython.display import display, Markdown
query = "What is the difference between gemini and mistral?"
result = qa_rag_chain.invoke(query)
display(Markdown(result.content))

Mistral is a 7-billion-parameter language model focused on performance and efficiency, particularly in reasoning, mathematics, and code generation. It uses techniques like grouped-query attention and sliding window attention for improved inference. In contrast, Gemini is a family of multimodal models that handle various types of data, including text, images, and video, and emphasizes societal benefits and risks through impact assessments.