In [1]:
# MODEL = "llama3.2:latest"
MODEL = "mistral:latest"
CHROMA_PATH = "chroma"
DATA_PATH = "mydocs"


In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

test = os.getenv("TEST")
print(test)


This is a test variable


In [None]:
from langchain_community.llms import Ollama

model = Ollama(model= MODEL)

print(model.invoke("What is the capital of India?"))

  model = Ollama(model= MODEL)


' The capital of India is New Delhi. It\'s important to note that while New Delhi serves as the administrative center for the government, there are actually three "capitals" in India: New Delhi (the capital of the country), Mumbai (the commercial capital), and Thiruvananthapuram (the legislative capital). However, when people refer to the capital of India, they are typically referring to New Delhi.'

In [4]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

def load_documents():
    print("Loading documents")
    loader = PyPDFDirectoryLoader(DATA_PATH)
    documents = loader.load()
    return documents

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def split_text(documents: list[Document]):
    print("Splitting text")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks

In [6]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
import os
import shutil

def save_to_chroma(chunks: list[Document]):
    print("Saving to Chroma")
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, OllamaEmbeddings(model="nomic-embed-text"), persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [7]:
def generate_data_store():
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)

In [8]:
generate_data_store()

Loading documents
Splitting text
Split 450 documents into 2307 chunks.
GENERAL MOTORS, GM, the GM Emblem, CADILLAC,
the CADILLAC Crest & Wreath, and the name DTS are
registered trademarks of General Motors Corporation.
This manual includes the latest information at the time
it was printed. We reserve the right to make changes
after that time without notice. For vehicles ﬁrst sold
in Canada, substitute the name “General Motors
{'producer': 'Acrobat Distiller 7.0 (Windows)', 'creator': 'XPP', 'creationdate': '2007-05-25T15:03:26+00:00', 'subject': '', 'author': 'EDS', 'keywords': '', 'moddate': '2007-05-29T09:21:33-04:00', 'title': 'GM Owner Manuals', 'source': 'mydocs\\DTS Manual.pdf', 'total_pages': 450, 'page': 1, 'page_label': '2', 'start_index': 0}
Saving to Chroma


  chunks, OllamaEmbeddings(model="nomic-embed-text"), persist_directory=CHROMA_PATH


KeyboardInterrupt: 

In [10]:
query_text = "What are airbags?"

# Prepare the DB.
embedding_function = OllamaEmbeddings(model="nomic-embed-text")
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

# Search the DB.
results = db.similarity_search_with_relevance_scores(query_text, k=4)
if len(results) == 0 or results[0][1] < 0.7:
    print(f"Unable to find matching results.")

results

Unable to find matching results.


[]

In [11]:
from langchain.prompts import PromptTemplate

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)
print(prompt.format(context="Here is some context", question="Here is a question"))


Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: Here is some context

Question: Here is a question



In [12]:
from langchain.prompts import ChatPromptTemplate

context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(template)
prompt = prompt_template.format(context=context_text, question=query_text)
print(prompt)

response_text = model.predict(prompt)

sources = [doc.metadata.get("source", None) for doc, _score in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Human: 
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: 

Question: What are airbags?



  response_text = model.predict(prompt)


Response:  Airbags are safety devices found in motor vehicles that are designed to inflate quickly and soften an impact between the occupant and interior surfaces of a vehicle during a collision. The main purpose is to reduce the risk of injury or death in the event of a crash.
Sources: []


In [None]:
from query_data import query_rag

EVAL_PROMPT = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response? 
"""

def query_and_validate(question: str, expected_response: str):
    response_text = query_rag(question)
    prompt = EVAL_PROMPT.format(
        expected_response=expected_response, actual_response=response_text
    )

    model = Ollama(model="mistral")
    evaluation_results_str = model.invoke(prompt)
    evaluation_results_str_cleaned = evaluation_results_str.strip().lower()

    print(prompt)

    if "true" in evaluation_results_str_cleaned:
        # Print response in Green if it is correct.
        print("\033[92m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
        return True
    elif "false" in evaluation_results_str_cleaned:
        # Print response in Red if it is incorrect.
        print("\033[91m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
        return False
    else:
        raise ValueError(
            f"Invalid evaluation result. Cannot determine if 'true' or 'false'."
        )