In [19]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

from dotenv import load_dotenv
import os
import shutil
from sentence_transformers import SentenceTransformer

from typing import Optional, List

import chromadb
from chromadb.utils import embedding_functions
from openai_client import client

In [20]:
# Load environment variables. Assumes that the project contains a .env file
# with the API keys.
load_dotenv()

# ---- OpenAI API key
# The key is not set in this notebook: the assignment below is left disabled
# and the chat client is imported ready-made from the local `openai_client`
# module.
# NOTE(review): presumably that module reads the key from the environment
# loaded above - confirm. If the line is re-enabled, change "OPENAI_API_KEY"
# to the variable name used in your .env file.
# openai.api_key = os.environ['OPENAI_API_KEY']

CHROMA_PATH = "chroma"  # Directory handed to Chroma for its database files
CHROMA_COLLECTION = "alice"  # Name of the Chroma collection that stores the chunks
DATA_PATH = "data/"  # Default directory scanned for source documents
EMBEDDINGS_MODEL="all-MiniLM-L6-v2"  # Model name passed to the SentenceTransformer embedding function

In [21]:
# Start from a clean slate: remove any database directory left by a previous run.
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

# Set up a Chroma client with local persistence.
# PersistentClient is used because chromadb.Client(Settings(persist_directory=...))
# keeps its data in memory unless persistence is explicitly enabled, so nothing
# would actually be stored under CHROMA_PATH.
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)

# Use SentenceTransformer for embeddings
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDINGS_MODEL
)

# Load or create the collection that the helper functions below read and write.
collection_name = CHROMA_COLLECTION
collection = chroma_client.get_or_create_collection(
    name=collection_name,
    embedding_function=sentence_transformer_ef
)

# Clear db system cache if any.
# NOTE(review): presumably this is here so that re-running the cell builds a
# fresh backend instead of reusing a cached one - confirm against the
# installed chromadb version.
chroma_client.clear_system_cache()

In [22]:
def load_documents(data_path: str=DATA_PATH)->List[Document]:
    """
    Read the Markdown files of a directory into Document objects.

    Args:
        data_path: Directory handed to the loader; files are selected with
            the ``*.md`` glob.

    Returns:
        The list of Document objects produced by the loader.
    """
    markdown_loader = DirectoryLoader(data_path, glob="*.md")
    return markdown_loader.load()


def split_text(
    documents: List[Document],
    chunk_size: int = 300,
    chunk_overlap: int = 100,
) -> List[Document]:
    """
    Split documents into chunks.

    Args:
        documents: A list of Document objects.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Overlap between chunks, passed to the splitter unchanged.

    Returns:
        chunks: A list of Document objects, one per chunk. Each chunk's
        metadata carries a ``start_index`` entry because
        ``add_start_index=True`` is requested from the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )

    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    return chunks


def add_documents(docs: Optional[List[str]], ids=None, metadatas=None):
    """
    Add documents to the Chroma collection.

    Args:
        docs: List of document texts to store. ``None`` or an empty list
            makes the call a no-op.
        ids: Optional list of ids, one per document. When omitted, ids
            ``doc_0`` .. ``doc_{n-1}`` are generated. Note that the generated
            ids restart at ``doc_0`` on every call, so a second call without
            explicit ids reuses the same ids.
        metadatas: Optional list of metadata dicts, one per document. When
            omitted, the documents are stored without metadata.

    Returns:
        None.
    """
    if not docs:
        # Nothing to store; avoids calling len() on None and sending an
        # empty batch to the collection.
        return

    ids = ids or [f"doc_{i}" for i in range(len(docs))]
    # Pass None instead of a list of empty dicts when no metadata is given:
    # Chroma's metadata validation rejects empty dicts.
    collection.add(documents=docs, ids=ids, metadatas=metadatas or None)
    

def retrieve_documents(query: str, k: int = 3) -> List[str]:
    """
    Retrieve the document texts closest to a query from the Chroma collection.

    Args:
        query: The text to search the collection with.
        k: The number of neighbors to request for the query.

    Returns:
        The list of document texts returned for the query, or an empty list
        when the result carries no documents. The raw query result is also
        printed.
    """
    results = collection.query(query_texts=[query], n_results=k)
    print('\nResults:\n', results)
    # The result holds one inner list per query text; a single query is sent,
    # so only the first inner list is relevant.
    per_query_documents = results.get('documents') or [[]]
    return per_query_documents[0]


def gpt4o_text(prompt: str):
    """
    Send a single user message to the gpt-4o chat model.

    Args:
        prompt: Text sent as the only user message of the conversation.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }
    request_options = dict(
        model="gpt-4o",  # https://github.com/marketplace/models
        temperature=1,
        max_tokens=4096,
        top_p=1,
    )

    completion = client.chat.completions.create(
        messages=[user_message],
        **request_options,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def run_rag_pipeline(query):
    """
    Answer a question with retrieved context (RAG).

    Fetches the stored texts closest to the query, joins them with newlines,
    embeds them together with the question in a prompt, prints that prompt
    and sends it to the model.

    Args:
        query: Question used both for the similarity search and in the prompt.

    Returns:
        answer: Whatever gpt4o_text returns for the assembled prompt.
    """
    retrieved_chunks = retrieve_documents(query)
    context = "\n".join(retrieved_chunks)

    # The template is left-aligned on purpose: any indentation inside the
    # triple-quoted string would become part of the prompt text.
    prompt = f"""Use the following context to answer the question.

Context:
{context}

Question:
{query}

Answer:"""

    print('\nPrompt:\n', prompt)
    answer = gpt4o_text(prompt)
    return answer

In [23]:
# Load the Markdown sources and cut them into chunks.
documents = load_documents()
chunks = split_text(documents)
print('\nChunk sample:\n', chunks[0])

# The collection takes texts and metadata as two parallel lists, so each
# chunk is separated into its text and its metadata dict.
contents = [piece.page_content for piece in chunks]
metadatas = [piece.metadata for piece in chunks]

print('\nContent sample:\n', contents[0])
print('\nMetadata sample:\n', metadatas[0])

# Store every chunk; ids are generated by add_documents.
add_documents(contents, metadatas=metadatas)

Split 1 documents into 818 chunks.

Chunk sample:
 page_content='The Project Gutenberg eBook of Alice's Adventures in Wonderland' metadata={'source': 'data/alice_in_worderland.md', 'start_index': 0}

Content sample:
 The Project Gutenberg eBook of Alice's Adventures in Wonderland

Metadata sample:
 {'source': 'data/alice_in_worderland.md', 'start_index': 0}


In [None]:
# Question to ask; run_rag_pipeline uses it for the similarity search and
# also inserts it into the prompt.
query = "How does the Project Gutenberg is financed?"
print('\nQuery:\n', query)

# Retrieve context, build the prompt, call the model and show its answer.
answer = run_rag_pipeline(query)
print("Answer:", answer)


Query:
 How does the Project Gutenberg is financed?

Results:
 {'ids': [['doc_796', 'doc_733', 'doc_797']], 'embeddings': None, 'documents': [['Section 2. Information about the Mission of Project Gutenberg™', 'To protect the Project Gutenberg™ mission of promoting the free distribution of electronic works, by using or distributing this work (or any other work associated in any way with the phrase “Project Gutenberg”), you agree to comply with all the terms of the Full Project Gutenberg™ License available', 'Project Gutenberg™ is synonymous with the free distribution of electronic works in formats readable by the widest variety of computers including obsolete, old, middle-aged and new computers. It exists because of the efforts of hundreds of volunteers and donations from people in all walks of life.']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[{'source': 'data/alice_in_worderland.md', 'start_index': 158216}, {'source': 'data/alice