## Educational RAG

### A question answering agent based on some UNESCO publications

This project use RAG (Retrieval Augmented Generation) to ensure the assistant accuracy.

In [1]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [2]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document

In [3]:
selected_model = "gpt-4o-mini"
db_name = "../data/educational_db"
knowledge_base = "../data/knowledge-base/*"

In [4]:
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'key-if-not-using-dotenv')

In [5]:
folders = glob.glob(knowledge_base)

def add_metadata(doc, doc_type):
    doc.metadata["doc_type"] = doc_type
    return doc

text_loader_kwargs = {'encoding': 'utf-8'}
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    documents.extend([add_metadata(doc, doc_type) for doc in folder_docs])

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")

Created a chunk of size 1088, which is longer than the specified 1000


Total number of chunks: 123
Document types found: {'contracts', 'products', 'employees', 'company'}


In [6]:
embeddings = OpenAIEmbeddings()

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 123 documents


In [7]:
collection = vectorstore._collection
count = collection.count()

sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 123 vectors with 1,536 dimensions in the vector store


In [8]:
query = "Please explain what Insurellm is in a couple of sentences"
retrieved_docs = vectorstore.similarity_search(query)
print(retrieved_docs)

[Document(id='8ac98bc4-413a-4935-878d-468f9b52a84d', metadata={'doc_type': 'company', 'source': '../data/knowledge-base/company/about.md'}, page_content="# About Insurellm\n\nInsurellm was founded by Avery Lancaster in 2015 as an insurance tech startup designed to disrupt an industry in need of innovative products. It's first product was Markellm, the marketplace connecting consumers with insurance providers.\nIt rapidly expanded, adding new products and clients, reaching 200 emmployees by 2024 with 12 offices across the US."), Document(id='a4a86c7a-38ce-48b1-9f66-57d62032765c', metadata={'source': '../data/knowledge-base/company/overview.md', 'doc_type': 'company'}, page_content='# Overview of Insurellm\n\nInsurellm is an innovative insurance tech firm with 200 employees across the US.\nInsurellm offers 4 insurance software products:\n- Carllm, a portal for auto insurance companies\n- Homellm, a portal for home insurance companies\n- Rellm, an enterprise platform for the reinsurance s

## Visualizing the Vector Store

Let's take a minute to look at the documents and their embedding vectors to see what's going on.

In [None]:
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]
colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]

We humans find it easier to visalize things in 2D!
Reduce the dimensionality of the vectors to 2D using t-SNE
(t-distributed stochastic neighbor embedding)

In [None]:
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

## Setup LangChain chat

In [14]:
from langchain.chat_models import init_chat_model
from langgraph.checkpoint.memory import InMemorySaver  
from langchain.tools import tool
from langchain.agents import create_agent

model = init_chat_model(selected_model, temperature=0.7)

@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
    """Retrieve information to help answer a query."""
    retrieved_docs = vectorstore.similarity_search(query,k=2)    
    serialized = "\n\n".join(
        (f"metadata: {doc.metadata}\npage_content: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

tools = [retrieve_context]

system_prompt = """
   You are an expert in answering accurate questions about Insurellm, the Insurance Tech company.
   Give brief, accurate answers. If you don't know the answer, say so.
   Do not make anything up if you haven't been provided with relevant context.
 """ 

memory = InMemorySaver()

agent = create_agent(
    model=model,
    checkpointer=memory,
    tools=tools,
    system_prompt=system_prompt,
)

In [15]:
config = {"configurable": {"thread_id": "1"}}
query = "Please explain what Insurellm is in a couple of sentences"
result = result = agent.invoke({"question": query}, config)
result["messages"][-1].pretty_print()


Insurellm is an insurance tech startup founded by Avery Lancaster in 2015, aimed at disrupting the insurance industry with innovative products. Its first product, Markellm, connects consumers with insurance providers. By 2024, the company had expanded to 200 employees and 12 offices across the U.S. Additionally, Insurellm offers Homellm, which focuses on transforming home insurance with innovation and reliability.


In [None]:
# def chat(question, history):
#     result = agent.invoke({"question": question}, config)
#     return result["messages"][-1].pretty_print()
def chat(message, history):
    result = agent.invoke({
        "messages": [{"role": "user", "content": message}]
    },config)
    return result["messages"][-1].content

SyntaxError: ':' expected after dictionary key (283846273.py, line 6)

## Setup Gradio using the Chat interface

In [17]:
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


Traceback (most recent call last):
  File "/home/s676922906/projetos/educational_rag/.venv/lib/python3.11/site-packages/gradio/queueing.py", line 759, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/s676922906/projetos/educational_rag/.venv/lib/python3.11/site-packages/gradio/route_utils.py", line 354, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/s676922906/projetos/educational_rag/.venv/lib/python3.11/site-packages/gradio/blocks.py", line 2116, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/s676922906/projetos/educational_rag/.venv/lib/python3.11/site-packages/gradio/blocks.py", line 1621, in call_function
    prediction = await fn(*processed_input)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/s676922906/projetos/educational_rag/.venv/lib/pyth