# Educational RAG

## Interface conversacional para recuperação de informações a partir de documentos

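Este projeto usa RAG (Retrieval-Augmented Generation): os trechos mais relevantes dos documentos são recuperados e fornecidos ao modelo, para que as respostas se baseiem nesse conteúdo e sejam mais acuradas.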

## Setup inicial

In [1]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

import importlib
import my_visualizer
importlib.reload(my_visualizer)
from my_visualizer import visualizer2d

In [2]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_ollama import ChatOllama
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain.chat_models import init_chat_model
from langgraph.checkpoint.memory import InMemorySaver  
from langchain.tools import tool
from langchain.agents import create_agent

In [None]:
# Chat model name passed to init_chat_model; the commented lines are alternatives.
#base_model = "gpt-4o-mini"
base_model = "gpt-5-nano"
#base_model = "gpt-oss:20b"  # presumably meant for the commented-out ChatOllama option
# Server URL; in the visible cells it is referenced only by the commented-out ChatOllama setup.
base_url = "http://localhost:11434"
# Temperature given to the chat model.
temperature=0.1
# Hugging Face model name handed to HuggingFaceEmbeddings.
sentence_transformers_model = "sentence-transformers/all-mpnet-base-v2"
# Number of results requested from each similarity search (the `k` argument).
nearest_neighbors = 5
# Directory where Chroma persists the vector store.
db_name = "../data/educational_db"
# Glob pattern; each match is loaded as one document category.
knowledge_base = "../data/knowledge-base/*"

In [4]:
# Load variables from a .env file; override=True asks python-dotenv to let those
# values take precedence over variables that are already set.
load_dotenv(override=True)
# Make sure both keys exist in the environment, using a placeholder when absent.
os.environ.setdefault('OPENAI_API_KEY', 'key-if-not-using-dotenv')
os.environ.setdefault('HF_API_KEY', 'key-if-not-using-dotenv')

## Carga de documentos

In [5]:
# Keep only directories: the "*" pattern also matches plain files, and every entry
# is later handed to DirectoryLoader as a folder. Sorting makes the load order
# stable across runs, since glob returns matches in arbitrary order.
folders = sorted(path for path in glob.glob(knowledge_base) if os.path.isdir(path))

def add_metadata(doc, doc_type):
    """Tag ``doc`` with its category and hand the same object back.

    Args:
        doc: Object exposing a mutable ``metadata`` mapping; modified in place.
        doc_type: Value stored under the ``"doc_type"`` metadata key.

    Returns:
        The ``doc`` that was passed in.
    """
    doc.metadata.update(doc_type=doc_type)
    return doc

# Ask TextLoader to detect each file's encoding instead of assuming one.
text_loader_kwargs = {"autodetect_encoding": True}
# text_loader_kwargs = {'encoding': 'utf-8'}

documents = []
for folder in folders:
    # The folder's own name becomes the category stored on each document.
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    folder_docs = loader.load()
    documents.extend(add_metadata(doc, doc_type) for doc in folder_docs)

print(f"Total de documentos carregados: {len(documents)}")


Total de documentos carregados: 31


## Configuração de chunks

In [6]:
# Split the loaded documents into overlapping chunks. chunk_size is a target, not a
# hard cap: the splitter warns when a piece ends up longer than requested.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

# Distinct categories present among the loaded documents.
doc_types_found = {doc.metadata['doc_type'] for doc in documents}

print(f"Total de chunks: {len(chunks)}")
print(f"Tipos de documentos encontrados: {doc_types_found}")

Created a chunk of size 1088, which is longer than the specified 1000


Total de chunks: 123
Tipos de documentos encontrados: {'products', 'employees', 'company', 'contracts'}


## Configuração de embeddings

In [7]:
# Alternative: OpenAI embeddings.
# embeddings = OpenAIEmbeddings()

# Hugging Face embeddings, run on CPU, with normalisation switched off.
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=sentence_transformers_model,
)

print(f"Configuração do embeddings criado:\n {embeddings}")

Configuração do embeddings criado:
 model_name='sentence-transformers/all-mpnet-base-v2' cache_folder=None model_kwargs={'device': 'cpu'} encode_kwargs={'normalize_embeddings': False} query_encode_kwargs={} multi_process=False show_progress=False


## Criação de banco vetorial

In [8]:
# Rebuild from scratch: when the persistence directory already exists, drop the
# collection stored there before inserting the chunks again.
if os.path.exists(db_name):
    Chroma(embedding_function=embeddings, persist_directory=db_name).delete_collection()

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)

# Underlying collection object; also handed to the visualizer in a later cell.
collection = vectorstore._collection
count = collection.count()

# Fetch a single stored vector to report the embedding dimensionality.
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"Banco vetorial criado com {collection.count()} documentos (vetores)")
print(f"Os {count:,} vetores estão organizados em {dimensions:,} dimensões")

Banco vetorial criado com 123 documentos (vetores)
Os 123 vetores estão organizados em 768 dimensões


## Recuperação de dados no banco vetorial

In [9]:
# Query the store directly, before the agent is involved, to inspect what comes back.
query = "Please explain what Insurellm is in a couple of sentences"
retrieved_docs = vectorstore.similarity_search(query, k=nearest_neighbors)
print(retrieved_docs)

[Document(id='9e44d669-3792-4916-9d59-f29c0949b29b', metadata={'source': '../data/knowledge-base/company/overview.md', 'doc_type': 'company'}, page_content='# Overview of Insurellm\n\nInsurellm is an innovative insurance tech firm with 200 employees across the US.\nInsurellm offers 4 insurance software products:\n- Carllm, a portal for auto insurance companies\n- Homellm, a portal for home insurance companies\n- Rellm, an enterprise platform for the reinsurance sector\n- Marketllm, a marketplace for connecting consumers with insurance providers\n  \nInsurellm has more than 300 clients worldwide.'), Document(id='e298a195-bf68-49fd-adeb-38c159f3f5df', metadata={'source': '../data/knowledge-base/products/Homellm.md', 'doc_type': 'products'}, page_content='With Homellm, Insurellm is committed to transforming the landscape of home insurance, ensuring both innovation and reliability for all insurance providers and their customers. Explore the future of home insurance today with Homellm!')]


## Visualização do banco vetorial

In [10]:
# Hand the collection to the project's my_visualizer helper (presumably a 2-D plot of
# the stored vectors, going by its name; the implementation is not shown here).
visualizer2d(collection)

## Configuração do agente

In [16]:
# Earlier setting kept for reference (temperature 0.7).
#model = init_chat_model(base_model, temperature=0.7)
# Chat model built from the configured model name and temperature.
model = init_chat_model(model=base_model, temperature=temperature)
# Alternative: a model served by Ollama at base_url.
# model = ChatOllama(model=base_model, base_url=base_url, temperature=temperature)

# In-memory checkpointer, passed to create_agent below.
memory = InMemorySaver()

@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
    """Retrieve information to help answer a query."""
    # Docstring left verbatim on purpose: @tool presumably exposes it to the model as
    # the tool description, so rewording it could change behavior.
    hits = vectorstore.similarity_search(query, k=nearest_neighbors)
    # One text block per hit (metadata line, then content), blank line between hits.
    rendered = [
        f"metadata: {hit.metadata}\npage_content: {hit.page_content}"
        for hit in hits
    ]
    # Returned as (content, artifact), matching response_format above.
    return "\n\n".join(rendered), hits

# Tools made available to the agent.
tools = [retrieve_context]

# Instructions for the agent: stay factual, be brief, admit when the answer is unknown.
system_prompt = """
   You are an expert in answering only accurate questions about Insurellm, the Insurance Tech company.
   Give brief, accurate answers. If you don't know the answer, say so.
   Do not make anything up if you haven't been provided with relevant context.
 """

# Assemble the agent from the model, the tools, the prompt and the checkpointer.
agent = create_agent(
    model=model,
    tools=tools,
    system_prompt=system_prompt,
    checkpointer=memory,
)

## Acionamento do agente

In [12]:
# Run configuration naming thread "1"; the chat function defined later reuses it.
config = dict(configurable=dict(thread_id="1"))
message = "Please explain what Insurellm is in a couple of sentences"
messages = dict(role="user", content=message)
result = agent.invoke({"messages": [messages]}, config)
# The last message in the result is the agent's answer; shown as the cell output.
result["messages"][-1].content

'Insurellm is a U.S.-based insurtech company with about 200 employees. It offers four software products—Carllm for auto insurers, Homellm for home insurers, Rellm for the reinsurance sector, and Marketllm a marketplace connecting consumers with insurers—and serves over 300 clients worldwide.'

## Criação de função para acionamento do chat

In [13]:
def chat(message, history, request: "gr.Request" = None):
    """Answer one chat turn by running the agent and returning its final reply.

    Args:
        message: Text the user just submitted.
        history: Prior turns supplied by Gradio. Unused here; the agent is built
            with a checkpointer and is invoked with a thread id instead.
        request: Filled in by Gradio when the parameter is annotated as
            ``gr.Request``; its ``session_hash`` identifies the browser session.
            Defaults to ``None`` so ``chat(message, history)`` keeps working.

    Returns:
        The content of the last message in the agent's result.
    """
    # Give each browser session its own thread id. Using the single notebook-level
    # thread for everyone would put every visitor's conversation, and the earlier
    # test invocation, under the same id.
    session_id = getattr(request, "session_hash", None)
    if session_id:
        run_config = {"configurable": {"thread_id": f"gradio-{session_id}"}}
    else:
        # Direct calls (no Gradio request) keep using the notebook-level thread.
        run_config = config
    user_turn = {"role": "user", "content": message}
    result = agent.invoke({"messages": [user_turn]}, run_config)
    return result["messages"][-1].content

## Configuração do Gradio com a interface do chat

In [17]:
# Keep the ChatInterface itself in `view`. launch() returns an (app, local_url,
# share_url) tuple, so chaining it onto the constructor left no handle for calling
# view.close() before re-running this cell.
view = gr.ChatInterface(chat, type="messages")
view.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.
