In [1]:
import sys
import os

# Make the repository root importable so the `src.*` package used by later
# cells can be resolved.
# Assumes the kernel's working directory sits one level below that root — TODO confirm.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Guarded so that re-running this cell does not keep appending the same
# directory to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
from src.rag_agent.utils.extract_text import extract_text_from_svg

# Directory handed to the project's SVG text extractor.
# NOTE(review): this is an absolute path, while the vector store below is
# persisted under the relative "../data/chroma_db" — confirm how
# extract_text_from_svg resolves `directory`.
RAW_SVG_DIR = "/data/raw"

# The result is later passed to the splitter as a single text, so it is
# presumably one string covering all SVG files — verify against the helper.
data_text = extract_text_from_svg(directory=RAW_SVG_DIR)

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters, kept together so they are easy to tune.
CHUNK_SIZE = 300      # upper bound on chunk length (characters with the default length function)
CHUNK_OVERLAP = 10    # amount of text shared between neighbouring chunks

# Split on newlines only, so chunk boundaries always fall on a line break of
# the extracted text.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n"],
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# create_documents takes a list of texts; here there is exactly one.
source_texts = [data_text]
chunks = text_splitter.create_documents(source_texts)

In [4]:
# Sanity check: report the total number of chunks, then show the first three
# with their lengths.
print(f'number of chunks: {len(chunks)}')

for i, chunk in enumerate(chunks[:3]):
    print(f'Chunk {i+1}: len={len(chunk.page_content)}')
    print(chunk.page_content)

number of chunks: 36
Chunk 1: len=140
1º Tesoureiro Luiz Henrique Mendes Costa  Semestre: 2º Gosta de:animesmanhwagamesRPGcibersegurançacuriosidades tecnológicas  henrique@ubuntu
Chunk 2: len=179
1º Secretário João Davi Costa de Souza Semestre: 2º Gosta de:programaçãosoftware livremúsicasistemas operacionais Environment:Hyprland (WM)Kitty (terminal)VS Code (Editor) jd@arch
Chunk 3: len=214
Tesoureiro Geral Yuri Gabriel Cardoso Delgado OBS: desenvolveu o estilo de apresentação Semestre: 2º Gosta de:LinuxFicção CientíficaLow-LevelKPOPTRON Environment:Hyprland (WM)ST (terminal)Neovim (Editor) vanel@arch


In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

import hashlib

# Sentence-transformers model used to embed both the chunks and the queries.
# NOTE(review): the indexed text is Portuguese; this model may not be the best
# fit for it — confirm retrieval quality or consider a multilingual model.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device":"cpu"}
)

# Deterministic ids derived from each chunk's position and content. Without
# explicit ids the store generates random ones, so every re-run of this cell
# would add a second copy of all chunks to the persisted collection and the
# retriever would start returning duplicates. With stable ids a re-run
# overwrites the existing entries instead.
# The position is part of the id so two chunks with identical text do not
# collide.
# Entries written by earlier runs (random ids) are not removed by this; delete
# the persist directory once to start from a clean index.
chunk_ids = [
    f"chunk-{i}-{hashlib.sha256(chunk.page_content.encode('utf-8')).hexdigest()[:16]}"
    for i, chunk in enumerate(chunks)
]

vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="../data/chroma_db"
)

# Plain similarity search returning the two closest chunks per query.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k":2}
)

In [6]:
# Retrieval smoke test: run one question through the retriever and print the
# chunks it returns (at most three are shown).
query = "What kind of environment does Yuri use?"
relevant_docs = retriever.invoke(query)

print(f"\n '{query}'")

for i, doc in enumerate(relevant_docs[:3]):
    print(f"Document {i+1} (Context):")
    print(doc.page_content)


 'What kind of environment does Yuri use?'
Document 1 (Context):
Vice-Presidente Lucas Santos Diniz Semestre: 2º Gosta de:TecnologiaEstudarSériesFilmesJogosDesenhos Environment:Hyprland (WM)Kitty (terminal)VS Code (Editor) zost@arch
Document 2 (Context):
Tesoureiro Geral Yuri Gabriel Cardoso Delgado OBS: desenvolveu o estilo de apresentação Semestre: 2º Gosta de:LinuxFicção CientíficaLow-LevelKPOPTRON Environment:Hyprland (WM)ST (terminal)Neovim (Editor) vanel@arch


In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain

from langchain_ollama import OllamaLLM
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

# Load variables from a local .env file so the API key is available through
# the environment instead of being written into the notebook.
load_dotenv()
gemini_api_key = os.getenv("GEMINI_API_KEY")

# Chat model that produces the final answer.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=gemini_api_key,
)

# Local alternative, kept for reference:
#llm = OllamaLLM(model="llama3")

# Prompt text: {context} receives the retrieved chunks and {input} the user
# question.
PROMPT_TEMPLATE = """
You are a helpful assistant. Use the following context to answer the question.
If you don't know the answer, say that you don't have enough information in the context.

Contexto:
{context}

Pergunta: {input}
"""
prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

# Chain that places the retrieved documents into the prompt and calls the LLM.
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: retrieve chunks for the question, then answer from them.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

query = "Tell me What Mateus Like?."
response = rag_chain.invoke({"input": query})

# The chain result is a mapping; the generated text is under "answer".
print(f"\nresponse: {response['answer']}")


response: Mateus likes programming, open source, Hollow Knight, Undertale, and Deltarune.
