In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader

from langchain_core.prompts import ChatPromptTemplate

from langchain_core.runnables import RunnablePassthrough, RunnableParallel

from langchain_core.output_parsers import StrOutputParser

from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

In [2]:
import os
import dotenv

In [4]:
# Load the `dotenv` IPython extension, reload it, then run its %dotenv magic.
# NOTE(review): presumably this reads a local .env file that supplies the
# OpenAI API key used by the cells below — confirm.
%load_ext dotenv
%reload_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [5]:
# OpenAI clients shared by the rest of the notebook.
# NOTE(review): both presumably pick up credentials from the environment — confirm.

# Chat model that writes the final answer; each completion is capped at 500 tokens.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",  # `model` is the alias of the `model_name` field
    max_tokens=500,
)

# Embedding model handed to the vector store as its embedding function.
embedding = OpenAIEmbeddings(model="text-embedding-3-small")


In [7]:
# Load the PDF
# The path is relative to the notebook's working directory.
pdf_link = "./doc/projeto_de_lei_inteligencia_artificial.pdf"

# Image extraction is turned off for this loader.
loader = PyPDFLoader(file_path=pdf_link, extract_images=False)

# `load_and_split` returns the PDF content as a list of documents.
pages = loader.load_and_split()

In [8]:
# Number of documents returned by loader.load_and_split().
len(pages)

33

In [9]:
# Splitters for the parent/child chunk hierarchy.

# Child chunks: short pieces of text (up to 200 characters).
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    # Set explicitly: the library default overlap is 200, which would equal
    # chunk_size here and make consecutive child chunks near-duplicates of
    # each other (more embedding calls, redundant search hits).
    chunk_overlap=20,
)

# Parent chunks: larger passages (up to 4000 characters, 200 overlapping).
# `add_start_index=True` records each chunk's start offset in its metadata.
parent_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=200,
    length_function=len,
    add_start_index=True,
)

In [10]:
# Storage

# Vector store built on the `embedding` function; its data is persisted under
# the local 'childVectorDB' directory.
vectorstore = Chroma(
    embedding_function=embedding,
    persist_directory='childVectorDB',
)

# In-memory store: its contents do not survive a kernel restart.
store = InMemoryStore()

  vectorstore = Chroma(embedding_function=embedding, persist_directory='childVectorDB')


In [None]:
# Build the parent-document retriever from the vector store, the docstore and
# the two splitters defined above.
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# The vector store is persisted on disk while the docstore is in memory only.
# After a kernel restart (or a re-run of this cell) the directory still holds
# child vectors from earlier runs whose parent documents no longer exist, and
# indexing again would add duplicates on top of them. Remove the leftovers
# first so this cell is idempotent under Restart & Run All.
stale_child_ids = parent_document_retriever.vectorstore.get()["ids"]
if stale_child_ids:
    parent_document_retriever.vectorstore.delete(ids=stale_child_ids)

# Split `pages` and index them; ids=None leaves id generation to the retriever.
parent_document_retriever.add_documents(pages, ids=None)

In [None]:
# Inspect what the retriever's vector store currently holds; as the cell's
# last expression, the result is displayed.
# NOTE(review): called without arguments this presumably returns every stored
# entry, so the output can be large — consider limiting it.
parent_document_retriever.vectorstore.get()

In [None]:
# Prompt template for the RAG chain. It has two placeholders: {question} (the
# user's query) and {context} (the retrieved material).
# The prompt text is intentionally in Portuguese; it is sent to the model as-is.
# Fixes: the literal used to open with four quotes, which put a stray '"' at
# the start of the prompt, and the first sentence had a typo ("Voceé") and a
# comma where the sentence ends.
TEMPLATE = """
    Você é um especialista em legislação e tecnologia. Responda a pergunta abaixo utilizando o contexto informado.
    Query:
    {question}

    Context:
    {context}
"""

In [None]:
# Turn the raw TEMPLATE string into a chat prompt.
rag_prompt = ChatPromptTemplate.from_template(template=TEMPLATE)

In [None]:
# Final stage of the chain: string output parser.
output_parser = StrOutputParser()

# First stage of the chain: produces the two keys the prompt expects.
#   question -> the chain input, passed through unchanged
#   context  -> the output of the parent-document retriever for that input
# NOTE(review): the retriever output presumably reaches the prompt as a list of
# Document objects, not as joined text — confirm that this is intended.
setup_retrieval = RunnableParallel(
    question=RunnablePassthrough(),
    context=parent_document_retriever,
)

In [None]:
# Full RAG pipeline, composed left to right with the `|` operator.
# (The variable name keeps its original spelling so later references still work.)
parent_chain_retrival = (
    setup_retrieval    # builds {"question", "context"}
    | rag_prompt       # fills the prompt template
    | llm              # chat model call
    | output_parser    # string output parser
)