In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel


from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank

In [2]:
import os
import dotenv

In [3]:
# Load the `dotenv` IPython extension so that the `%dotenv` magic is available.
%load_ext dotenv
# Reload the extension as well, so re-running this cell refreshes it.
%reload_ext dotenv
# Invoke the magic with no arguments; presumably this reads a local `.env`
# file into the process environment (e.g. API keys) — TODO confirm.
%dotenv

In [4]:
# Load the OpenAI models: one for embeddings, one for chat.

# Embedding model handed to the vector store further down.
embeddings_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

# Chat model; every completion is capped at 300 tokens.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    max_tokens=300,
)

In [5]:
# Load the PDF as a list of Documents (PyPDFLoader yields one per page).
pdf_link = './doc/projeto_de_lei_inteligencia_artificial.pdf'

loader = PyPDFLoader(pdf_link, extract_images=False)

# Use `load()` instead of `load_and_split()`: the latter silently runs
# LangChain's *default* text splitter over every page, so the explicitly
# configured splitter in the next cell would operate on pre-split fragments
# (its overlap setting would be bypassed and `start_index` would no longer be
# relative to the page). `load_and_split` is also documented as deprecated.
pages = loader.load()

In [6]:
# Configure the chunker: pieces of up to 4000 characters (measured with
# `len`), 20 characters of overlap, and the start offset of every chunk
# recorded in its metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=20,
    length_function=len,
    add_start_index=True,
)

# Split the loaded documents into chunks.
chunk = text_splitter.split_documents(pages)

In [7]:
# Chroma vector store bound to the "naive" persistence directory, using the
# OpenAI embedding model defined above.
vectordb = Chroma(
    embedding_function=embeddings_model,
    persist_directory="naive",
)

  vectordb = Chroma(embedding_function=embeddings_model, persist_directory="naive")


In [8]:
# Expose the vector store as a retriever that returns the 10 nearest
# chunks for each query.
naive_retreiver = vectordb.as_retriever(
    search_kwargs={"k": 10},
)

In [9]:
# Cohere reranker that keeps the 3 best candidates.
# The multilingual model is used instead of `rerank-english-v2.0` because the
# source document is in Portuguese; an English-only reranker is not meant to
# score non-English text.
# NOTE(review): same model generation kept on purpose — confirm which
# multilingual rerank release is enabled for your Cohere account.
rerank = CohereRerank(model="rerank-multilingual-v2.0", top_n=3)

# Two-stage retrieval: the naive retriever fetches candidates from the vector
# store, then the reranker reorders them and trims the list to `top_n`.
compressor_retriever = ContextualCompressionRetriever(
    base_compressor=rerank,
    base_retriever=naive_retreiver,
)