In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains.question_answering import load_qa_chain

In [12]:
import os
import dotenv

In [14]:
# Load the `dotenv` IPython extension and run its `%dotenv` line magic with no
# arguments.
# NOTE(review): presumably this reads a local .env file so the OpenAI API key
# is available to the model clients created below - confirm the file exists.
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [15]:
# Instantiate the models (embeddings and LLM)

# Chat model settings kept as named constants so they are easy to tune.
LLM_MODEL_NAME = "gpt-3.5-turbo"
LLM_MAX_TOKENS = 200

# Embedding model used both when indexing chunks and when querying the store.
embeddings_model = OpenAIEmbeddings()

# Chat model used by the question-answering chain.
llm = ChatOpenAI(model_name=LLM_MODEL_NAME, max_tokens=LLM_MAX_TOKENS)

In [None]:
# Load the PDF

pdf_link = "document/regulamento_dos_estagios_em_sistemas_de_Informacao_2022.pdf"

loader = PyPDFLoader(pdf_link, extract_images=False)

# Use load() rather than load_and_split(): load_and_split() would already chunk
# the pages with a default text splitter, so the explicitly configured splitter
# applied afterwards would operate on pre-chunked text (its chunk_size /
# chunk_overlap / start_index settings would not refer to the original pages).
# Chunking is done exactly once, in the dedicated splitting step.
pages = loader.load()

In [None]:
# Split into chunks (pieces of the document)

# Splitter configuration: chunk length is measured with len(), i.e. in
# characters; add_start_index asks the splitter to record where each chunk
# starts in the document it was cut from.
splitter_options = dict(
    chunk_size=4000,
    chunk_overlap=20,
    length_function=len,
    add_start_index=True,
)
text_spliter = RecursiveCharacterTextSplitter(**splitter_options)

# `pages` comes from the PDF loading cell.
chunks = text_spliter.split_documents(pages)

In [None]:
# Save to the vector DB
#
# Give every chunk a deterministic id (source path + position in `chunks`).
# Without explicit ids the store generates random ones, so every re-run of this
# cell would append another full copy of the document to the persisted
# "text_index" directory and the retriever would start returning duplicated
# chunks. With stable ids, re-running the cell overwrites the same entries.
# NOTE(review): entries written by earlier runs (random ids) are not removed;
# delete the "text_index" directory once to start from a clean index.
chunk_ids = [
    f"{chunk.metadata.get('source', 'document')}-{position}"
    for position, chunk in enumerate(chunks)
]

db = Chroma.from_documents(
    chunks,
    embedding=embeddings_model,
    ids=chunk_ids,
    persist_directory="text_index",
)
db.persist()

In [None]:
# Load the DB: reopen the vector store persisted on disk
index_directory = "text_index"
vectordb = Chroma(embedding_function=embeddings_model, persist_directory=index_directory)

# Load the retriever: search is configured with k=3
retriever_options = {"search_kwargs": {"k": 3}}
retriever = vectordb.as_retriever(**retriever_options)

# Chain - build the prompt chain used to call the LLM ("stuff" chain type)
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
def ask(question):
    """Answer a question using documents retrieved from the vector store.

    Relies on the notebook-level `retriever` and `chain` objects.

    Args:
        question: The user's question as plain text.

    Returns:
        A tuple ``(answer, context)`` where ``answer`` is the chain's
        ``"output_text"`` value and ``context`` is the list of documents
        the retriever returned for the question.
    """
    # `.invoke` is the supported entry point; `get_relevant_documents` and
    # calling the chain object directly are deprecated and emit warnings.
    context = retriever.invoke(question)
    result = chain.invoke(
        {"input_documents": context, "question": question},
        return_only_outputs=True,
    )
    return result["output_text"], context

In [None]:
# Read a question from the user, run it through ask(), and print the answer.
# `context` (the retrieved documents) is kept in the notebook namespace but is
# not displayed here.
user_question = input("User: ")
answer, context = ask(user_question)
print("Answer: ", answer)