# Setup

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Pinecone as PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from pinecone import Pinecone
import yaml
import os
import zipfile

In [None]:
# Load the API credentials from the local YAML file and export them as
# environment variables (PINECONE_API_KEY and OPENAI_API_KEY).
with open('config.yaml', 'r', encoding='utf-8') as config_file:
    # An empty YAML file parses to None; fall back to an empty mapping so the
    # validation below reports the missing key instead of a TypeError.
    config = yaml.safe_load(config_file) or {}

for key_name in ('PINECONE_API_KEY', 'OPENAI_API_KEY'):
    key_value = config.get(key_name)
    if not key_value:
        # Fail early with the offending key name rather than with the
        # "str expected" error os.environ raises for a None value.
        raise KeyError(f"'{key_name}' is missing or empty in config.yaml")
    os.environ[key_name] = str(key_value)

# Busca Semântica

In [None]:
# Unpack the archive of source documents and load every PDF found directly
# inside the extraction folder into `documents`.
zip_file_path = 'documentos.zip'
extracted_folder_path = 'docs'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder_path)

documents = []
# os.listdir returns entries in arbitrary order; sorting keeps the document
# (and therefore chunk) order reproducible between runs.
for filename in sorted(os.listdir(extracted_folder_path)):
    # Compare the extension case-insensitively so files named "*.PDF" are
    # not silently skipped.
    if filename.lower().endswith('.pdf'):
        file_path = os.path.join(extracted_folder_path, filename)
        loader = PyMuPDFLoader(file_path)
        documents.extend(loader.load())

In [None]:
# Split the loaded pages into overlapping chunks of at most 1000 characters
# (measured with len) with 100 characters of overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
# split_documents carries each source document's metadata over to the chunks
# derived from it; building chunks from page_content alone would discard it
# and make retrieved passages untraceable to their source.
chunks = text_splitter.split_documents(documents)

In [None]:
# OpenAI embedding client; the model name selects which embedding model is
# called.
embeddings = OpenAIEmbeddings(
    model='text-embedding-ada-002',
)

In [None]:
import hashlib

# Name of the Pinecone index that receives the chunk vectors.
index_name = 'llm'

# Give every chunk a deterministic ID derived from its position and text.
# Without explicit IDs the store assigns fresh random ones on every call, so
# re-running this cell would upsert the same chunks again as duplicates;
# stable IDs make a re-run overwrite the existing vectors instead.
chunk_ids = [
    hashlib.sha256(f'{position}\x00{chunk.page_content}'.encode('utf-8')).hexdigest()
    for position, chunk in enumerate(chunks)
]

# Embed the chunks and upsert them into the index.
vector_store = PineconeVectorStore.from_documents(
    chunks,
    embeddings,
    index_name=index_name,
    ids=chunk_ids,
)

In [None]:
# Every question starts with the same instruction telling the model to answer
# only from the supplied input; the question-specific part is appended to it.
instruction = 'Responda apenas com base no input fornecido. '

query_1 = instruction + '''Qual o número do processo que trata de Violação
de normas ambientais pela Empresa de Construção?'''
query_2 = instruction + 'Qual foi a decisão no caso de fraude financeira?'
query_3 = instruction + 'Quais foram as alegações no caso de negligência médica?'
# contract dispute
query_4 = instruction + 'Quais foram as alegações no caso de Número do Processo: 822162'

In [None]:
# Chat model client configured with the gpt-3.5-turbo model and a sampling
# temperature of 0.2.
llm = ChatOpenAI(
    temperature=0.2,
    model='gpt-3.5-turbo',
)

In [None]:
# Retriever over the vector store: similarity search with k set to 3.
retriever = vector_store.as_retriever(
    search_kwargs={'k': 3},
    search_type='similarity',
)

In [None]:
# Question-answering chain built from the retriever and the chat model,
# using the 'stuff' chain type.
chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type='stuff',
    llm=llm,
)

In [None]:
# Show the chain's printed representation to inspect how it was configured.
print(chain)

In [None]:
# Send the four questions through the chain, in order, keeping one result
# per question.
answer_1, answer_2, answer_3, answer_4 = [
    chain.invoke(question)
    for question in (query_1, query_2, query_3, query_4)
]

In [None]:
# Print each question followed by its answer. Every answer except the last
# is followed by an extra '\n' argument that separates it from the next pair.
all_answers = (answer_1, answer_2, answer_3, answer_4)
for position, answer in enumerate(all_answers, start=1):
    print('Pergunta: ', answer['query'])
    if position < len(all_answers):
        print('Resultado: ', answer['result'], '\n')
    else:
        print('Resultado: ', answer['result'])