In [16]:

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
import os
from dotenv import load_dotenv
import nest_asyncio

load_dotenv()
nest_asyncio.apply()

from llama_index.llms.ollama import Ollama

llm = Ollama(model="llama3", request_timeout=360.0)

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

Settings.chunk_size = 512
Settings.llm = llm
Settings.embed_model = embed_model




RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
def llama_parse_parser():
    if os.getenv("LLAMA_CLOUD_API_KEY") is None:
        raise ValueError(
            "LLAMA_CLOUD_API_KEY environment variable is not set. "
            "Please set it in .env file or in your shell environment then run again!"
        )
    parser = LlamaParse(result_type="text", verbose=True)
    return parser

In [None]:
reader = SimpleDirectoryReader(input_files=["contracts/8ea3b80e-2366-11ef-a2d5-c8cb9e66d08c.pdf"])
parser =  llama_parse_parser()
reader.file_extractor = {".pdf": parser}
documents = reader.load_data()
print(documents)


Started parsing the file under job_id 160e2b19-8625-4340-86f9-2eed8b0fa9b7
[Document(id_='b01faa5d-d984-4914-871e-322f90c23801', embedding=None, metadata={'file_path': 'contracts/8ea3b80e-2366-11ef-a2d5-c8cb9e66d08c.pdf', 'file_name': '8ea3b80e-2366-11ef-a2d5-c8cb9e66d08c.pdf', 'file_type': 'application/pdf', 'file_size': 1325408, 'creation_date': '2024-06-05', 'last_modified_date': '2024-06-05'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='# 20/05/2024 DOCS/IFTM - 0000601657 - EDITAL CIDADE DE SAO PAULO HABITACAO Minuta de Contrato\n\nCONTRATO N. XXX/SEHAB/2023\n\nPROCESSO ELETRÔNICO SEI N. 6014.2022/0003235-8\n\nCONTRATANTE: SECRETARIA MUNICIPAL DE HABITAÇÃO - SEHAB\n\nCONTRATADA: FULANA\n\nOBJETO: CONTRATAÇÃO DE EMPRESA ESPECIALIZADA PARA A REFORM

In [None]:
documents_simple_loader = SimpleDirectoryReader(input_files=["contracts/8ea3b80e-2366-11ef-a2d5-c8cb9e66d08c.pdf"]).load_data()
print(documents_simple_loader)

[Document(id_='ab20712a-3c6b-4cb0-924d-636aabb149dd', embedding=None, metadata={'page_label': '1', 'file_name': '8ea3b80e-2366-11ef-a2d5-c8cb9e66d08c.pdf', 'file_path': 'contracts/8ea3b80e-2366-11ef-a2d5-c8cb9e66d08c.pdf', 'file_type': 'application/pdf', 'file_size': 1325408, 'creation_date': '2024-06-05', 'last_modified_date': '2024-06-05'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='1Minuta de Contrato\nCONTRATO N. XXX/SEHAB/2023\nPROCESSO ELETRÔNICO SEI N. 6014.2022/0003235-8\nCONTRATANTE: SECRETARIA MUNICIPAL DE HABITAÇÃO - SEHAB\nCONTRATADA: FULANA\nOBJETO: CONTRATAÇÃO DE EMPRESA ESPECIALIZADA PARA A REFORMA E MELHORIA DO SISTEMA DE GÁS, DO\nSISTEMA DE PROTEÇÃO CONTRA DESCARGAS ATMOSFÉRICAS (SPDA) E DE COMBATE AO INCÊNDIO, PARA\nOBTENÇÃO DO AUT

In [None]:
from llama_index.core import VectorStoreIndex

index1 = VectorStoreIndex.from_documents(documents)
query_engine = index1.as_query_engine(similarity_top_k=5)
response = query_engine.query("CLAUSULA PRIMEIRA")
print(response)

The first clause of this contract is:

"1. A CONTRATANTE poderá contratar com a CONTRATADA para a execução de obras e serviços que sejam necessários à manutenção e ao aperfeiçoamento do objeto contratual, desde que tal contrato seja celebrado nos termos da Lei Federal n. 8.666/1993."


In [None]:
index2 = VectorStoreIndex.from_documents(documents_simple_loader)
query_engine = index2.as_query_engine(similarity_top_k=5)
response = query_engine.query("CLAUSULA PRIMEIRA")
print(response)

The contract does not have a Cláusula Primeira. The first clause mentioned in the provided text is "CLÁUSULA SÉTIMA - DA EMISSÃO DA ORDEM DE INÍCIO DE SERVIÇOS".


wtf pq ta esquisito, acho que vou ter que usar um nodeParser tb


TESTANDO MARKDWONELEMENTNODEPARSERAQ EMBAIXO


In [None]:
from llama_index.core.node_parser import MarkdownElementNodeParser


parser = MarkdownElementNodeParser()

nodes = parser.get_nodes_from_documents(documents)
base_nodes, objects = parser.get_nodes_and_objects(nodes)
print(base_nodes, objects)
index_markdownNodeParser = VectorStoreIndex(nodes=base_nodes + objects)


from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker,
)

reranker = FlagEmbeddingReranker(
    top_n=5,
    model="BAAI/bge-reranker-large",
    
)

query_engine = index_markdownNodeParser.as_query_engine(similarity_top_k=15, node_postprocessors=[reranker], verbose = True)
response = query_engine.query("CLÁUSULA PRIMEIRA")
print(response)

3it [00:00, 27838.30it/s]
100%|██████████| 3/3 [01:55<00:00, 38.43s/it]


[TextNode(id_='b521a1b8-9a5e-40d3-b176-747621c0d143', embedding=None, metadata={'file_path': 'contracts/8ea3b80e-2366-11ef-a2d5-c8cb9e66d08c.pdf', 'file_name': '8ea3b80e-2366-11ef-a2d5-c8cb9e66d08c.pdf', 'file_type': 'application/pdf', 'file_size': 1325408, 'creation_date': '2024-06-05', 'last_modified_date': '2024-06-05'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b01faa5d-d984-4914-871e-322f90c23801', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'contracts/8ea3b80e-2366-11ef-a2d5-c8cb9e66d08c.pdf', 'file_name': '8ea3b80e-2366-11ef-a2d5-c8cb9e66d08c.pdf', 'file_type': 'application/pdf', 'file_size': 1325408, 'creation_date': '2024-06-05', 'last_modified_date': '2024-06-05'}, hash='0c6

OutOfMemoryError: CUDA out of memory. Tried to allocate 978.00 MiB. GPU 