In [257]:
import fitz
import nltk
import os
import glob
from pathlib import Path
import spacy
import pprint
from sentence_transformers import SentenceTransformer, util
import time
import numpy as np
from openai import OpenAI
import dotenv
import json
import torch
import tqdm
import argparse

In [211]:
dotenv.load_dotenv()

False

In [212]:
API_KEY = os.getenv("API_KEY")
client = OpenAI(api_key=API_KEY)

In [34]:
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Baixando o modelo 'punkt' do NLTK...")
    nltk.download('punkt')
    nltk.download('punkt_tab')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [196]:
model_name = 'juridics/bertimbau-base-portuguese-sts-scale'
tokenizer_sentencas = nltk.data.load('tokenizers/punkt/portuguese.pickle')

model = SentenceTransformer(model_name)

In [104]:
def processar_strings_longas(array_de_entrada, limite_de_tamanho=10):

    array_de_saida = []

    for texto in array_de_entrada:
        if len(texto) > limite_de_tamanho:
            novas_sentencas = tokenizer_sentencas.tokenize(texto)
            for sentenca in novas_sentencas:
              try:
                saida = [palavra.strip() for palavra in sentenca.split(":") if palavra]
                array_de_saida.extend(saida)
              except:
                array_de_saida.extend(novas_sentencas)

        else:
            array_de_saida.append(texto)

    return array_de_saida

In [44]:
def get_centroid(bbox):
    x0, y0, x1, y1 = bbox
    cx = (x0 + x1) / 2
    cy = (y0 + y1) / 2
    return (round(cx), round(cy))

In [45]:
def find_closest_positions(array, positions_dict, k=5, tried=None):
    if tried is None:
        tried = set()

    clean_positions = {str(key): (float(val[0]), float(val[1]))
                       for key, val in positions_dict.items()}

    results = {}

    keys = np.array(list(clean_positions.keys()))
    coords = np.array(list(clean_positions.values()), dtype=float)

    for name in array:

        if name in tried:
            continue

        try:
            target_key = name
            target = np.array(clean_positions[name], dtype=float)

        except KeyError:
            match = next((key for key in clean_positions if name in key), None)

            if match:
                tried.add(name)

                target_key = match
                target = np.array(clean_positions[match], dtype=float)
            else:
                continue

        deltas = coords - target
        distances = np.sqrt(np.sum(deltas * deltas, axis=1))

        idx = np.argpartition(distances, k)[:k]

        results[target_key] = [
            (str(keys[i]), (float(coords[i][0]), float(coords[i][1])))
            for i in idx
        ]

    return results

### Extraindo todas as frases

In [81]:
def extrair_texto_arquivo(caminho_do_arquivo):
  #Extraindo texto do arquivo
  doc = fitz.open(caminho_do_arquivo)
  page = doc.load_page(0)

  text = page.get_text("text")

  texts = text.split("\n")

  texts = [palavra.strip() for palavra in texts if palavra]
  texts = [palavra.lower() for palavra in texts]
  return texts

### String Cleaning

In [99]:
def limpa_strings_simples(texts):
  text_len = [len(text) for text in texts]
  text_len = np.array(text_len)
  avg_size = round(np.mean(text_len))
  texts_clean = processar_strings_longas(texts, avg_size)
  return texts_clean

In [102]:
def map_labels_words(labels_mapped_words, label, texts):
  cache = labels_mapped_words.setdefault(label, set())

  newly_added = []

  for text in texts:
      if text not in cache:
          cache.add(text)
          newly_added.append(text)

  return newly_added

### Pegando as posições das frases

In [111]:
def get_words_pos(caminho_do_arquivo):
  doc = fitz.open(caminho_do_arquivo)
  page = doc.load_page(0)

  data = page.get_text("dict")

  mapped_text_pos = {}

  for  i,block in enumerate(data["blocks"]):
      if data["blocks"][i]["type"] == 0:
          for j, line in enumerate(data["blocks"][i]["lines"]):
              text = data["blocks"][i]["lines"][j]["spans"][0]["text"]
              bbox = data["blocks"][i]["lines"][j]["spans"][0]["bbox"]
              centroid = get_centroid(bbox)
              text = text.strip()
              text = text.lower()
              mapped_text_pos[text] = centroid

  return mapped_text_pos

In [115]:
def calc_embeddings(words: list[str]):
  for word in words:
    word_embedding = model.encode(word, convert_to_tensor=True)
  return word_embedding

In [274]:
def consultar_modelo(key, entrada_prompt, esquema_extracao):
    prompt = f"""
      Você é um organizador de informações.
      Dado o contexto: {key}
      Com o esquema de extração: {esquema_extracao[key]}
      E a informação extraída: {entrada_prompt}

      Identifique se existe uma resposta válida nessa informação.
      - Se existir, retorne apenas no formato: resposta
      - Se não existir, retorne exatamente: null
      """

    response = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "system", "content": "Você é direto, objetivo e não inventa informações."},
            {"role": "user", "content": prompt}
        ]
    )

    return response.choices[0].message.content

In [151]:
def get_embeddings_with_cache(texts, model, cache):
    texts_to_encode = [text for text in texts if text not in cache]
    if texts_to_encode:
        new_embeddings = model.encode(texts_to_encode, convert_to_tensor=True)
        for text, emb in zip(texts_to_encode, new_embeddings):
            cache[text] = emb
    embeddings_list = [cache[text] for text in texts]
    if embeddings_list:
        return torch.stack(embeddings_list)
    else:
        return torch.tensor([])

In [244]:
def calc_cosine_sim(query_embedding, candidate_embeddings, candidates, n_neighbours, mapped_text_pos):
  cosine_scores = util.cos_sim(query_embedding, candidate_embeddings)
  ranked_answers = sorted(cosine_scores[0].tolist(), reverse=True)

  ranked_results = sorted(
      zip(candidates, cosine_scores[0].tolist()),
      key=lambda x: x[1],
      reverse=True
  )
  top_5_results = []
  for candidate, score in ranked_results[:n_neighbours]:
    top_5_results.append(candidate)

  #Respostas mais próximas do alvo
  best_results = find_closest_positions(top_5_results, mapped_text_pos, n_neighbours)
  return best_results

In [251]:
def adicionar(saida_problema, label, pergunta, resposta):
    saida_problema.setdefault(label, {})[pergunta] = resposta

In [265]:
def adicionar(saida_problema, label, arquivo_id, pergunta, resposta):
    doc_dict = saida_problema.setdefault(label, {})
    perguntas_dict = doc_dict.setdefault(arquivo_id, {})
    perguntas_dict[pergunta] = resposta

In [278]:
def processar(input_path: str, output_path: str, k_neighbors: int):

    data_json = {}
    for file in glob.glob(os.path.join(Path(input_path), "*.json")):
      with open(file, encoding='utf-8') as f:
          dados = json.load(f)
          data_json[Path(file).stem] = dados
    labels_mapped_words = {}
    embedding_cache = {}
    saida_problema = {}

    for file in data_json.keys():
        archives = data_json[file]

        candidate_embeddings = torch.tensor([])

        for archive in tqdm.tqdm(archives, desc=f"Processando {file}"):

            file_path = Path(archive["pdf_path"])

            textos_pdf = extrair_texto_arquivo(file_path)
            textos_pdf = limpa_strings_simples(textos_pdf)

            new_texts = map_labels_words(labels_mapped_words, archive["label"], textos_pdf)

            textos_pos = get_words_pos(file_path)

            if len(new_texts) > 0:
                new_embeddings = model.encode(new_texts, convert_to_tensor=True)

                if candidate_embeddings.numel() == 0:
                    candidate_embeddings = new_embeddings
                else:
                    candidate_embeddings = torch.cat((candidate_embeddings, new_embeddings), dim=0)

            esquema_extracao = archive["extraction_schema"]

            for key in tqdm.tqdm(esquema_extracao.keys(), desc=f"Processando Perguntas"):
                found = False

                tensor_key = calc_embeddings([key])
                best_ranked = calc_cosine_sim(tensor_key, candidate_embeddings, new_texts, k_neighbors, textos_pos)

                strings_usadas = set()

                for result in best_ranked.keys():
                  entrada_prompt = best_ranked[result]

                  entrada_filtrada = [item for item in entrada_prompt if item[0] not in strings_usadas]

                  if not entrada_filtrada:
                    continue

                  strings_usadas.update(item[0] for item in entrada_filtrada)


                  saida = consultar_modelo(key, entrada_filtrada, esquema_extracao)

                  if saida != "null":
                      adicionar(saida_problema, archive["label"], archive["pdf_path"], key, saida)
                      found = True
                      break

                if not found:
                    adicionar(saida_problema, archive["label"], archive["pdf_path"], key, None)

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(saida_problema, f, ensure_ascii=False, indent=4)

In [None]:
if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="Processador de documentos JSON com GPT.")
    parser.add_argument("--input", required=True, help="Caminho dos JSONs de entrada.")
    parser.add_argument("--output", required=True, help="Arquivo JSON para salvar os resultados.")
    parser.add_argument("--neighbors", type=int, default=5, help="Número de vizinhos para cosine similarity.")

    args = parser.parse_args()
    processar(args.input, args.output, args.neighbors)

In [279]:
processar("./jsons_path/", "saida.json", 5)

Processando dataset:   0%|          | 0/6 [00:00<?, ?it/s]
Processando Perguntas:   0%|          | 0/8 [00:00<?, ?it/s][A
Processando Perguntas:  12%|█▎        | 1/8 [00:14<01:40, 14.42s/it][A
Processando Perguntas:  25%|██▌       | 2/8 [00:19<00:52,  8.68s/it][A
Processando Perguntas:  38%|███▊      | 3/8 [00:24<00:35,  7.18s/it][A
Processando Perguntas:  50%|█████     | 4/8 [00:36<00:36,  9.00s/it][A
Processando Perguntas:  62%|██████▎   | 5/8 [00:49<00:31, 10.46s/it][A
Processando Perguntas:  75%|███████▌  | 6/8 [01:03<00:23, 11.66s/it][A
Processando Perguntas:  88%|████████▊ | 7/8 [01:22<00:14, 14.08s/it][A
Processando Perguntas: 100%|██████████| 8/8 [01:36<00:00, 12.06s/it]
Processando dataset:  17%|█▋        | 1/6 [01:37<08:09, 97.98s/it]
Processando Perguntas:   0%|          | 0/7 [00:00<?, ?it/s][A
Processando Perguntas:  14%|█▍        | 1/7 [00:06<00:36,  6.17s/it][A
Processando Perguntas:  29%|██▊       | 2/7 [00:11<00:29,  5.90s/it][A
Processando Perguntas:  43%|█