In [1]:
import re
import json
import html
import numpy as np
from tqdm.auto import tqdm

def load_jsonl(file: str) -> list:
  """Read a JSON Lines file and return its records as a list.

  Each non-blank line is parsed with ``json.loads``. Blank or
  whitespace-only lines are skipped, so the result is the same whether or
  not the file ends with a newline.

  Args:
    file: Path of the ``.jsonl`` file to read.

  Returns:
    A list with one parsed JSON value per non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the handle even if parsing fails. The encoding
  # is explicit so the result does not depend on the machine's locale.
  with open(file, encoding="utf-8") as handle:
    # Iterating the handle splits on line endings only, so unusual
    # characters inside a JSON string cannot break a record apart.
    return [json.loads(line) for line in handle if line.strip()]

def unique_by(lista, key_func=None):
    """Return the items of ``lista`` with duplicates removed, keeping order.

    Two items are duplicates when they produce the same key. The first item
    seen for a key is kept; later ones are discarded.

    Args:
        lista: Iterable of items. With the default key, each item must
            provide ``.items()`` (e.g. a dict) and its values must be
            hashable.
        key_func: Optional callable mapping an item to a hashable key. When
            omitted, the key is the item's ``(key, value)`` pairs, sorted and
            packed into a tuple, so dicts with equal content compare equal
            regardless of insertion order.

    Returns:
        A new list holding the first item seen for each distinct key.
    """
    def whole_item_key(item):
        return tuple(sorted(item.items()))

    make_key = whole_item_key if key_func is None else key_func

    # Dicts preserve insertion order and setdefault never overwrites, so this
    # keeps exactly the first item per key, in first-seen order.
    first_by_key = {}
    for item in lista:
        first_by_key.setdefault(make_key(item), item)

    return list(first_by_key.values())

def normalize_text(s: str) -> str:
    """Turn a raw text field into a single-line, whitespace-collapsed string.

    HTML entities are decoded, carriage returns and newlines become spaces,
    leading and trailing whitespace is removed, and every remaining run of
    whitespace is collapsed to one space.

    Args:
        s: The text to normalize, or ``None``.

    Returns:
        The normalized text; the empty string when ``s`` is ``None``.
    """
    if s is None:
        return ""

    decoded = html.unescape(s)
    for line_break in ("\r", "\n"):
        decoded = decoded.replace(line_break, " ")

    # Strip first, then collapse: same order of operations as before.
    return re.sub(r"\s+", " ", decoded.strip())

In [4]:
!gdown "1Yym-ECdgWEezSlEN8yYPvv2bjZjGbcpk"
actions = load_jsonl("/content/pl_actions.jsonl")
actions = unique_by(actions)

ementa2action = {}

for row in actions:
  ementa2action[row['ementa']] = row['acao']

len(ementa2action.keys())

Downloading...
From: https://drive.google.com/uc?id=1Yym-ECdgWEezSlEN8yYPvv2bjZjGbcpk
To: /content/pl_actions.jsonl
100% 66.0M/66.0M [00:00<00:00, 212MB/s]


217938

In [13]:
!gdown "1rqIzHPwR37YnxuNzw6UCMNFrv65kHir8"
metadata = load_jsonl("/content/pl.jsonl")
metadata = [row for row in metadata if row['ementa']]
metadata = unique_by(metadata, key_func=lambda x: (x['municipio'], x['uf'], x['ementa']))

for row in tqdm(metadata):
  row['acao'] = ementa2action[normalize_text(row['ementa'])]

len(metadata)

Downloading...
From (original): https://drive.google.com/uc?id=1rqIzHPwR37YnxuNzw6UCMNFrv65kHir8
From (redirected): https://drive.google.com/uc?id=1rqIzHPwR37YnxuNzw6UCMNFrv65kHir8&confirm=t&uuid=22edba3b-8062-425f-a485-7f7e63eac862
To: /content/pl.jsonl
100% 140M/140M [00:00<00:00, 214MB/s]


  0%|          | 0/220065 [00:00<?, ?it/s]

220065

In [14]:
from sentence_transformers import SentenceTransformer
import numpy as np

# One 'acao' text per metadata row, in metadata order, so embeddings[i]
# corresponds to metadata[i].
actions = [entry['acao'] for entry in metadata]
model = SentenceTransformer("embaas/sentence-transformers-multilingual-e5-base")

# Encode every text and store the vectors as float16.
embeddings = np.asarray(
    model.encode(
        actions,
        batch_size=256,
        show_progress_bar=True,
        normalize_embeddings=True,  # very important for using cosine distance
    ),
    dtype=np.float16,
)
print(embeddings.shape)  # the recorded run printed (220065, 768)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/860 [00:00<?, ?it/s]

(220065, 768)


In [15]:
# Attach each embedding to the metadata row at the same position.
#
# Pairing is purely positional, so the two sequences must be the same length.
# If an earlier cell was re-run on its own, a longer `embeddings` would
# otherwise be attached to the wrong rows without any error.
if len(metadata) != len(embeddings):
  raise ValueError(
      f"metadata has {len(metadata)} rows but embeddings has {len(embeddings)}; "
      "re-run the embedding cell before attaching vectors"
  )

for idx, row in enumerate(metadata):
  row['embedding'] = embeddings[idx]

In [16]:
# Write the enriched rows to disk. The path already ends in ".npy", so NumPy
# writes to exactly this file.
# NOTE(review): `metadata` is a list of dicts, so NumPy presumably stores it as
# a pickled object array; reading it back would then need
# np.load(..., allow_pickle=True) — confirm with whatever consumes this file.
np.save("/content/dataset.npy", metadata)

In [17]:
# Copy the saved dataset to a folder under /content/drive/MyDrive.
# NOTE(review): no Drive mount step is visible in this part of the notebook —
# presumably Drive is mounted before this cell runs; confirm.
!cp "/content/dataset.npy" "/content/drive/MyDrive/FAPESP/City Manager/dataset.npy"