In [1]:
import json
import random
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
import joblib

# ----------------------------
# 1. Load the JSONL dataset
# ----------------------------
# Relative path; it is resolved against the current working directory when opened.
DATA_PATH = Path("../data/annotations/iphone_auto_annotations.jsonl")

def load_jsonl(path):
    """Read a JSON Lines file into a list of decoded records.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being passed to ``json.loads``,
    which would otherwise raise on them.

    Args:
        path: Location of the UTF-8 encoded file (``str`` or ``pathlib.Path``).

    Returns:
        list: One decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Strip first so that whitespace-only lines become empty and are dropped.
        stripped_lines = (line.strip() for line in f)
        return [json.loads(line) for line in stripped_lines if line]

data = load_jsonl(DATA_PATH)
print(f"Total de exemplos carregados: {len(data)}")

# ----------------------------
# 2. Train/test split
# ----------------------------
# Fixed seed so the split (and therefore the reported metrics) is the same on
# every run of the notebook.
SEED = 42

# Shuffle a copy with a dedicated RNG: `data` keeps its file order and the
# global `random` state is left untouched, so re-running this cell always
# produces the same split.
shuffled_data = list(data)
random.Random(SEED).shuffle(shuffled_data)

split = int(len(shuffled_data) * 0.8)  # first 80% for training, rest for testing
train_data = shuffled_data[:split]
test_data = shuffled_data[split:]

print(f"Treino: {len(train_data)} | Teste: {len(test_data)}")

# ----------------------------
# 3. Prepare the data for SBERT
# ----------------------------
def flatten_dataset(dataset):
    """Turn annotated sentences into parallel token / label lists.

    Each item must provide ``item["text"]`` (a string) and ``item["entities"]``
    (an iterable of ``(start, end, label)`` character spans). The text is split
    on whitespace; a token receives an entity's label only when the token lies
    entirely inside that entity's span, otherwise it keeps the label ``"O"``.
    When several entities contain the same token, the last one listed wins.

    Args:
        dataset: Iterable of annotated items as described above.

    Returns:
        tuple: ``(texts, labels)`` - two lists of equal length holding every
        token of every item, in order, and the label assigned to it.
    """
    all_tokens, all_labels = [], []

    for example in dataset:
        sentence = example["text"]
        entities = example["entities"]
        words = sentence.split()

        # Character span of each word. Searching from the end of the previous
        # word keeps repeated words mapped to their own occurrence.
        spans = []
        cursor = 0
        for word in words:
            begin = sentence.find(word, cursor)
            cursor = begin + len(word)
            spans.append((begin, cursor))

        # Every word starts outside any entity.
        word_labels = ["O" for _ in words]
        for ent_start, ent_end, ent_label in entities:
            # Indices of the words fully contained in this entity span.
            inside = [
                idx
                for idx, (begin, finish) in enumerate(spans)
                if ent_start <= begin and finish <= ent_end
            ]
            for idx in inside:
                word_labels[idx] = ent_label

        all_tokens += words
        all_labels += word_labels

    return all_tokens, all_labels

# Flatten both splits into token lists with their matching per-token labels.
X_train_tokens, y_train = flatten_dataset(train_data)
X_test_tokens, y_test = flatten_dataset(test_data)

print(f"Tokens de treino: {len(X_train_tokens)}")

# ----------------------------
# 4. Generate embeddings with SBERT
# ----------------------------
# The model name is kept in a variable because it is written to disk next to
# the classifier later in this cell.
sbert_model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(sbert_model_name)

# Each token string is encoded on its own (one input per token), so the
# sentence the token came from is not part of the encoder input.
X_train_emb = model.encode(X_train_tokens, convert_to_numpy=True, show_progress_bar=True)
X_test_emb = model.encode(X_test_tokens, convert_to_numpy=True, show_progress_bar=True)

# ----------------------------
# 5. Train the classifier
# ----------------------------
# Logistic regression over the token embeddings, with the per-token labels as
# targets; max_iter=1000 is the iteration limit given to the solver.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_emb, y_train)

# ----------------------------
# 6. Evaluate the model
# ----------------------------
# Predict a label for every test token and print the per-class
# precision / recall / F1 report.
y_pred = clf.predict(X_test_emb)
print("\n📊 Relatório de desempenho:")
print(classification_report(y_test, y_pred))

# ----------------------------
# 7. Save the trained model
# ----------------------------
MODEL_DIR = Path("../models/sbert_iphone_model")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Persist the fitted classifier.
joblib.dump(clf, MODEL_DIR / "logreg_model.pkl")

# Store only the SBERT model *name*; the encoder itself is re-created from
# this name when the classifier is loaded again.
(MODEL_DIR / "sbert_model_name.txt").write_text(sbert_model_name, encoding="utf-8")

print(f" Modelo salvo em: {MODEL_DIR}")

  from .autonotebook import tqdm as notebook_tqdm


Total de exemplos carregados: 336
Treino: 268 | Teste: 68
Tokens de treino: 2393


Batches: 100%|██████████| 75/75 [00:08<00:00,  8.74it/s]
Batches: 100%|██████████| 19/19 [00:01<00:00, 15.60it/s]



📊 Relatório de desempenho:
              precision    recall  f1-score   support

   CATEGORIA       1.00      1.00      1.00        68
         COR       1.00      0.83      0.91        36
       MARCA       0.97      1.00      0.98        62
     MEMORIA       1.00      1.00      1.00        65
      MODELO       0.98      1.00      0.99       109
           O       0.98      0.98      0.98       245

    accuracy                           0.98       585
   macro avg       0.99      0.97      0.98       585
weighted avg       0.98      0.98      0.98       585

 Modelo salvo em: ../models/sbert_iphone_model


In [7]:
import joblib
from pathlib import Path
from sentence_transformers import SentenceTransformer
MODEL_DIR = Path("../models/sbert_iphone_model")

# ----------------------------
# 8. Prediction test (reloaded model)
# ----------------------------
print("\nTeste de predição usando o modelo salvo:")

# Reload the saved artifacts.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code -
# only load model files that come from a trusted source.
clf_loaded = joblib.load(MODEL_DIR / "logreg_model.pkl")
sbert_name = (MODEL_DIR / "sbert_model_name.txt").read_text(encoding="utf-8").strip()
model_loaded = SentenceTransformer(sbert_name)

# Example sentence: split on whitespace, then one embedding and one predicted
# label per token.
sample_text = "iphone 14 Pro 128GB Azul"
tokens = sample_text.split()
embs = model_loaded.encode(tokens)
preds = clf_loaded.predict(embs)

for tok, label in zip(tokens, preds):
    print(f"{tok} → {label}")


Teste de predição usando o modelo salvo:


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


iphone → CATEGORIA
14 → MODELO
Pro → MODELO
128GB → MEMORIA
Azul → COR
