In [None]:
# ============================================================
# 📘 Experimento comparativo: One-hot vs Hashing vs Embeddings vs MLT
# ============================================================

import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras import layers, models




In [None]:
import pandas as pd

# Load the ratings table (expected columns: userId, movieId, rating, timestamp).
ratings = pd.read_csv("ml-20m/ratings.csv")

# Re-index users and movies to dense, zero-based integer ids, numbered in order of
# first appearance in the file.
unique_users = ratings["userId"].unique()
user_encoder = dict(zip(unique_users, range(len(unique_users))))
ratings["userId"] = ratings["userId"].map(user_encoder)

unique_movies = ratings["movieId"].unique()
movie_encoder = dict(zip(unique_movies, range(len(unique_movies))))
ratings["movieId"] = ratings["movieId"].map(movie_encoder)

# Binary target: 1 when the rating is 4 or higher, 0 otherwise.
ratings["label"] = (ratings["rating"] >= 4).astype(int)

# Feature matrix with one row per rating and two columns: (userId, movieId).
X = ratings[["userId", "movieId"]].values
y = ratings["label"].values

# Fixed 80/20 split; random_state pins the shuffle so every encoding sees the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

display(ratings.head())
display(ratings.shape)


In [None]:
# Preview the first rows of `ratings`.
ratings.head()

In [None]:
# ---------------------------
# 2. Funções utilitárias
# ---------------------------

def build_model(input_dim, hidden_dim=64):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Dense(hidden_dim, relu) -> Dense(32, relu) -> Dense(1, sigmoid),
    compiled with the Adam optimizer, binary cross-entropy loss and an accuracy
    metric.

    Args:
        input_dim: Number of input features per example.
        hidden_dim: Width of the first hidden layer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    net = models.Sequential()
    net.add(layers.Input(shape=(input_dim,)))
    net.add(layers.Dense(hidden_dim, activation="relu"))
    net.add(layers.Dense(32, activation="relu"))
    net.add(layers.Dense(1, activation="sigmoid"))
    net.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return net


In [None]:
# ---------------------------
# 3. One-hot encoding (MovieLens: user + movie)
# ---------------------------
# Wrap the raw id pairs in DataFrames so the encoder sees two named columns.
ohe_columns = ["user", "movie"]
X_train_df = pd.DataFrame(X_train, columns=ohe_columns)
X_test_df = pd.DataFrame(X_test, columns=ohe_columns)

print("🔹 One-hot encoding")

# The sparse-output keyword differs between scikit-learn releases: try the newer
# spelling first and fall back to the older one on TypeError.
ohe_options = {"handle_unknown": "ignore"}
try:
    ohe = OneHotEncoder(sparse_output=True, **ohe_options)  # sklearn >= 1.2
except TypeError:
    ohe = OneHotEncoder(sparse=True, **ohe_options)         # sklearn < 1.2

# Ids are cast to strings before encoding; categories are learned from the train split
# only, and ids unseen at fit time are ignored at transform time.
X_train_ohe = ohe.fit_transform(X_train_df.astype(str))
X_test_ohe = ohe.transform(X_test_df.astype(str))

# Classifier on top of the one-hot features.
model = build_model(X_train_ohe.shape[1])

fit_started = time.time()
history = model.fit(X_train_ohe, y_train, epochs=3, batch_size=1024, verbose=1)
train_time = time.time() - fit_started

# Inference time is reported in microseconds per test example (thresholding included).
predict_started = time.time()
test_probs = model.predict(X_test_ohe, verbose=0)
y_pred = (test_probs > 0.5).astype(int)
infer_time = (time.time() - predict_started) / len(X_test) * 1e6

acc = accuracy_score(y_test, y_pred)

# The divisor 3 below is the number of epochs passed to fit() above.
one_hot_results = {
    "Dimensão": X_train_ohe.shape[1],
    "Reversível": "Sim",
    "Parâmetros aprendidos": 0,
    "Tempo treino (s/época)": round(train_time / 3, 2),
    "Tempo inferência (µs)": round(infer_time, 2),
    "Acurácia (%)": round(acc * 100, 2),
}

print("✅ Resultados One-hot:", one_hot_results)


#✅ Resultados One-hot: {'Dimensão': 164320, 'Reversível': 'Sim', 'Parâmetros aprendidos': 0, 'Tempo treino (s/época)': 5029.9, 'Tempo inferência (µs)': 427.03, 'Acurácia (%)': 74.18}



In [None]:
# ---------------------------
# 4. Hashing trick (MovieLens: user + movie)
# ---------------------------
print("🔹 Hashing Trick")


def _hash_tokens(pairs):
    """Turn (user, movie) id pairs into the per-row token lists fed to the hasher."""
    return [[f"user_{u}", f"movie_{m}"] for u, m in pairs]


# Each row becomes two string tokens hashed into a fixed 512-column space.
fh = FeatureHasher(n_features=512, input_type="string")
X_train_hash = fh.transform(_hash_tokens(X_train))
X_test_hash = fh.transform(_hash_tokens(X_test))

# Classifier on top of the hashed features.
model = build_model(X_train_hash.shape[1])

fit_started = time.time()
history = model.fit(X_train_hash, y_train, epochs=3, batch_size=1024, verbose=1)
train_time = time.time() - fit_started

# Inference time is reported in microseconds per test example (thresholding included).
predict_started = time.time()
test_probs = model.predict(X_test_hash, verbose=0)
y_pred = (test_probs > 0.5).astype(int)
infer_time = (time.time() - predict_started) / len(X_test) * 1e6

acc = accuracy_score(y_test, y_pred)

# The divisor 3 below is the number of epochs passed to fit() above.
hashing_results = {
    "Dimensão": X_train_hash.shape[1],
    "Reversível": "Não",
    "Parâmetros aprendidos": 0,
    "Tempo treino (s/época)": round(train_time / 3, 2),
    "Tempo inferência (µs)": round(infer_time, 2),
    "Acurácia (%)": round(acc * 100, 2),
}

print("✅ Resultados Hashing:", hashing_results)


#✅ Resultados Hashing: {'Dimensão': 512, 'Reversível': 'Não', 'Parâmetros aprendidos': 0, 'Tempo treino (s/época)': 144.27, 'Tempo inferência (µs)': 86.74, 'Acurácia (%)': 57.87}


In [None]:
# ---------------------------
# 5. Learned embeddings (MovieLens: user + movie)
# ---------------------------
print("🔹 Embeddings")
embedding_dim = 32

n_users = ratings["userId"].nunique()
n_movies = ratings["movieId"].nunique()


def _embedding_branch(name, vocab_size):
    """Create one id input plus its flattened embedding; returns (input, vector)."""
    id_input = layers.Input(shape=(1,), name=name)
    looked_up = layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name=f"{name}_emb",
    )(id_input)
    return id_input, layers.Flatten()(looked_up)


# One branch per id column; each table has one more row than the number of distinct ids.
user_in, user_emb = _embedding_branch("user", n_users + 1)
movie_in, movie_emb = _embedding_branch("movie", n_movies + 1)

# Concatenate both vectors and classify with the same 64 -> 32 -> 1 head used elsewhere.
x = layers.Concatenate()([user_emb, movie_emb])
for width in (64, 32):
    x = layers.Dense(width, activation="relu")(x)
out = layers.Dense(1, activation="sigmoid")(x)

model = models.Model(inputs=[user_in, movie_in], outputs=out)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Training: inputs are fed by name, matching the Input layer names above.
train_inputs = {"user": X_train[:, 0], "movie": X_train[:, 1]}
fit_started = time.time()
history = model.fit(
    train_inputs,
    y_train,
    epochs=3,
    batch_size=1024,
    verbose=1,
)
train_time = time.time() - fit_started

# Inference time is reported in microseconds per test example (thresholding included).
test_inputs = {"user": X_test[:, 0], "movie": X_test[:, 1]}
predict_started = time.time()
test_probs = model.predict(test_inputs, verbose=0)
y_pred = (test_probs > 0.5).astype(int)
infer_time = (time.time() - predict_started) / len(X_test) * 1e6

acc = accuracy_score(y_test, y_pred)

# "Parâmetros aprendidos" counts the two embedding tables only (not the dense head).
# The divisor 3 below is the number of epochs passed to fit() above.
embedding_results = {
    "Dimensão": embedding_dim,
    "Reversível": "Não",
    "Parâmetros aprendidos": (n_users + 1) * embedding_dim + (n_movies + 1) * embedding_dim,
    "Tempo treino (s/época)": round(train_time / 3, 2),
    "Tempo inferência (µs)": round(infer_time, 2),
    "Acurácia (%)": round(acc * 100, 2),
}

print("✅ Resultados Embeddings:", embedding_results)


In [None]:
import numpy as np
import time
import tensorflow as tf
from numpy.linalg import LinAlgError
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# ---------------------------
# MLT otimizado (usa preallocação)
# ---------------------------
def _det_mod_p(A, p):
    """Return det(A) mod p exactly, for a square integer matrix and any modulus p >= 1.

    Rows are reduced with Euclid's algorithm: subtracting a multiple of one row
    from another leaves the determinant unchanged and a row swap flips its sign,
    so the matrix becomes upper triangular modulo p without any division. This
    works for composite moduli too, and every intermediate value stays below
    p**2, so int64 arithmetic is exact.
    """
    M = np.array(A, dtype=np.int64) % p
    size = M.shape[0]
    det = 1
    for c in range(size):
        for r in range(c + 1, size):
            # Euclid on the pair (M[c, c], M[r, c]) until the lower entry vanishes.
            while M[r, c] != 0:
                q = M[c, c] // M[r, c]
                M[c, c:] = (M[c, c:] - q * M[r, c:]) % p
                M[[c, r], c:] = M[[r, c], c:]  # swap rows c and r
                det = -det
        det = (det * int(M[c, c])) % p
        if det == 0:
            return 0
    return det


def random_invertible_matrix_mod_p(n, p, max_tries=1000):
    """Draw a random n x n matrix with entries in [0, p) that is invertible modulo p.

    The matrix is accepted when gcd(det(A) mod p, p) == 1. The determinant is
    computed exactly in modular arithmetic: a floating-point determinant
    (``np.linalg.det``) carries only about 53 bits of precision, so for sizes
    like n=64 its value modulo p is meaningless and singular matrices could be
    accepted.

    Args:
        n: Matrix size.
        p: Modulus; entries are drawn uniformly from 0..p-1 as int16.
        max_tries: Maximum number of random draws before giving up.

    Returns:
        An (n, n) int16 array, invertible modulo p.

    Raises:
        LinAlgError: If no invertible matrix is found within ``max_tries`` draws.
    """
    for _ in range(max_tries):
        A = np.random.randint(0, p, size=(n, n), dtype=np.int16)
        if np.gcd(_det_mod_p(A, p), p) == 1:
            return A % p
    raise LinAlgError(f"Não foi possível gerar matriz invertível em {max_tries} tentativas.")

def mlt_encode(ids, p=47, n=4, A=None, verbose=True):
    """Encode integer ids as n-dimensional codes: base-p digits mixed by A, modulo p.

    Each id is expanded into its n least-significant base-p digits ``d`` and
    mapped to ``(A @ d) mod p``. The matrix product is accumulated in int64:
    with int16 operands NumPy keeps the result in int16, which silently wraps
    once a dot product exceeds 32767.

    Ids are processed in fixed-size chunks with vectorised NumPy operations, so
    the int64 temporaries stay small regardless of how many ids are encoded.

    Args:
        ids: Array-like of integer ids.
        p: Base of the digit expansion and modulus of the codes.
        n: Number of digits, i.e. the code dimension.
        A: Optional (n, n) mixing matrix. When None, a fresh one is drawn with
            ``random_invertible_matrix_mod_p(n, p)``.
        verbose: When True, print a start and a finish message.

    Returns:
        Tuple ``(codes, A)``: ``codes`` is a (len(ids), n) int16 array and ``A``
        is the matrix that was used, so it can be reused for another split.
    """
    ids = np.asarray(ids, dtype=np.int64)
    if A is None:
        A = random_invertible_matrix_mod_p(n, p)

    if verbose:
        print(f"🔹 Codificando {len(ids)} IDs com base {p} e dimensão {n}...")

    A_wide = np.asarray(A, dtype=np.int64)
    codes = np.empty((len(ids), n), dtype=np.int16)
    chunk_rows = 65_536  # bounds the size of the int64 temporaries
    for lo in range(0, len(ids), chunk_rows):
        remaining = ids[lo:lo + chunk_rows].copy()
        digits = np.empty((len(remaining), n), dtype=np.int64)
        for j in range(n):
            digits[:, j] = remaining % p
            remaining //= p
        # Row i of (digits @ A.T) is A @ digits[i].
        codes[lo:lo + chunk_rows] = (digits @ A_wide.T) % p

    if verbose:
        print(f"✅ Finalizado: shape={codes.shape}")
    return codes, A

# ---------------------------
# Codificação user+movie
# ---------------------------
# Encode user ids and movie ids separately. Each test split is encoded with the matrix
# returned for the matching train split (A_user / A_movie), so both splits share one mapping.
print("\n🔹 Iniciando codificação de usuários e filmes")
X_train_user, A_user = mlt_encode(X_train[:,0], p=47, n=64)
X_test_user, _       = mlt_encode(X_test[:,0], p=47, n=64, A=A_user)

X_train_movie, A_movie = mlt_encode(X_train[:,1], p=47, n=64)
X_test_movie, _        = mlt_encode(X_test[:,1], p=47, n=64, A=A_movie)

# Final feature matrix: user code followed by movie code, side by side.
X_train_mlt = np.hstack((X_train_user, X_train_movie))
X_test_mlt  = np.hstack((X_test_user, X_test_movie))
print(f"🔹 Shape X_train_mlt={X_train_mlt.shape}, X_test_mlt={X_test_mlt.shape}")

# Standardise features; statistics are fitted on the train split only.
# NOTE(review): fit_transform processes the whole array in one call (no chunking);
# whether copy=False actually avoids a copy for this input dtype should be confirmed.
print("\n🔹 Normalizando features...")
scaler = StandardScaler(copy=False)
X_train_mlt = scaler.fit_transform(X_train_mlt)
X_test_mlt  = scaler.transform(X_test_mlt)
print("✅ Normalização concluída")

# ---------------------------
# Build the tf.data.Dataset pipelines
# ---------------------------
batch_size = 2048
print("\n🔹 Construindo datasets...")
# Training pipeline: shuffle with a buffer as large as the dataset, then batch and prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train_mlt.astype(np.float32), y_train.astype(np.float32)))
    .shuffle(buffer_size=len(X_train_mlt))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Test pipeline: no shuffling, so predictions stay aligned with y_test.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((X_test_mlt.astype(np.float32), y_test.astype(np.float32)))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
print("✅ Datasets prontos")

# ---------------------------
# Training over the dataset
# ---------------------------
print("\n🔹 Iniciando treino do modelo...")
model = build_model(X_train_mlt.shape[1])
epochs = 10

# Manual training loop: one train_on_batch call per batch, with a progress line every
# 50 batches and the mean batch loss printed at the end of each epoch.
start = time.time()
for epoch in range(epochs):
    epoch_loss = []
    batch_count = 0
    for X_batch, y_batch in train_dataset:
        loss, acc = model.train_on_batch(X_batch, y_batch)
        epoch_loss.append(loss)
        batch_count += 1
        if batch_count % 50 == 0:
            print(f"   [Época {epoch+1}] Batch {batch_count} → Loss={loss:.4f}, Acc={acc:.4f}")

    print(f"📌 Época {epoch+1}/{epochs} - Loss médio: {np.mean(epoch_loss):.4f} ({batch_count} batches)")
train_time = time.time() - start
print("✅ Treino finalizado")

# ---------------------------
# Inference
# ---------------------------
print("\n🔹 Rodando inferência...")
start = time.time()
y_pred = []
batch_count = 0
# Predict batch by batch, thresholding the sigmoid output at 0.5.
for X_batch, _ in test_dataset:
    batch_pred = (model.predict_on_batch(X_batch) > 0.5).astype(np.int8)
    y_pred.append(batch_pred)
    batch_count += 1
    if batch_count % 20 == 0:
        print(f"   Batch {batch_count} de inferência processado")
# Stack the per-batch predictions into one flat vector.
y_pred = np.vstack(y_pred).ravel()

# Inference time is reported in microseconds per test example.
infer_time = (time.time() - start) / len(X_test) * 1e6
# `acc` was used for per-batch training accuracy above; from here on it is test accuracy.
acc = accuracy_score(y_test, y_pred)
print("✅ Inferência concluída")

# ---------------------------
# Final results
# ---------------------------
mlt_results = {
    "Dimensão": X_train_mlt.shape[1],
    "Reversível": "Sim",
    "Parâmetros aprendidos": 0,
    "Tempo treino (s/época)": round(train_time/epochs, 2),
    "Tempo inferência (µs)": round(infer_time, 2),
    "Acurácia (%)": round(acc*100, 2)
}

print("\n🎯 Resultados finais MLT:")
for k, v in mlt_results.items():
    print(f"   {k}: {v}")


In [None]:
import numpy as np
import pandas as pd
import time
import tensorflow as tf
import pyarrow.parquet as pq
from numpy.linalg import LinAlgError
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# ---------------------------
# MLT otimizado
# ---------------------------
def _det_mod_p(A, p):
    """Return det(A) mod p exactly, for a square integer matrix and any modulus p >= 1.

    Rows are reduced with Euclid's algorithm: subtracting a multiple of one row
    from another leaves the determinant unchanged and a row swap flips its sign,
    so the matrix becomes upper triangular modulo p without any division. This
    works for composite moduli too, and every intermediate value stays below
    p**2, so int64 arithmetic is exact.
    """
    M = np.array(A, dtype=np.int64) % p
    size = M.shape[0]
    det = 1
    for c in range(size):
        for r in range(c + 1, size):
            # Euclid on the pair (M[c, c], M[r, c]) until the lower entry vanishes.
            while M[r, c] != 0:
                q = M[c, c] // M[r, c]
                M[c, c:] = (M[c, c:] - q * M[r, c:]) % p
                M[[c, r], c:] = M[[r, c], c:]  # swap rows c and r
                det = -det
        det = (det * int(M[c, c])) % p
        if det == 0:
            return 0
    return det


def random_invertible_matrix_mod_p(n, p, max_tries=1000):
    """Draw a random n x n matrix with entries in [0, p) that is invertible modulo p.

    The matrix is accepted when gcd(det(A) mod p, p) == 1. The determinant is
    computed exactly in modular arithmetic: a floating-point determinant
    (``np.linalg.det``) carries only about 53 bits of precision, so for sizes
    like n=128 its value modulo p is meaningless and singular matrices could be
    accepted.

    Args:
        n: Matrix size.
        p: Modulus; entries are drawn uniformly from 0..p-1 as int16.
        max_tries: Maximum number of random draws before giving up.

    Returns:
        An (n, n) int16 array, invertible modulo p.

    Raises:
        LinAlgError: If no invertible matrix is found within ``max_tries`` draws.
    """
    for _ in range(max_tries):
        A = np.random.randint(0, p, size=(n, n), dtype=np.int16)
        if np.gcd(_det_mod_p(A, p), p) == 1:
            return A % p
    raise LinAlgError(f"Não foi possível gerar matriz invertível em {max_tries} tentativas.")

def mlt_encode(ids, p=47, n=4, A=None, verbose=True):
    """Encode integer ids as n-dimensional codes: base-p digits mixed by A, modulo p.

    Each id is expanded into its n least-significant base-p digits ``d`` and
    mapped to ``(A @ d) mod p``. The matrix product is accumulated in int64:
    with int16 operands NumPy keeps the result in int16, which silently wraps
    once a dot product exceeds 32767.

    Ids are processed in fixed-size chunks so the int64 temporaries stay small
    regardless of how many ids are encoded.

    Args:
        ids: Array-like of integer ids.
        p: Base of the digit expansion and modulus of the codes.
        n: Number of digits, i.e. the code dimension.
        A: Optional (n, n) mixing matrix. When None, a fresh one is drawn with
            ``random_invertible_matrix_mod_p(n, p)``.
        verbose: When True, print a start and a finish message.

    Returns:
        Tuple ``(codes, A)``: ``codes`` is a (len(ids), n) int16 array and ``A``
        is the matrix that was used, so it can be reused for another split.
    """
    ids = np.asarray(ids, dtype=np.int64)
    if A is None:
        A = random_invertible_matrix_mod_p(n, p)

    if verbose:
        print(f"🔹 Codificando {len(ids)} IDs com base {p} e dimensão {n}...")

    A_wide = np.asarray(A, dtype=np.int64)
    codes = np.empty((len(ids), n), dtype=np.int16)
    chunk_rows = 65_536  # bounds the size of the int64 temporaries
    for lo in range(0, len(ids), chunk_rows):
        remaining = ids[lo:lo + chunk_rows].copy()
        digits = np.empty((len(remaining), n), dtype=np.int64)
        for j in range(n):
            digits[:, j] = remaining % p
            remaining //= p
        # Row i of (digits @ A.T) is A @ digits[i].
        codes[lo:lo + chunk_rows] = (digits @ A_wide.T) % p

    if verbose:
        print(f"✅ Finalizado: shape={codes.shape}")

    return codes, A

# ---------------------------
# Codificação user+movie
# ---------------------------
# Encode user ids and movie ids separately. Each test split is encoded with the matrix
# returned for the matching train split, so both splits share one mapping.
print("\n🔹 Iniciando codificação de usuários e filmes")
X_train_user, A_user = mlt_encode(X_train[:, 0], p=7, n=128)
X_test_user, _       = mlt_encode(X_test[:, 0], p=7, n=128, A=A_user)

X_train_movie, A_movie = mlt_encode(X_train[:, 1], p=7, n=128)
X_test_movie, _        = mlt_encode(X_test[:, 1], p=7, n=128, A=A_movie)

# The codes are int16. They must be cast to a floating dtype BEFORE the in-place
# normalisation below: writing standardised floats into an integer array truncates
# every value to an integer, which destroys the features.
X_train_mlt = np.hstack((X_train_user, X_train_movie)).astype(np.float32)
X_test_mlt  = np.hstack((X_test_user, X_test_movie)).astype(np.float32)
print(f"🔹 Shape X_train_mlt={X_train_mlt.shape}, X_test_mlt={X_test_mlt.shape}")

# ---------------------------
# Chunked normalisation (bounds the size of temporaries)
# ---------------------------
print("\n🔹 Normalizando features...")
scaler = StandardScaler()
chunk_size = 1000

# Pass 1: accumulate mean / standard deviation over the train split only.
for start in range(0, len(X_train_mlt), chunk_size):
    scaler.partial_fit(X_train_mlt[start:start + chunk_size])

# Pass 2: overwrite each chunk with its standardised values (arrays are float32 now).
for start in range(0, len(X_train_mlt), chunk_size):
    X_train_mlt[start:start + chunk_size] = scaler.transform(X_train_mlt[start:start + chunk_size])

for start in range(0, len(X_test_mlt), chunk_size):
    X_test_mlt[start:start + chunk_size] = scaler.transform(X_test_mlt[start:start + chunk_size])

print("✅ Normalização concluída")

# ---------------------------
# Salvar os Parquet
# ---------------------------
print("\n🔹 Salvando arquivos Parquet...")


def _feature_names(width):
    """Column names f0, f1, ..., f{width-1} for a feature matrix with `width` columns."""
    return [f"f{i}" for i in range(width)]


# One Parquet file per split: feature columns followed by a "label" column.
df_train = pd.DataFrame(X_train_mlt, columns=_feature_names(X_train_mlt.shape[1]))
df_train["label"] = y_train
df_train.to_parquet("train.parquet", index=False)

df_test = pd.DataFrame(X_test_mlt, columns=_feature_names(X_test_mlt.shape[1]))
df_test["label"] = y_test
df_test.to_parquet("test.parquet", index=False)

print("✅ Arquivos train.parquet e test.parquet salvos.")

# ---------------------------
# Funções auxiliares para Parquet → Dataset
# ---------------------------
def parquet_generator(parquet_file, feature_cols, label_col, batch_size=512):
    """Stream a Parquet file as (features, labels) float32 array pairs.

    Args:
        parquet_file: Path of the Parquet file to read.
        feature_cols: Names of the feature columns, in output order.
        label_col: Name of the label column.
        batch_size: Row count requested from ``iter_batches`` for each batch.

    Yields:
        Tuple ``(X, y)`` per record batch: ``X`` holds the feature columns and
        ``y`` the label column, both converted to float32.
    """
    reader = pq.ParquetFile(parquet_file)
    for record_batch in reader.iter_batches(batch_size=batch_size):
        frame = record_batch.to_pandas()
        yield (
            frame[feature_cols].to_numpy(dtype=np.float32),
            frame[label_col].to_numpy(dtype=np.float32),
        )

def make_dataset(parquet_file, feature_cols, label_col, batch_size=512, shuffle=False):
    """Wrap ``parquet_generator`` in a prefetching ``tf.data.Dataset``.

    Every dataset element is one whole batch yielded by the generator, so the
    optional shuffle reorders batches (buffer of 10,000 elements), not the rows
    inside them.

    Args:
        parquet_file: Path of the Parquet file to stream.
        feature_cols: Names of the feature columns.
        label_col: Name of the label column.
        batch_size: Batch size forwarded to ``parquet_generator``.
        shuffle: When True, shuffle the stream of batches.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` float32 tensors.
    """
    def _batches():
        return parquet_generator(parquet_file, feature_cols, label_col, batch_size)

    feature_spec = tf.TensorSpec(shape=(None, len(feature_cols)), dtype=tf.float32)
    label_spec = tf.TensorSpec(shape=(None,), dtype=tf.float32)

    dataset = tf.data.Dataset.from_generator(
        _batches,
        output_signature=(feature_spec, label_spec),
    )
    if shuffle:
        dataset = dataset.shuffle(buffer_size=10_000)
    return dataset.prefetch(tf.data.AUTOTUNE)

# ---------------------------
# Criar tf.data.Dataset
# ---------------------------
batch_size = 512
print("\n🔹 Construindo datasets a partir de Parquet...")

# Read only the schema to discover column names: every column except the label is a feature.
schema = pq.read_schema("train.parquet")
all_cols = schema.names
label_col = "label"
feature_cols = []
for column_name in all_cols:
    if column_name != label_col:
        feature_cols.append(column_name)
num_features = len(feature_cols)

# Only the training stream is shuffled; the test stream keeps file order.
train_dataset = make_dataset(
    "train.parquet", feature_cols, label_col, batch_size=batch_size, shuffle=True
)
test_dataset = make_dataset(
    "test.parquet", feature_cols, label_col, batch_size=batch_size, shuffle=False
)

print(f"✅ Datasets prontos (streaming de Parquet) - {num_features} features detectadas")


In [None]:

# ---------------------------
# Treino com dataset
# ---------------------------
print("\n🔹 Iniciando treino do modelo...")
model = build_model(num_features)
epochs = 10

# Manual training loop: one train_on_batch call per streamed batch, a progress line
# every 50 batches, and the mean batch loss at the end of each epoch.
start = time.time()
for epoch in range(epochs):
    epoch_loss = []
    batch_count = 0
    for batch_count, (X_batch, y_batch) in enumerate(train_dataset, 1):
        loss, acc = model.train_on_batch(X_batch, y_batch)
        epoch_loss.append(loss)
        if batch_count % 50 == 0:
            print(f"   [Época {epoch+1}] Batch {batch_count} → Loss={loss:.4f}, Acc={acc:.4f}")

    print(f"📌 Época {epoch+1}/{epochs} - Loss médio: {np.mean(epoch_loss):.4f} ({batch_count} batches)")

train_time = time.time() - start
print("✅ Treino finalizado")

# ---------------------------
# Inference
# ---------------------------
print("\n🔹 Rodando inferência...")
start = time.time()
y_pred = []
batch_count = 0
# Predict batch by batch, thresholding the sigmoid output at 0.5.
for batch_count, (X_batch, _) in enumerate(test_dataset, 1):
    batch_pred = (model.predict_on_batch(X_batch) > 0.5).astype(np.int8)
    y_pred.append(batch_pred)
    if batch_count % 20 == 0:
        print(f"   Batch {batch_count} de inferência processado")
# Stack the per-batch predictions into one flat vector.
y_pred = np.vstack(y_pred).ravel()

# Inference time is reported in microseconds per test example.
infer_time = (time.time() - start) / len(X_test) * 1e6
acc = accuracy_score(y_test, y_pred)
print("✅ Inferência concluída")

# ---------------------------
# Final results
# ---------------------------
mlt_results = {
    "Dimensão": num_features,
    "Reversível": "Sim",
    "Parâmetros aprendidos": 0,
    "Tempo treino (s/época)": round(train_time / epochs, 2),
    "Tempo inferência (µs)": round(infer_time, 2),
    "Acurácia (%)": round(acc * 100, 2),
}

print("\n🎯 Resultados finais MLT:")
for metric, value in mlt_results.items():
    print(f"   {metric}: {value}")


In [None]:
# 6. MLT + Autoencoder
print("🔹 MLT + Autoencoder")

p = 47
n = 64
bottleneck_dim = 16

# Encode users and movies separately; each test split reuses the matrix returned for
# the matching train split.
X_train_user, A_user = mlt_encode(X_train[:, 0], p=p, n=n)
X_test_user, _ = mlt_encode(X_test[:, 0], p=p, n=n, A=A_user)

X_train_movie, A_movie = mlt_encode(X_train[:, 1], p=p, n=n)
X_test_movie, _ = mlt_encode(X_test[:, 1], p=p, n=n, A=A_movie)

# Raw MLT input: user code and movie code side by side.
X_train_mlt = np.concatenate([X_train_user, X_train_movie], axis=1)
X_test_mlt = np.concatenate([X_test_user, X_test_movie], axis=1)

input_dim = X_train_mlt.shape[1]

# ---------------------------
# Autoencoder input: MLT codes treated as integer tokens
# ---------------------------
inp = layers.Input(shape=(input_dim,), dtype="int32", name="mlt_input")

# Each integer value (0..p-1) is looked up in a 16-dimensional embedding table,
# giving a tensor of shape (batch, input_dim, 16) ...
token_vectors = layers.Embedding(input_dim=p, output_dim=16, name="mlt_embedding")(inp)

# ... which is flattened to (batch, input_dim * 16) for the dense layers.
emb = layers.Flatten()(token_vectors)

from tensorflow.keras import regularizers

def residual_block(x, units, dropout_rate=0.2, name="res_block"):
    """Dense residual block: Dense(relu, L2) -> BatchNorm -> Dropout, plus a skip path.

    When the input's last dimension differs from ``units``, the skip path goes
    through a linear Dense projection so both branches can be added. The sum is
    passed through a final ReLU.

    Args:
        x: Input tensor.
        units: Output width of the block.
        dropout_rate: Dropout rate applied after batch normalisation.
        name: Prefix for the names of all layers created here.

    Returns:
        The output tensor of the block, with last dimension ``units``.
    """
    skip = x

    branch = layers.Dense(
        units,
        activation="relu",
        kernel_regularizer=regularizers.l2(1e-4),
        name=f"{name}_dense",
    )(x)
    branch = layers.BatchNormalization(name=f"{name}_bn")(branch)
    branch = layers.Dropout(dropout_rate, name=f"{name}_drop")(branch)

    # Project the skip path only when its width does not already match.
    needs_projection = skip.shape[-1] != units
    if needs_projection:
        skip = layers.Dense(units, activation=None, name=f"{name}_proj")(skip)

    merged = layers.Add(name=f"{name}_add")([branch, skip])
    return layers.Activation("relu", name=f"{name}_out")(merged)

# ---------------------------
# Encoder
# ---------------------------
# Encoder: two residual blocks narrowing to a linear bottleneck.
x = residual_block(emb, 128, dropout_rate=0.3, name="enc_res1")
x = residual_block(x, 32, dropout_rate=0.2, name="enc_res2")
bottleneck = layers.Dense(bottleneck_dim, activation="linear", name="bottleneck")(x)

# ---------------------------
# Decoder: mirrors the encoder and reconstructs the input_dim code values
# ---------------------------
x_dec = residual_block(bottleneck, 32, dropout_rate=0.2, name="dec_res1")
x_dec = residual_block(x_dec, 128, dropout_rate=0.3, name="dec_res2")
out_recon = layers.Dense(input_dim, activation="linear", name="reconstruction")(x_dec)

# ---------------------------
# Models
# ---------------------------
autoenc = models.Model(inputs=inp, outputs=out_recon, name="MLT_Autoencoder")

autoenc.compile(optimizer="adam", loss="mse")

# Train the autoencoder (unsupervised: the input is its own target).
autoenc.fit(
    X_train_mlt, X_train_mlt,
    epochs=10, batch_size=1024, verbose=1,
    validation_split=0.1
)

# ---------------------------
# Extract the trained encoder (input -> bottleneck)
# ---------------------------
encoder = models.Model(
    inputs=autoenc.input,
    outputs=autoenc.get_layer("bottleneck").output,
    name="MLT_Encoder"
)

# ---------------------------
# Compressed representations (bottleneck_dim values per example) for the classifier
# ---------------------------
X_train_emb = encoder.predict(X_train_mlt, verbose=0)
X_test_emb  = encoder.predict(X_test_mlt, verbose=0)

# ---------------------------
# Standard classifier (build_model) fed with the bottleneck features
# ---------------------------
model = build_model(bottleneck_dim)

# Single source of truth for the epoch count, so the per-epoch time reported below
# always matches what fit() actually ran (it used to be divided by 100 after 10 epochs).
clf_epochs = 10

start = time.time()
history = model.fit(X_train_emb, y_train, epochs=clf_epochs, batch_size=1024, verbose=1)
train_time = time.time() - start

# Inference time is reported in microseconds per test example (encoder time excluded).
start = time.time()
y_pred = (model.predict(X_test_emb, verbose=0) > 0.5).astype(int)
infer_time = (time.time() - start) / len(X_test) * 1e6

acc = accuracy_score(y_test, y_pred)

# "Parâmetros aprendidos" is the parameter count of the whole autoencoder.
mlt_autoenc_results = {
    "Dimensão": bottleneck_dim,
    "Reversível": "Sim (MLT+Decoder)",
    "Parâmetros aprendidos": autoenc.count_params(),
    "Tempo treino (s/época)": round(train_time/clf_epochs, 2),
    "Tempo inferência (µs)": round(infer_time, 2),
    "Acurácia (%)": round(acc*100, 2)
}

print("✅ Resultados MLT + Autoencoder:", mlt_autoenc_results)
#✅ Resultados MLT + Autoencoder: {'Dimensão': 16, 'Reversível': 'Sim (MLT+Decoder)', 'Parâmetros aprendidos': 79758, 'Tempo treino (s/época)': 2.13, 'Tempo inferência (µs)': 14.35, 'Acurácia (%)': 62.53}


In [None]:
# ---------------------------
# 7. Final comparison
# ---------------------------

# Fixed (non-learned or separately learned) representations, compared side by side.
fixed_rows = {
    "One-hot": one_hot_results,
    "Hashing": hashing_results,
    "MLT": mlt_results,
    "MLT+Autoencoder": mlt_autoenc_results,
}
results_fixed = pd.DataFrame(list(fixed_rows.values()), index=list(fixed_rows.keys()))

# Supervised embeddings are reported on their own, as a separate baseline.
results_embed = pd.DataFrame([embedding_results], index=["Embeddings (supervisionados)"])

print("\n📊 Resultados Comparativos – Codificações Fixas")
ranked_fixed = results_fixed.sort_values(by="Acurácia (%)", ascending=False)
display(ranked_fixed)

print("\n📊 Resultado – Embeddings Supervisionados (baseline separado)")
display(results_embed)
