# BERT - A partir de 1 product_id o programa gera os 3 produtos mais semelhantes da Amazon

In [5]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from tqdm.auto import tqdm
import torch

# 1) Carrega o DataFrame e seleciona colunas
df = pd.read_csv('amazon.csv')  # ajuste o caminho
df = df[['product_id', 'product_name', 'category', 'about_product']]

# 2) Monta texto combinado
def combine_text(row):
    parts = []
    for col in ['product_name', 'category', 'about_product']:
        if isinstance(row[col], str) and row[col].strip():
            parts.append(row[col].strip())
    return " ".join(parts)

df['combined_text'] = df.apply(combine_text, axis=1)

# 3) Inicializa BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model     = BertModel.from_pretrained('bert-base-uncased')
model.eval()

# 4) Função para gerar embedding CLS
def get_embedding(text: str):
    inputs = tokenizer(text, return_tensors='pt', padding=True,
                       truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[0, 0].cpu().numpy()

# 5) Gera embeddings e salva em .npy
embeddings = []
for txt in tqdm(df['combined_text'], desc='Embedding produtos'):
    embeddings.append(get_embedding(txt))
embeddings = np.vstack(embeddings)
np.save('bert_product_embeddings.npy', embeddings)

# 6) (Opcional) salve também o DataFrame reduzido para referência
df[['product_id']].to_parquet('df_product_ids.parquet', index=False)

print("✅ Embeddings salvos em bert_product_embeddings.npy")

Embedding produtos:   0%|          | 0/1465 [00:00<?, ?it/s]

✅ Embeddings salvos em bert_product_embeddings.npy


In [11]:

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

# 1) Recarrega o DataFrame mínimo com IDs e nomes
df_prod = pd.read_csv('amazon.csv')  # deve conter colunas product_id, product_name

ids   = df_prod['product_id'].values
names = df_prod['product_name'].values

# 2) Carrega os embeddings pré-computados
embeddings = np.load('bert_product_embeddings.npy')

# 3) Mapeia product_id → índice
id_to_idx = {pid: i for i, pid in enumerate(ids)}

# 4) Função de busca que retorna um DataFrame com ID + nome
def find_similar_products(query_id, embeddings, id_to_idx, df_prod, top_k=3):
    if query_id not in id_to_idx:
        raise KeyError(f"Product ID {query_id} não encontrado.")
    idx = id_to_idx[query_id]
    q_emb = embeddings[idx].reshape(1, -1)
    dists = cosine_distances(q_emb, embeddings)[0]
    dists[idx] = np.inf
    nearest = np.argsort(dists)[:top_k]
    # monta o resultado com ID e nome
    return df_prod.iloc[nearest][['product_id', 'product_name']].reset_index(drop=True)



In [23]:
# 5) Uso
consulta = "B098NS6PVG"  # troque pelo product_id desejado
top3_df = find_similar_products(consulta, embeddings, id_to_idx, df_prod, top_k=3)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 200)
print(top3_df)

   product_id  \
0  B098NS6PVG   
1  B098NS6PVG   
2  B082LSVT4B   

                                                                                                                                                                                        product_name  
0  Ambrane Unbreakable 60W / 3A Fast Charging 1.5m Braided Type C Cable for Smartphones, Tablets, Laptops & other Type C devices, PD Technology, 480Mbps Data Sync, Quick Charge 3.0 (RCT15A, Black)  
1  Ambrane Unbreakable 60W / 3A Fast Charging 1.5m Braided Type C Cable for Smartphones, Tablets, Laptops & other Type C devices, PD Technology, 480Mbps Data Sync, Quick Charge 3.0 (RCT15A, Black)  
2          Ambrane Unbreakable 60W / 3A Fast Charging 1.5m Braided Type C to Type C Cable for Smartphones, Tablets, Laptops & Other Type C Devices, PD Technology, 480Mbps Data Sync (RCTT15, Black)  
