# OpenAI - A partir do input de um usuário, encontra o produto mais relevante

In [10]:
import os
from dotenv import load_dotenv
import openai
import pandas as pd
import numpy as np
import ast
from sklearn.metrics.pairwise import cosine_distances
from tqdm.auto import tqdm


# 1) API key
# Load variables from a .env file into the process environment, then read
# the OpenAI key from there.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Fail early when the key is absent or empty (message: key not found in .env).
    raise ValueError("OPENAI_API_KEY não encontrada no .env")

# 2) Read the CSV that holds the embeddings
df = pd.read_csv('OpenAI_embeddings.csv')

# 3) Embedding columns, already ordered from most to least relevant
emb_cols = [
    'embedding_product_name',
    'embedding_category',
    'embedding_about_product',
    'embedding_review_title',
    'embedding_review_content'
]

# 4) Set your weights here (they must be in the same order as emb_cols and sum to 1.0)
weights = [0.30, 0.25, 0.20, 0.15, 0.10]

# 5) Convert string → np.array and compute the weighted combination
def weighted_embedding(row, cols=None, col_weights=None):
    """Combine a row's per-column embeddings into a single weighted vector.

    Each column is expected to hold an embedding serialized as a Python
    literal string (e.g. ``"[0.1, 0.2, ...]"``). Every parsed vector is
    multiplied by the weight of its column and the results are summed.
    A cell that is not a non-blank string (missing value, NaN, empty text)
    contributes nothing, which is equivalent to a zero vector.

    Args:
        row: Mapping-like object indexed by column name (for example a
            pandas Series produced by ``DataFrame.apply(..., axis=1)``).
        cols: Column names to combine. Defaults to the module-level
            ``emb_cols``.
        col_weights: One weight per entry of ``cols``, in the same order.
            Defaults to the module-level ``weights``.

    Returns:
        A 1-D float ``np.ndarray`` with the same length as the row's
        embeddings.

    Raises:
        ValueError: If ``cols`` and ``col_weights`` differ in length, or if
            the row has no embedding in any of the columns (the output
            dimension cannot be determined in that case).
    """
    if cols is None:
        cols = emb_cols
    if col_weights is None:
        col_weights = weights
    if len(cols) != len(col_weights):
        raise ValueError("cols e col_weights devem ter o mesmo tamanho")

    # Parse every column first so a missing FIRST column no longer breaks
    # the zero-vector fallback (the shape is taken from any available vector).
    parsed = []
    for col in cols:
        cell = row[col]
        if isinstance(cell, str) and cell.strip():
            parsed.append(np.array(ast.literal_eval(cell), dtype=float))
        else:
            parsed.append(None)

    template = next((vec for vec in parsed if vec is not None), None)
    if template is None:
        raise ValueError("Nenhum embedding disponível na linha")

    weighted = np.zeros_like(template)
    for weight, vec in zip(col_weights, parsed):
        # Skipping a missing vector is the same as adding a zero vector.
        if vec is not None:
            weighted += weight * vec
    return weighted

# Register tqdm's progress_apply on pandas objects; desc is the progress-bar label.
tqdm.pandas(desc="Calculando embeddings ponderados")
# axis=1 hands each DataFrame row to weighted_embedding; the result is one vector per row.
df['emb_weighted'] = df.progress_apply(weighted_embedding, axis=1)
# Stack the per-row vectors into one 2-D array whose row order follows df's row order.
emb_matrix = np.vstack(df['emb_weighted'].values)

# 6) Build the embedding vector for a search query
def get_query_embedding(text: str, model: str = "text-embedding-3-small"):
    """Return the embedding of ``text`` as a 1-D float NumPy array.

    Args:
        text: Search text to embed; must contain non-whitespace characters.
        model: Name of the OpenAI embedding model to request.

    Returns:
        ``np.ndarray`` of floats built from the first item of the API response.

    Raises:
        ValueError: If ``text`` is empty, ``None`` or only whitespace.
    """
    has_content = bool(text) and bool(text.strip())
    if not has_content:
        raise ValueError("Texto de busca vazio")

    response = openai.embeddings.create(model=model, input=text)
    first_item = response.data[0]
    return np.array(first_item.embedding, dtype=float)

# 7) Weighted semantic search
def semantic_search(query_text: str, top_k: int = 3):
    """Return the products whose weighted embeddings are closest to the query.

    The query is embedded with ``get_query_embedding`` and compared against
    every row of ``emb_matrix`` using cosine distance; the ``top_k`` rows
    with the smallest distance are returned.

    Args:
        query_text: Free-text search query.
        top_k: Number of results to keep from the start of the ranking.

    Returns:
        DataFrame with the ``product_id`` and ``product_name`` columns of
        the selected rows, re-indexed from 0 in ranking order.
    """
    query_vector = get_query_embedding(query_text)
    # cosine_distances works on 2-D inputs, so the query becomes a single-row matrix.
    query_row = query_vector.reshape(1, -1)
    distance_row = cosine_distances(query_row, emb_matrix)[0]

    ranking = np.argsort(distance_row)  # ascending: closest first
    best_positions = ranking[:top_k]

    hits = df.iloc[best_positions]
    return hits[['product_id', 'product_name']].reset_index(drop=True)



Calculando embeddings ponderados:   0%|          | 0/1465 [00:00<?, ?it/s]

In [13]:
# 8) Usage
query = "eu quero um headphone para usar no escritorio que evite o barulho do exterior"
top3 = semantic_search(query, top_k=3)
# Lift pandas' column-width limit and widen the console so long product names print in full.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 200)
print(top3)

   product_id                                                                                                                                                      product_name
0  B00Y4ORQ46                                                                                                              Logitech H111 Wired On Ear Headphones With Mic Black
1  B009LJ2BXA  Hp Wired On Ear Headphones With Mic With 3.5 Mm Drivers, In-Built Noise Cancelling, Foldable And Adjustable For Laptop/Pc/Office/Home/ 1 Year Warranty (B4B09Pa)
2  B07L8KNP5F                                                              ZEBRONICS Zeb-Thunder Bluetooth Wireless Over Ear Headphone FM, mSD, 9 hrs Playback with Mic (Black)
