In [1]:
import os
import re
import time
import numpy as np
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

In [2]:
products_df = pd.read_csv("C:/Users/benja/Desktop/Improving_Mp_Search/labeled_beauty_products.csv")

In [3]:
products_df.head()

Unnamed: 0,id,title,description,label
0,B01ALT5MKW,Babo Botanicals Sheer Zinc Continuous Spray Su...,Get the comfort of a lotion sunscreen and the ...,13
1,B0C5Z7V77F,SUNATORIA Keratin Hair Mask - Professional Tre...,Restorative Hair Mask with Hydrolyzed Keratin ...,0
2,B0BZJ5LDZ6,[Farewell] Sunny I Tip Hair Extensions Natural...,SpecificationsSunny I tip hair extensions are ...,16
3,B09MTGTV3G,Vvan Long Straight Hair Ombre Green Straight W...,Basic information 1Hair MaterialHeat Resistant...,2
4,B0722JDG4Q,"Makeup Bag,Leather Double Layer Large Makeup O...",Durable Makeup Bag Travel Accessories for Wome...,47


In [4]:
# We will sample 5 products per group and create sample queries for each of them
product_samples = products_df.groupby('label').sample(n=5, random_state=234).reset_index(drop=True)
product_samples.head(n=12)

Unnamed: 0,id,title,description,label
0,B07N84MBPB,CLIO Kill Cover Glow Fitting Cushion | Makeup ...,CLEAR BRIGHT RADIANCE ALL DAY LONG Get your gl...,-1
1,B0BYCBXF5X,"Handle Grip Nail Brush, Larbois Nail Brushes H...",New Design Compared with the styles on the mar...,-1
2,B09XBVZ6BY,Pronto 100% Acetone Gel Nail Polish Remover - ...,PROFESSIONAL NAIL POLISH REMOVER Our acetone 1...,-1
3,B01N418G6J,"Neutrogena Moisturizing Sheer Body Oil-Lotion,...",Product Description Help soften and smooth you...,-1
4,B00J70GIFA,Andalou Naturals Hyaluronic Dmae Lift Firm Ski...,Andalou Naturals Hyaluronic DMAE lift and firm...,-1
5,B079THCJHD,BOLDIFY Hair Volumizer and Hair Texture Powder...,Messfree lift fuller finish Flat fine or badly...,0
6,B07N8LMVQT,Alberto Vo5 Hot Oil Intense Conditioning Treat...,Alberto Vo5 Hot Oil Intense Conditioning Treat...,0
7,B00N2JN1L6,OKAY | Extra Dark 100% Natural Black Jamaican ...,OKAY Brand Castor Oil seeds are harvested and ...,0
8,B001UGL84U,Selsun Blue Medicated Anti-dandruff Shampoo wi...,Selsun Blue Medicated Antidandruff Shampoo for...,0
9,B00H7NKNE6,Foxbrim Naturals Extra Virgin Argan Oil for Ha...,EXTRA VIRGIN COLD PRESSED FIRST PRESSED ONLY ...,0


In [5]:
# Shuffle the products so that different groups are passed together
product_samples = product_samples.sample(frac=1).reset_index(drop=True)
product_samples.head()

Unnamed: 0,id,title,description,label
0,B0C523T7C5,L’Oréal Paris Paradise Enchanted Scented Eyesh...,LOreal Paris presents the Paradise Enchanted E...,49
1,B07939KVKK,Viva Naturals Charcoal Face Mask Set (8 Pack) ...,Take your skin care routine to the next level ...,45
2,B08ZDDGBDP,9 Pcs Satin Hair Scrunchies Soft Elastic Hair ...,Product Information Material satin Size4 in Co...,56
3,B09577STQ1,RikView Press on Nails Short French Tip Nails ...,RikView We focus on the design and sales of fa...,15
4,B007GMTJRK,"Kevyn Aucoin The Volume Mascara, Black: Precis...",Kevyn Aucoin The Volume Mascara Black A volume...,36


In [6]:
load_dotenv()
openai_api_key = os.environ['OPENAI_API_KEY']

In [8]:
class QueryGenerator:
    def __init__(self, api_key):
        """Initialize the generator with API key and standard prompts"""
        self.client = OpenAI(api_key=api_key)
        
        # Store the system and example prompts
        self.system_prompt = """You are an expert in e-commerce search optimization. Given a product description, generate a list of 5 realistic search queries a customer might use to find the product, even if the customer does not know the exact name of the product they are looking for.

        Guidelines:
        - Keep queries short (2-10 words)
        - Avoid exact repetition of product specs; focus on what a customer would type
        - Include a mix of general, descriptive, and feature-focused queries
        - Consider different use cases for each product (e.g. baking soda can be used for cooking, cleaning, and removing bad odors).
        - Use casual, natural language
        - Return ONLY the list of 5 queries enclosed in [] and separated by ',' with no additional text"""

        self.example_prompt = """Here are some examples:

        Product Description: "Restorative Hair Mask with Hydrolyzed Keratin for dry damaged hair deeply repairs and conditions hair to restore softness shine and bounce Damaged hair treatment provides restoration and lasting protection from drying out leaving hair fabulously healthy and easy to manage Unique keratin formula  Smooth Forces Hair Mask Contains Vitamin E to help replenish the shine that the hair loses when exposed to harsh chemicals heat and styling products Omega 3 and Omega 9 increase moisture and hair elasticity This keratin treatment at home is perfect for dry damaged brittle and colortreated hair of any texture It can help treat splitends caused by straightening and years of color treatments You can use this keratin hair treatment at home to get salonquality results Sodium Sulfate and Parabenfree this hair treatment is qualitytested to ensure purity The hair masque is a deep hair hydrating conditioner that contains 100 TOP Grade Hydrolyzed Keratin Our hair mask for damaged hair repairing was exclusively designed for companies in the United Kingdom using the best available ingredients It is a keratin complex treatment that delivers the highest possible quality"
        
        Queries:
        [restorative hair mask, keratin hair mask, hair moisturizer, damaged hair repair, my hair is dry]
        
        Product Description: "Developed with dermatologists CeraVe SA Cream for Rough  Bumpy Skin has a unique formula that exfoliates and moisturizes while helping to restore the protective skin barrier Salicylic acid exfoliates and softens to smooth rough skin lactic acid exfoliates to renew skins surface and three essential ceramides 1 3 6II help to restore the protective skin barrier NoncomedogenicFragrancefreeHypoallergenicGentle nonirritating formula Key Ingredients  Salicylic Acid and Lactic Acid  Gently exfoliates to eliminate dead skin cells Vitamin D  Enriched formula MVE Technology  This patented delivery system continually releases moisturizing ingredients for 24hour hydration Ceramides  Essential for healthy skin ceramides help restore and maintain the skins natural barrier Hyaluronic Acid  This ingredient attracts hydration to the skins surface and helps the skin retain moisture Niacinamide Helps calm the skin This product is nonsealed  CERAVE SA CREAM  Body moisturizer that gently exfoliates to help soften and smooth dry scaly or rough and bumpy skin on legs and upper arms Rich velvety texture leaves skin feeling smooth Absorbed quickly and has a nongreasy feel  EXFOLIATE  MOISTURIZE  Salicylic Acid  Lactic Acid to help exfoliate Hyaluronic Acid to help retain skins moisture and Niacinamide to help calm skin SA is a Beta Hydroxy Acid BHA an exfoliator that removes dead skin cells and smooths skin  GENTLE ON SKIN  Suitable for body Fragrance free allergytested and noncomedogenic Can be paired with CeraVe Salicylic Acid Body Wash for Rough and Bumpy Skin to cleanse  exfoliate  3 ESSENTIAL CERAMIDES  Ceramides are found naturally in the skin and make up 50 of the lipids in the skin barrier All CeraVe products are formulated with three essential ceramides 1 3 6II to help restore and maintain the skins natural barrier  DEVELOPED WITH DERMATOLOGISTS  CeraVe Skincare is developed with dermatologists and has products suitable for dry skin sensitive skin oily skin acneprone and more"
        
        Queries:
        [salicylic acid cream for face, dermatologist approved cream, exfoliating cream, i have rough skin with bumps, i want to restore skin barrier]
        """

    def generate_single(self, product_description):
        """Generate queries for a single product description"""
        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": self.example_prompt},
                    {"role": "user", "content": f"Now generate queries for this product description: {product_description}"}
                ],
                temperature=0.7,
                max_tokens=50
            )
            
            res = response.choices[0].message.content
            return res.replace('[', '').replace(']', '').split(', ')
            
        except Exception as e:
            print(f"Error generating queries: {e}")
            return []

    def generate_batch(self, product_descriptions, batch_size: int = 5):
        """
        Generate queries for multiple products efficiently.
        Uses batching and includes rate limiting.
        """
        batch_results = []
        
        # Process in batches
        for i in range(0, len(product_descriptions), batch_size):
            batch = product_descriptions[i:i + batch_size]
            
            # Create a single prompt for the batch
            batch_prompt = "Generate queries for each of these products. For each product, start with 'Product X:' where X is the product number:\n\n"
            for idx, desc in enumerate(batch, 1):
                batch_prompt += f"Product {idx}: {desc}\n\n"
            
            try:
                response = self.client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": self.system_prompt},
                        {"role": "user", "content": self.example_prompt},
                        {"role": "user", "content": batch_prompt}
                    ],
                    temperature=0.7,
                    max_tokens=50 * len(batch)  # Adjust tokens based on batch size
                )
                
                # Parse the batch response
                response_text = response.choices[0].message.content
                response_list = re.sub("Product \d: ", "", response_text).replace('[', '').replace(']', '').split('\n')
                batch_results.extend([text.split(', ') for text in response_list])
                                
                # Rate limiting - wait 1 second between batches
                if i + batch_size < len(product_descriptions):
                    time.sleep(5)
                    
            except Exception as e:
                print(f"Error processing batch starting at index {i}: {e}")
                continue
        
        return batch_results

  response_list = re.sub("Product \d: ", "", response_text).replace('[', '').replace(']', '').split('\n')


In [9]:
generator = QueryGenerator(openai_api_key)

In [10]:
# Number of products for which we are generating queries
product_samples['description'].shape

(340,)

In [11]:
# Create example queries for all descriptions
results = generator.generate_batch(product_samples['description'])

In [12]:
len(results)

420

We have more results than products, so let's go ahead and parse through the results so that we correctly extract the right information.

In [14]:
[result for result in results if len(result[0]) <= 0]

[[''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 ['']]

The above results are just empty strings, so we can filter them out

In [15]:
cleaned_results = [result for result in results if len(result[0]) > 0]
len(cleaned_results)

348

We now only have one more result than we need.

In [16]:
np.unique([len(result) for result in cleaned_results])

array([1, 5])

It looks like we have a variable number of queries in some of the results. Let's examine each of the results that don't have the desired number of queries (5).

In [None]:
# Indices of results that don't have 5 queries
inspect_index = [i for i in range(len(cleaned_results)) if len(cleaned_results[i]) != 5]
inspect_index

[11, 13, 15, 17, 250, 252, 254, 256]

In [18]:
[cleaned_results[i] for i in inspect_index]

[['        '],
 ['        '],
 ['        '],
 ['        '],
 ['        '],
 ['        '],
 ['        '],
 ['        ']]

In [23]:
for i in range(20):
    print(cleaned_results[i])

['warm eyeshadow palette', 'fruity scented eye shadow', 'enchanting eye makeup', 'eye shadow with coral tones', 'step by step eye makeup guide']
['charcoal face mask set', 'hydrating sheet masks', 'glowy skin face mask', 'dermatologist approved facial masks', 'sheet mask set with essential oils']
['satin scrunchies set', 'colorful hair ties', 'gentle hair accessories', 'trendy ponytail holders', 'satin hairbands variety pack']
['fashionable fake nails set', 'salon comparable false nails', 'eco-friendly press on nails', 'reusable square nails', 'easy application nail art']
['volumizing tubing mascara', 'Kevyn Aucoin mascara', 'mascara for defined lashes', 'tubing mascara with jojoba oil', 'smudge-resistant lash lengthener']
['boar bristle hair brush', 'detangling hair brush', 'vented hair brush for blow drying', 'oversized curved hair brush', 'hair brush for all hair types']
['self-adhesive eyelashes', 'eyelashes without glue', 'natural looking lashes', 'reusable self-adhesive lashes', 

All the values in inspect_index are missing? Hence I'll remove them

In [24]:
cleaned_results_filtered = [item for i, item in enumerate(cleaned_results) if i not in inspect_index]

In [25]:
print(f"Number of results: {len(cleaned_results_filtered)}")
print(f"Number of queries per result: {np.unique([len(result) for result in cleaned_results_filtered])}")

Number of results: 340
Number of queries per result: [5]


In [26]:
# Add queries to original products
product_samples.loc[:, ['query_1', 'query_2', 'query_3', 'query_4', 'query_5']] = cleaned_results_filtered

In [27]:
product_samples.head()

Unnamed: 0,id,title,description,label,query_1,query_2,query_3,query_4,query_5
0,B0C523T7C5,L’Oréal Paris Paradise Enchanted Scented Eyesh...,LOreal Paris presents the Paradise Enchanted E...,49,warm eyeshadow palette,fruity scented eye shadow,enchanting eye makeup,eye shadow with coral tones,step by step eye makeup guide
1,B07939KVKK,Viva Naturals Charcoal Face Mask Set (8 Pack) ...,Take your skin care routine to the next level ...,45,charcoal face mask set,hydrating sheet masks,glowy skin face mask,dermatologist approved facial masks,sheet mask set with essential oils
2,B08ZDDGBDP,9 Pcs Satin Hair Scrunchies Soft Elastic Hair ...,Product Information Material satin Size4 in Co...,56,satin scrunchies set,colorful hair ties,gentle hair accessories,trendy ponytail holders,satin hairbands variety pack
3,B09577STQ1,RikView Press on Nails Short French Tip Nails ...,RikView We focus on the design and sales of fa...,15,fashionable fake nails set,salon comparable false nails,eco-friendly press on nails,reusable square nails,easy application nail art
4,B007GMTJRK,"Kevyn Aucoin The Volume Mascara, Black: Precis...",Kevyn Aucoin The Volume Mascara Black A volume...,36,volumizing tubing mascara,Kevyn Aucoin mascara,mascara for defined lashes,tubing mascara with jojoba oil,smudge-resistant lash lengthener


In [28]:
product_samples.to_csv('C:/Users/benja/Desktop/Improving_Mp_Search/beauty_products_with_sample_queries.csv', index=False)