In [1]:
import os
import re
import time
import numpy as np
import pandas as pd
from openai import OpenAI

In [2]:
products_df = pd.read_csv("data/labeled_tools_and_home_improvement_products.csv")

In [3]:
products_df.head()

Unnamed: 0,id,title,description,label
0,B09M8J9LQ9,Decute 200LED Christmas String Lights Outdoor ...,Specifications Light Color Multicolored Applic...,22
1,B0BNZ888C7,Upgrade Hydro Shower Jet Head High Pressure Hy...,HighPressure Filtered Shower Head with 3 Repla...,10
2,B07TLCQ3NR,"1"" Adhesive Furniture Sliders -Furnigear Chair...",Adhesive Furniture Glides SlidersFurnigear Hea...,17
3,B09HRSKRFB,"Sunlite 40455 LED A19 Colored Light Bulb, 3 Wa...",Sunlites colored decorative A19 household ligh...,3
4,B014AV8FY2,Dixie Belle Paint Company Chalk Finish Furnitu...,CHALK MINERAL PAINT Amethyst is a rich deep m...,9


In [4]:
# We will sample 5 products per group and create sample queries for each of them
product_samples = products_df.groupby('label').sample(n=5, random_state=234).reset_index(drop=True)
product_samples.head(n=12)

Unnamed: 0,id,title,description,label
0,B08LBHKCG6,ESTWING Rock Pick - 22 oz Geological Hammer wi...,Product Description Estwing Rock Picks are the...,-1
1,B08X4LDHS7,"SOAIY Star Projector, Night Light 3IN1 Starry ...",Galaxy Light Projector Smart LifeWorks with Al...,-1
2,B0BG3ZNVTX,Christmas Decorative Laser Lights Projector Ou...,OUTDOOR USE Longdistance projection is still ...,-1
3,B000JFLMBM,DANCO Brass Closet Bolts with Nuts and Washers...,Product Description Danco is one of the larges...,-1
4,B09KC9BG3D,"Moon Lamp, LOGROTATE 16 Colors LED Night Light...",A Unique Kids Night Light Moon lamp with the d...,-1
5,B0B2RPLQKS,"Cordless Screwdriver, VIGRUE Rechargeable Elec...",Mutifunction Drill Driver and Electric Screwdr...,0
6,B0C49QGX4L,Dremel Lite 7760 N/10 4V Li-Ion Cordless Rotar...,Dremel Lite 7760 N10 4V LiIon Cordless Rotary ...,0
7,B077ZYMK1W,Milwaukee Electric Tools MLW2553-20 M12 Fuel 1...,The M12 FUEL 14 in Hex impact driver once agai...,0
8,B0BWN8YQLT,Winzwon Cutting Wheels Set 44 Pcs for Dremel R...,Diamond Cutting Wheels 15 Pcs Match with 2pcs ...,0
9,B00Z82EYZ2,Ridgid R840095 Gen5X Genuine OEM Dual Chemistr...,If you have Ridgid 18V batteries youll want to...,0


In [5]:
# Shuffle the products so that different groups are passed together
product_samples = product_samples.sample(frac=1).reset_index(drop=True)
product_samples.head()

Unnamed: 0,id,title,description,label
0,B0BFJL5LD1,VGYVGYCC Outdoor Solar Garden Lights - 2 Pack ...,2022 Newest Version2 pack solar tulip lights a...,12
1,B0002H49E4,LEATHERMAN - Standard Nylon Sheath with Pocket...,Product Description This nylon belt sheath is ...,55
2,B084GYHQFY,"Makita MAC210Q Quiet Series, 1 HP, 2 Gallon, O...",Compressors are workhorse tools on the job sit...,31
3,B09ZP7M7R1,"Greenclick Landscape Lighting, 3W 12V Extendab...",Safe Low Voltage Landscape LightsLow voltage l...,35
4,B0BRGRNK2H,20 oz Big Gap Filler Insulating Foam Sealant (...,Product Description GREAT STUFF Big Gap Filler...,8


In [6]:
openai_api_key = os.environ['OPENAI_API_KEY']

In [7]:
class QueryGenerator:
    def __init__(self, api_key):
        """Initialize the generator with API key and standard prompts"""
        self.client = OpenAI(api_key=api_key)
        
        # Store the system and example prompts
        self.system_prompt = """You are an expert in e-commerce search optimization. Given a product description, generate a list of 5 realistic search queries a customer might use to find the product, even if the customer does not know the exact name of the product they are looking for.

        Guidelines:
        - Keep queries short (2-10 words)
        - Avoid exact repetition of product specs; focus on what a customer would type
        - Include a mix of general, descriptive, and feature-focused queries
        - Consider different use cases for each product (e.g. baking soda can be used for cooking, cleaning, and removing bad odors).
        - Use casual, natural language
        - Return ONLY the list of 5 queries enclosed in [] and separated by ',' with no additional text"""

        self.example_prompt = """Here are some examples:

        Product Description: "Desk Lamps with Smooth Dimming Function from 100 to 0The rotary knob on the lamp base allows you to set the brightness to any level from 0 to 100 Easy to change the visual appearance and mood of your space Bedside Table Lamps with 2 USB Charging Ports and 1 AC OutletEach lamp comes with two USB charging ports5V21A and one 2prong AC power outlet120V which are available for use whether the lights on or off Super convenient for you to charge your cellphone iPad Kindle diffuser Apple Watch tablet and other electronic devices Save you from the trouble of not having enough wall sockets to charge multiple devices at the same time EyeCaring LED Edison Bulbs IncludedTwo vintage LED bulbs are included in the package so you can have two workable table lamps instantly upon receiving Save your time in finding compatible ones These 60Watt equivalent LED bulbs offer brightness up to 800LM by drawing only 7W energy The 5000K light these bulbs emit is similar to natural daylight making your surrounding clear and helping you stay focused Minimalist Upright DesignFeaturing clearline metal body in black finish and seeded glass shade this table lamp complements any decor styles from industrial modern antique to midcentury Ideal light source for all areas like bedroom living room kitchen office kids room nursery room and guestroom Lovely Gift Idea Perfect gift choice for family and friends on festivals birthdays graduation and housewarming days"
        
        Queries:
        [adjustable desk lamp, light with charging port for devices, light for working on my desk, basic design lamp, multipurpose lamp]
        
        Product Description: "Keyless Entry Door Lock SMONET Smart Lock can recognize your fingerprint in just 05 seconds and unlock your door in 1 second faster than fumbling for your keys Compatible with Alexa or Google Assistant Requires a wifi gateway to hook it up to Internet sold separately Smart Bluetooth Keypad Lock Not only bring you security and trust but also bring you a brandnew smart home Five ways to unlock your door  Mobile Keypad Fingerprint IC Fob and Mechanical Keys Customize Management  Sign in Bluetooth Locks for Front Door you could check the records about unlocking and wrong passcode in real time Code Door Lock allow setting generate Timed Permanent Onetime or Customized passcode for guests friends housekeepers or employees Smart Locks for Front Door Intuitive touchscreen display alloy body long battery life 4pcs AA batteries can provide 10000 openings approx 12 months For security reason it will automatically lock for 5 minutes when enter the password incorrectly more than 5 times A great solution for home hotel Airbnb etc Excellent Customer Service  1year warranty and free lifetime technical support Any question such as installation or operation problemsor for more gateway information please feel free to contact us"
        
        Queries:
        [I want a more secure lock for my front door, lock and unlock door with my phone, open door with my finger, door lock that doesn't use key, fancy tech door lock]
        """

    def generate_single(self, product_description):
        """Generate queries for a single product description"""
        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": self.example_prompt},
                    {"role": "user", "content": f"Now generate queries for this product description: {product_description}"}
                ],
                temperature=0.7,
                max_tokens=50
            )
            
            res = response.choices[0].message.content
            return res.replace('[', '').replace(']', '').split(', ')
            
        except Exception as e:
            print(f"Error generating queries: {e}")
            return []

    def generate_batch(self, product_descriptions, batch_size: int = 5):
        """
        Generate queries for multiple products efficiently.
        Uses batching and includes rate limiting.
        """
        batch_results = []
        
        # Process in batches
        for i in range(0, len(product_descriptions), batch_size):
            batch = product_descriptions[i:i + batch_size]
            
            # Create a single prompt for the batch
            batch_prompt = "Generate queries for each of these products. For each product, start with 'Product X:' where X is the product number:\n\n"
            for idx, desc in enumerate(batch, 1):
                batch_prompt += f"Product {idx}: {desc}\n\n"
            
            try:
                response = self.client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": self.system_prompt},
                        {"role": "user", "content": self.example_prompt},
                        {"role": "user", "content": batch_prompt}
                    ],
                    temperature=0.7,
                    max_tokens=50 * len(batch)  # Adjust tokens based on batch size
                )
                
                # Parse the batch response
                response_text = response.choices[0].message.content
                response_list = re.sub("Product \d: ", "", response_text).replace('[', '').replace(']', '').split('\n')
                batch_results.extend([text.split(', ') for text in response_list])
                                
                # Rate limiting - wait 1 second between batches
                if i + batch_size < len(product_descriptions):
                    time.sleep(5)
                    
            except Exception as e:
                print(f"Error processing batch starting at index {i}: {e}")
                continue
        
        return batch_results

In [8]:
generator = QueryGenerator(openai_api_key)

In [9]:
# Number of products for which we are generating queries
product_samples['description'].shape

(295,)

In [10]:
# Create example queries for all descriptions
results = generator.generate_batch(product_samples['description'])

In [11]:
len(results)

361

We have more results than products, so let's go ahead and parse through the results so that we correctly extract the right information.

In [15]:
[result for result in results if len(result[0]) <= 0]

[[''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 ['']]

The above results are just empty strings, so we can filter them out

In [80]:
cleaned_results = [result for result in results if len(result[0]) > 0]
len(cleaned_results)

296

We now only have one more result than we need.

In [81]:
np.unique([len(result) for result in cleaned_results])

array([1, 5, 6, 7, 8])

It looks like we have a variable number of queries in some of the results. Let's examine each of the results that don't have the desired number of queries (5).

In [82]:
# Indices of results that don't have 5 queries
inspect_index = [i for i in range(len(cleaned_results)) if len(cleaned_results[i]) != 5]
inspect_index

[21, 23, 128, 129, 246, 284]

In [83]:
[cleaned_results[i] for i in inspect_index]

[['versatile cross sliding vise for drill press',
  'durable cast iron construction vise',
  'precise milling machine attachment',
  'swivel crank handles for woodworking',
  'high-quality jaws and screws vise',
  'adjustable shims and crank handles'],
 ['voice-controlled smart light bulbs compatible with Alexa',
  'Google Home',
  'and Siri',
  'remote control via Smart Life app',
  'multi-color and dimmable brightness settings',
  'timer and customized scenes features',
  'energy-saving quality LED bulbs'],
 ['solar-powered globe string lights with auto on/off',
  '8 different lighting modes for versatile use',
  'IP65 waterproof for outdoor installation',
  'memory function to save mode settings',
  'perfect for patio',
  'garden',
  'Christmas',
  'and weddings'],
 ['kitchen cabinet knobs with screws',
  'durable stainless steel round pull knobs',
  'classic design for various furniture',
  'widely used for cabinets',
  'drawers',
  'closets',
  'and more'],
 ['        '],
 ['keyle

Due to the fact that we separated results by ',', this messed up the separation of some list items. Let's correct this for each of the above results.

In [84]:
cleaned_results[21]

['versatile cross sliding vise for drill press',
 'durable cast iron construction vise',
 'precise milling machine attachment',
 'swivel crank handles for woodworking',
 'high-quality jaws and screws vise',
 'adjustable shims and crank handles']

It is hard to determine which query got split in the above results, so let's just remove the last result

In [85]:
cleaned_results[21] = cleaned_results[21][:5]

In [86]:
cleaned_results[23]

['voice-controlled smart light bulbs compatible with Alexa',
 'Google Home',
 'and Siri',
 'remote control via Smart Life app',
 'multi-color and dimmable brightness settings',
 'timer and customized scenes features',
 'energy-saving quality LED bulbs']

In the above result, the list "Alexa, Google Home, and Siri" were accidentally split up. Let's recombine these into a single query.

In [87]:
cleaned_results[23] = [', '.join(cleaned_results[23][:3])] + cleaned_results[23][3:]

In [88]:
cleaned_results[128]

['solar-powered globe string lights with auto on/off',
 '8 different lighting modes for versatile use',
 'IP65 waterproof for outdoor installation',
 'memory function to save mode settings',
 'perfect for patio',
 'garden',
 'Christmas',
 'and weddings']

A similar issue occurred here in the last result. It is also not clear from the last description that these are supposed to be lights, so let's append that as well.

In [89]:
cleaned_results[128] = cleaned_results[128][:4] + ['string lights ' + ', '.join(cleaned_results[128][4:])]

In [90]:
# Looks like only 4 results were generated here, so we will add one of our own
cleaned_results[129] = cleaned_results[129][:3] + ['knobs ' + ', '.join(cleaned_results[129][3:])] + ['handles to open kitchen cupboards']

In [93]:
# Simply an empty value
cleaned_results[246]

['        ']

In [94]:
del cleaned_results[246]

In [100]:
# Decreased index seen above by 1 because we deleted an entry
cleaned_results[283] = cleaned_results[283][:2] + [', '.join(cleaned_results[283][2:3])] + [cleaned_results[283][4] + 'door handle'] + ['door lock keypad with ' + cleaned_results[283][5]]

In [101]:
print(f"Number of results: {len(cleaned_results)}")
print(f"Number of queries per result: {np.unique([len(result) for result in cleaned_results])}")

Number of results: 295
Number of queries per result: [5]


In [106]:
# Add queries to original products
product_samples.loc[:, ['query_1', 'query_2', 'query_3', 'query_4', 'query_5']] = cleaned_results

In [111]:
product_samples.head()

Unnamed: 0,id,title,description,label,query_1,query_2,query_3,query_4,query_5
0,B0BFJL5LD1,VGYVGYCC Outdoor Solar Garden Lights - 2 Pack ...,2022 Newest Version2 pack solar tulip lights a...,12,Large decorative solar lights,Easy to install garden lights,Waterproof outdoor tulip lights,Gift idea for garden decor,High-quality solar tulip lights
1,B0002H49E4,LEATHERMAN - Standard Nylon Sheath with Pocket...,Product Description This nylon belt sheath is ...,55,Sturdy nylon belt sheath,Sheath for Leatherman Wave,Multitool sheath with pockets,Durable vertical carry sheath,Leatherman Wave belt sheath
2,B084GYHQFY,"Makita MAC210Q Quiet Series, 1 HP, 2 Gallon, O...",Compressors are workhorse tools on the job sit...,31,Makita quiet air compressor,Portable electric air compressor,Lightweight 2-gallon compressor,Oil-free electric compressor,Low-noise air compressor
3,B09ZP7M7R1,"Greenclick Landscape Lighting, 3W 12V Extendab...",Safe Low Voltage Landscape LightsLow voltage l...,35,Safe low voltage landscape lights,Extendable garden spotlights,Bright outdoor Christmas lights,Waterproof garden spotlights,Landscape lighting set with transformer
4,B0BRGRNK2H,20 oz Big Gap Filler Insulating Foam Sealant (...,Product Description GREAT STUFF Big Gap Filler...,8,Big gap filler foam sealant,Insulating foam for large gaps,Airtight sealant for drafts,Paintable foam insulation,Weather-tight foam sealant


In [113]:
product_samples.to_csv('data/home_products_with_sample_queries', index=False)