In [None]:
import os
import re
import time
import numpy as np
import pandas as pd
from openai import OpenAI

In [None]:
products_df = pd.read_csv("labeled_pet_products.csv")

In [None]:
products_df.head()

Unnamed: 0.1,Unnamed: 0,title,rating_number,features,description,combined_description,label
0,0,Pawprints Pet Memorial Frame with Pawprints Le...,2950,['Beautiful dog memorial frame with a sentimen...,[],Beautiful dog memorial frame with a sentimenta...,38
1,1,PetSafe Exterior / Interior Cat Door: Staywell...,2341,['PET SIZE: The large flap opening is perfect ...,['The PetSafe Staywell Big Cat/Small Dog Pet D...,The PetSafe Staywell Big CatSmall Dog Pet Door...,100
2,2,ASPCA Cat House & Cat Scratcher w/Bonus Catnip...,9248,['Cardboard cat house 2 in 1 with scratching b...,['Give kitty a relaxing place to play and hide...,Give kitty a relaxing place to play and hide w...,67
3,3,"BUUOC Dog Training Collar with Remote 2000Ft, ...",2208,['【3 Modes Train Dogs Like A Pro】 3 Safe effec...,[],3 Modes Train Dogs Like A Pro 3 Safe effective...,18
4,4,KOOLTAIL Basic Dog Hoodie - Soft and Warm Dog ...,5164,['Size:The Size of the dog sweater is L Neck G...,[],SizeThe Size of the dog sweater is L Neck Girt...,114


In [None]:
# Select rows with non-empty descriptions
non_empty_descriptions_df = products_df[products_df['description'].str.strip().ne('')]

products_df = non_empty_descriptions_df


Number of items: 12262


In [None]:
# We will sample 3 products per group and create sample queries for each of them
product_samples = products_df.groupby('label', group_keys=False).apply(lambda x: x.sample(n=min(len(x), 3), random_state=234)).reset_index(drop=True)


  product_samples = products_df.groupby('label', group_keys=False).apply(lambda x: x.sample(n=min(len(x), 3), random_state=234)).reset_index(drop=True)


In [None]:
# Shuffle the products so that different groups are passed together
product_samples = product_samples.sample(frac=1).reset_index(drop=True)

In [None]:
generator = QueryGenerator(openai_api_key)

In [None]:
class QueryGenerator:
    def __init__(self, api_key):
        """Initialize the generator with API key and standard prompts"""
        self.client = OpenAI(api_key=api_key)

        # Store the system and example prompts
        self.system_prompt = """You are an expert in e-commerce search optimization. Given a product description, generate a list of 5 realistic search queries a customer might use to find the product, even if the customer does not know the exact name of the product they are looking for.

        Guidelines:
        - Keep queries short (2-10 words)
        - Avoid exact repetition of product specs; focus on what a customer would type
        - Include a mix of general, descriptive, and feature-focused queries
        - Consider different use cases for each product (e.g. baking soda can be used for cooking, cleaning, and removing bad odors).
        - Use casual, natural language
        - Return ONLY the list of 5 queries enclosed in [] and separated by ',' with no additional text"""

        self.example_prompt = """Here are some examples:

        Product Description: "Go ahead turn the bag around and look at our ingredients PureBites are made with only 1 ingredient 100 Pure USA Sourced  Made Turkey Breast Dogs love the taste of PureBites because our treats are freeze dried RAW to lock in the aroma texture and freshness they crave Dog parents love PureBites because our treats are 100 pure and rich in nutrients for a happy and healthy life Go ahead turn the bag around and look at the ingredients Purebites chicken Jerky are made with only 1 ingredient Dogs love the taste of Purebites because our treats are gently dried to lock in the aroma Our products are ideal for dogs with health issues or pets that are overweight"


        Queries:
        [healty dog food, weight loss dog food, organic and natural dog food, raw completely natural dog food]

        Product Description: "About the Bird Nest Hammock This bird nest gives your lovely parrots or birds a warm and comfortable hut to rest Your bird or pets will snuggle this soft fleece tent with perch to keep warm in winter Soft comfortable sleeping tests can alleviate a birds stress and provide them with a sense of security Your bird will feel safe and warm in the confined space and soft fabric of the hammock Reduce boredom by adding to your birds environmental stimulation Easy and portable to hang with the bilateral hooks also easy to take it off and wash it  About the sizes  Small Measurements LWH 63 x 394 x 472  161012cm  For A pair of tiger skins a peony 24 pearl birds  Medium Measurements LWH 98x 59 x 7  251518cm  For Parakeets Lovebirds Finches Cockatiels Large Measurements LWH 126 x 67 x984  321725cm  For Cockatiels Conures and Amazons Package Included 1 x Bird Net with hook The bird hammock is made of fleece it is very comfortable to rest in it as a bird bed or bird sleeping hut This bird hut gives your lovely parrots or birds a warm and comfortable hut to rest hide play and sleep Soft comfort sleeping tent can alleviate a birds stress and provide them with a sense of security It is very easy and portable to hang with the bilateral hooks Three size availableSmallLWH 63 x 394 x 472suggest for A pair of tiger skins a peony 24 pearl birds Medium98 x 59 x 7 suggest for Parakeets Lovebirds Finches Cockatiels L126 x 67 x984 suggest for Cockatiels Conures and Amazons The warm nest is a best gift for Parrot Macaw Budgies Eclectus Parakeet Cockatiels Cockatoo Lovebird 90 days warranty please feel free to contact us in case there is any problem"

        Queries:
        [I want a bed for my bird, warm and soft bird hut, portable bird home, soft bird cage with hooks, fancy bird hut]
        """

    def generate_single(self, product_description):
        """Generate queries for a single product description"""
        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": self.example_prompt},
                    {"role": "user", "content": f"Now generate queries for this product description: {product_description}"}
                ],
                temperature=0.7,
                max_tokens=50
            )

            res = response.choices[0].message.content
            return res.replace('[', '').replace(']', '').split(', ')

        except Exception as e:
            print(f"Error generating queries: {e}")
            return []

    def generate_batch(self, product_descriptions, batch_size: int = 5):
        """
        Generate queries for multiple products efficiently.
        Uses batching and includes rate limiting.
        """
        batch_results = []

        # Process in batches
        for i in range(0, len(product_descriptions), batch_size):
            batch = product_descriptions[i:i + batch_size]

            # Create a single prompt for the batch
            batch_prompt = "Generate queries for each of these products. For each product, start with 'Product X:' where X is the product number:\n\n"
            for idx, desc in enumerate(batch, 1):
                batch_prompt += f"Product {idx}: {desc}\n\n"

            try:
                response = self.client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": self.system_prompt},
                        {"role": "user", "content": self.example_prompt},
                        {"role": "user", "content": batch_prompt}
                    ],
                    temperature=0.7,
                    max_tokens=50 * len(batch)  # Adjust tokens based on batch size
                )

                # Parse the batch response
                response_text = response.choices[0].message.content
                response_list = re.sub("Product \d: ", "", response_text).replace('[', '').replace(']', '').split('\n')
                batch_results.extend([text.split(', ') for text in response_list])

                # Rate limiting - wait 1 second between batches
                if i + batch_size < len(product_descriptions):
                    time.sleep(5)

            except Exception as e:
                print(f"Error processing batch starting at index {i}: {e}")
                continue

        return batch_results

In [None]:
generator = QueryGenerator(openai_api_key)

In [None]:
# Define a function to identify non-informative descriptions
def is_valid_description(description):
    # Strip leading/trailing spaces, check if it's not empty, and ensure it's not a placeholder like 'Description' or '[]'
    return bool(description.strip()) and description.strip() not in ["Description", "[]", "['']"]

# Filter the dataframe using the is_valid_description function
product_samples = product_samples[product_samples['description'].apply(is_valid_description)]



In [None]:
# Number of products for which we are generating queries
product_samples['description'].shape

(326,)

In [None]:
product_samples.head()

Unnamed: 0.1,Unnamed: 0,title,rating_number,features,description,combined_description,label
0,926,Barkbox Dog Rope Toys - Durable Tug Toys for C...,2224,['Playtime Has Never Been More Fun: Stuffed wi...,['Barkbox Rope Tug and Plush Dog Toy - Black a...,Barkbox Rope Tug and Plush Dog Toy Black and ...,0
3,6642,Fiebing's Pure Neatsfoot Oil Leather Condition...,1605,"['Naturally replaces evaporated oils', 'Preser...","[""Fiebing's 100% Pure Neatsfoot Oil, 32 oz. - ...",Fiebings 100 Pure Neatsfoot Oil 32 oz Natural...,1
4,1123,"Hill's Science Diet Dry Cat Food, Adult 11+, I...",3451,['Specially formulated to fuel the energy need...,"['Product Description', ""Your may have an olde...",Product Description Your may have an older cat...,1
5,12009,Wisdom Panel Breed Discovery Dog DNA Kit: Most...,19707,"['With the world’s most accurate pet DNA test,...","[""Know every detail of your dog's breed mix. W...",Know every detail of your dogs breed mix With ...,1
6,3985,"MarineLand LED Aquarium Hood - 30 x 12 inch, b...",4264,"['SIZE: Fits MOST 30 inches by 12 inches.', 'D...",['Marineland brand is the world’s leading bran...,Marineland brand is the worlds leading brand o...,2


In [None]:
# Create example queries for all descriptions
results = generator.generate_batch(product_samples['description'])

In [None]:
print(f"First few entries in results (to check output structure): {results[:10]}")


First few entries in results (to check output structure): [['dog toy black and blue', 'rope tug plush toy', 'beetle dog toy', 'interactive dog toy', 'durable pet toy'], ['leather preservative', 'natural leather oil', 'neatsfoot oil for leather', 'saddle oil', 'leather conditioner'], ['senior cat food', 'indoor cat diet', 'mature cat nutrition', "Hill's Science Diet 11+ chicken", 'easy digestion cat food'], ['dog breed mix test', 'accurate breed reporting', 'personalized dog care', 'DNA test for dogs', 'breed identification for pets'], ['Marineland LED hood', 'aquarium lighting system', 'freshwater tank light', 'underwater LED effects', 'aquarium illumination'], ['aquarium fish net', 'nylon mesh fish net', 'fish net with plastic handle', 'aquarium maintenance tool', 'skimming fish net'], ['fish tank hiding place', 'aquarium decoration with weighted base', 'underwater decor gift', 'fish tank ornament', 'aquatic hideout'], ['Jackson Galaxy Crinkle Flies', 'realistic butterfly cat toy', 'c

In [None]:
len(results)

369

We have more results than products, so let's go ahead and parse through the results so that we correctly extract the right information.

In [None]:
cleaned_results = [result for result in results if len(result[0]) > 0]
len(cleaned_results)

325

In [None]:
np.unique([len(result) for result in cleaned_results])

array([1, 4, 5, 6])

It looks like we have a variable number of queries in some of the results. Let's examine each of the results that don't have the desired number of queries (5).

In [None]:
# Indices of results that don't have 5 queries
inspect_index = [i for i in range(len(cleaned_results)) if len(cleaned_results[i]) != 5]
inspect_index

[71, 110, 281, 318]

In [None]:
[cleaned_results[i] for i in inspect_index]

[['Paw prints desk keepsake',
  'pet paw print photo frame',
  'clay paw print kit',
  "furry buddy's paw impression",
  'pet safe paw print',
  'paw print photo frame'],
 ['Drinkwell Original Fountain Pump replacement',
  'PetSafe fountain pump backup',
  'compatible PetSafe fountain pump',
  'Drinkwell pet fountain pump'],
 ['No relevant information provided'],
 ["'antibacterial antifungal skin wipes for pets'",
  "'veterinarian recommended skin barrier treatment'",
  "'non-prescription topical brand for pets'",
  "'Douxo skin treatment for dogs and cats',"]]

In [None]:
cleaned_results[318] = cleaned_results[318][:5]

In [None]:
cleaned_results[71] = cleaned_results[71][:5]

In [None]:
cleaned_results[281].append("Drinkwell pump replacement")


In [None]:
cleaned_results[318].append("antifungal topical skin pet")

In [None]:
print(f"Number of results: {len(cleaned_results)}")
print(f"Number of queries per result: {np.unique([len(result) for result in cleaned_results])}")

Number of results: 325
Number of queries per result: [2 4 5]


In [None]:
index = [i for i in range(len(cleaned_results)) if len(cleaned_results[i]) != 5]
print(index)
[cleaned_results[i] for i in index]

[110, 281]


[['Drinkwell Original Fountain Pump replacement',
  'PetSafe fountain pump backup',
  'compatible PetSafe fountain pump',
  'Drinkwell pet fountain pump'],
 ['No relevant information provided', 'Drinkwell pump replacement']]

In [None]:
# Add "Drinkwell pump replacement" to index 110
cleaned_results[110].append("Drinkwell pump replacement")

# Delete index 281
del cleaned_results[281]


In [None]:
print(f"Number of results: {len(cleaned_results)}")
print(f"Number of queries per result: {np.unique([len(result) for result in cleaned_results])}")

Number of results: 324
Number of queries per result: [5]


In [None]:
product_samples2 = product_samples.iloc[:len(cleaned_results)]


In [None]:
product_samples2.loc[:, ['query_1', 'query_2', 'query_3', 'query_4', 'query_5']] = cleaned_results


In [None]:
product_samples2.head()

Unnamed: 0.1,Unnamed: 0,title,rating_number,features,description,combined_description,label,query_1,query_2,query_3,query_4,query_5
0,926,Barkbox Dog Rope Toys - Durable Tug Toys for C...,2224,['Playtime Has Never Been More Fun: Stuffed wi...,['Barkbox Rope Tug and Plush Dog Toy - Black a...,Barkbox Rope Tug and Plush Dog Toy Black and ...,0,dog toy black and blue,rope tug plush toy,beetle dog toy,interactive dog toy,durable pet toy
3,6642,Fiebing's Pure Neatsfoot Oil Leather Condition...,1605,"['Naturally replaces evaporated oils', 'Preser...","[""Fiebing's 100% Pure Neatsfoot Oil, 32 oz. - ...",Fiebings 100 Pure Neatsfoot Oil 32 oz Natural...,1,leather preservative,natural leather oil,neatsfoot oil for leather,saddle oil,leather conditioner
4,1123,"Hill's Science Diet Dry Cat Food, Adult 11+, I...",3451,['Specially formulated to fuel the energy need...,"['Product Description', ""Your may have an olde...",Product Description Your may have an older cat...,1,senior cat food,indoor cat diet,mature cat nutrition,Hill's Science Diet 11+ chicken,easy digestion cat food
5,12009,Wisdom Panel Breed Discovery Dog DNA Kit: Most...,19707,"['With the world’s most accurate pet DNA test,...","[""Know every detail of your dog's breed mix. W...",Know every detail of your dogs breed mix With ...,1,dog breed mix test,accurate breed reporting,personalized dog care,DNA test for dogs,breed identification for pets
6,3985,"MarineLand LED Aquarium Hood - 30 x 12 inch, b...",4264,"['SIZE: Fits MOST 30 inches by 12 inches.', 'D...",['Marineland brand is the world’s leading bran...,Marineland brand is the worlds leading brand o...,2,Marineland LED hood,aquarium lighting system,freshwater tank light,underwater LED effects,aquarium illumination


In [None]:
product_samples2.to_csv('pet_products_sample_queries.csv', index=False)

Performed post-processing after to guarantee match.