In [1]:
import time
from openai import OpenAI

In [2]:
# Google Colab Drive setup (if needed)
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import re
import json

In [4]:
products_df = pd.read_csv("labeled_musical_instruments.csv")

In [5]:
products_df.head()

Unnamed: 0,id,title,description,label
0,B0B89ZSYS7,Shure SM7B Vocal Dynamic Microphone for Broadc...,The SM7B dynamic microphone has a smooth flat ...,0
1,B0891SBM1L,"Fender Classic Series Case Stand, 3-Guitar, Black",Product Description Fender Instrument Case Sta...,28
2,B006O64JMY,PylePro Full Size Electric Guitar Package w/ A...,Product Description Get rocking with this begi...,1
3,B0B8M5FJ9W,D'Addario Accessories Locking Guitar Strap - G...,Product Description Our patented design Planet...,11
4,B0BRYDVCK2,Vondynote Studio Monitor Stands Pair Heavy Dut...,Adjustable Tilt AngleThe top plate can be adju...,43


In [6]:
product_samples = products_df.groupby('label').sample(n=5, random_state=234).reset_index(drop=True)
product_samples.head(n=12)

Unnamed: 0,id,title,description,label
0,B0002D081Q,Ahead Drummer Gloves Large Pair,Professional Drummers Gloves by AHEAD feature ...,-1
1,B000Z7567A,Yamaha MG82CX 8 Input Stereo Mixer with Digita...,The MCG82CX is a smallbutpowerful 8input stere...,-1
2,B074CCYJ2P,Ibanez GRG 7 String Solid-Body Electric Guitar...,Product Description Patterned after the legend...,-1
3,B0CBK1WSMR,SWIFF High-Grade Electronic Guitar Wireless Sy...,HighGrade adopts the latest wireless UHF frequ...,-1
4,B01878M5JK,Focusrite Scarlett Solo Compact (1st Gen) USB ...,Scarlett Solo is a compact USB audio interface...,-1
5,B01HTMA69Q,Neewer NW-800 Microphone Set Including (1) NW-...,NOTE 1The Condenser Microphone only works with...,0
6,B0BR2VFB5M,Sennheiser MD 46 cardioid interview microphone,Product Description Omni microphones tend to p...,0
7,B00BXTHTHE,Audio-Technica AT2020USB Cardioid Condenser US...,Product Description The new AT2020 USB offers ...,0
8,B0BSCGXCVP,Marantz Professional MPM-1000 - Studio Recordi...,The MPM1000 is perfect for podcasting applicat...,0
9,B081CCY3PT,"XIIVIO USB Microphone, Computer Microphone for...",USB Microphone Easy To StallUSB microphone for...,0


In [7]:
product_samples = product_samples.sample(frac=1).reset_index(drop=True)
product_samples.head()

Unnamed: 0,id,title,description,label
0,B074CCYJ2P,Ibanez GRG 7 String Solid-Body Electric Guitar...,Product Description Patterned after the legend...,-1
1,B081CCY3PT,"XIIVIO USB Microphone, Computer Microphone for...",USB Microphone Easy To StallUSB microphone for...,0
2,B07W17441F,Mendini By Cecilio Violin - MV500+92D - Size 4...,ELEGANT DESIGN As beautiful as most stringed m...,36
3,B09F8T38PY,YONHAN 2 Pack Karaoke Microphone for Kids Fun ...,4in1 karaoke Bluetooth microphone These porta...,16
4,B07PPQGTWK,Kuyal Keyboard Bag Electric Piano Padded Case ...,Introductions Here is an easytouse Electronic ...,40


In [8]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [11]:
from dotenv import load_dotenv, find_dotenv
_=load_dotenv(find_dotenv())

In [12]:
openai_api_key = os.environ['OPENAI_API_KEY']

In [28]:
class QueryGenerator:
    def __init__(self, api_key):
        """Initialize the generator with API key and standard prompts"""
        self.client = OpenAI(api_key=api_key)

        # Store the system and example prompts
        self.system_prompt = """You are an expert in e-commerce search optimization. Given a product description, generate a list of 5 realistic search queries a customer might use to find the product, even if the customer does not know the exact name of the product they are looking for.

        Guidelines:
        - Keep queries short (2-10 words)
        - Avoid exact repetition of product specs; focus on what a customer would type
        - Include a mix of general, descriptive, and feature-focused queries
        - Consider different use cases for each product (e.g. headphones can be used for listening as well as noise cancellation).
        - Use casual, natural language
        - Make sure the queries are unique for each product, DO NOT repeat queries for other products
        - Return ONLY the list of 5 queries enclosed in [] and separated by ',' with no additional text"""


        self.example_prompt = """Here are some examples:

        Product Description: "Product Description Fender Instrument Case Stands are a great way to display and protect multiple instruments This case stand looks like a traditional case but easily turns into a stageworthy guitar stand Crafted with roadreliable materials this 3ply hardshell wooden case boasts a vinylwrapped steel carry handle and steel latches The soft crushedacrylic plush interior lining ensures your guitars remain scratch and damagefree From the Manufacturer Fender Instrument Case Stands are a great way to display and protect multiple instruments This case stand looks like a traditional case but easily turns into a stageworthy guitar stand Crafted with roadreliable materials this 3ply hardshell wooden case boasts a vinylwrapped steel carry handle and steel latches The soft crushedacrylic plush interior lining ensures your guitars remain scratch and damagefree Tweed or black vinylwrapped 3ply hardshell wooden case Vinylwrapped steelcore carry handle Crushedacrylic plush interior lining Steel latches Can hold up to 3 instruments"

        Queries:
        [convertible guitar case, leather guitar case, guitar shaped box, case to keep and cary guitar, guitar plus other instrument carrier]

        Product Description: Music Classroom Supplies 10 music classroom posters include Pop Rock Country Blues Folk Jazz Classical Disco Hip Hop and Reggae The best music poster you can select for music room decor Expanded Learning Each music genre poster includes the common sound instruments used origins and creators Teach kids about different cultures and historical events through todays diverse music genres Help jump start your students musical journey with these education supplies for teachers Vibrant Decor These vibrant visual aids are the ideal music teaching supplies or music bulletin board decorations Encourage music appreciation by displaying these bright and eyecatching teaching decorations Fun Activities Use these music education posters to aid in interactive learning Turn up the music and transform your classroom into an entertaining learning environment Have students name famous musicians of each genre and era or create a band using the information provided on the posters Size Each music education poster is 13x17 inches with a gloss lamination and flat packed for easy wall hanging and framing HandsOn Learning These music teaching posters are the ideal introduction to various musical genres throughout time Use dryerase markers to draw on the posters and encourage experiential learning Introduce more sub genres as children grow and gain a sound knowledge of the first ten genres Continue Learning Looking for additional elementary music posters Check out our Hubble Bubble Kids page for more music classroom decor"

        Queries:
        [music posters, band pictures to hang, music related art, music classroom decorations, different genres on posters]
        """

    def generate_single(self, product_description):
        """Generate queries for a single product description"""
        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": self.example_prompt},
                    {"role": "user", "content": f"Now generate queries for this product description: {product_description}"}
                ],
                temperature=0.7,
                max_tokens=50
            )

            res = response.choices[0].message.content
            return res.replace('[', '').replace(']', '').split(', ')

        except Exception as e:
            print(f"Error generating queries: {e}")
            return []

    def generate_batch(self, product_descriptions, batch_size: int = 5):
        """
        Generate queries for multiple products efficiently.
        Uses batching and includes rate limiting.
        """
        batch_results = []

        # Process in batches
        for i in range(0, len(product_descriptions), batch_size):
            batch = product_descriptions[i:i + batch_size]

            # Create a single prompt for the batch
            batch_prompt = "Generate queries for each of these products. For each product, start with 'Product X:' where X is the product number:\n\n"
            for idx, desc in enumerate(batch, 1):
                batch_prompt += f"Product {idx}: {desc}\n\n"

            try:
                response = self.client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": self.system_prompt},
                        {"role": "user", "content": self.example_prompt},
                        {"role": "user", "content": batch_prompt}
                    ],
                    temperature=0.7,
                    max_tokens=50 * len(batch)  # Adjust tokens based on batch size
                )

                # Parse the batch response
                response_text = response.choices[0].message.content
                response_list = re.sub("Product \d: ", "", response_text).replace('[', '').replace(']', '').split('\n')
                batch_results.extend([text.split(', ') for text in response_list])

                # Rate limiting - wait 1 second between batches
                if i + batch_size < len(product_descriptions):
                    time.sleep(5)

            except Exception as e:
                print(f"Error processing batch starting at index {i}: {e}")
                continue

        return batch_results

In [22]:
generator = QueryGenerator(openai_api_key)

In [23]:
# Number of products for which we are generating queries
product_samples['description'].shape

(340,)

In [34]:
# Create example queries for all descriptions
results = generator.generate_batch(product_samples['description'])

In [35]:
len(results)

411

In [36]:
[result for result in results if len(result[0]) <= 0]

[[''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 ['']]

In [37]:
cleaned_results = [result for result in results if len(result[0]) > 0]
len(cleaned_results)

340

In [47]:
np.unique([len(result) for result in cleaned_results])

array([5, 6])

In [39]:
# Indices of results that don't have 5 queries
inspect_index = [i for i in range(len(cleaned_results)) if len(cleaned_results[i]) != 5]
inspect_index

[119, 240]

In [40]:
[cleaned_results[i] for i in inspect_index]

[['Drum practice pad set',
  '2-sided drum pad',
  'drumsticks and bag included',
  'silent drum practice',
  'shock absorption and anti-skid',
  'high-density rubber surface'],
 ['finger jingles for percussion',
  'add bright jingle sound',
  'MEINL Finger Jingles for cajon',
  'djembe accessories',
  'crystal clear tambourine effect',
  'aluminum jingles for hand drums']]

In [48]:
cleaned_results[119]

['Drum practice pad set',
 '2-sided drum pad',
 'drumsticks and bag included',
 'silent drum practice',
 'shock absorption and anti-skid',
 'high-density rubber surface']

In [49]:
cleaned_results[240] = cleaned_results[240][:4] + cleaned_results[240][5:]
cleaned_results[119] = cleaned_results[119][:2] + cleaned_results[119][3:]

In [50]:
print(f"Number of results: {len(cleaned_results)}")
print(f"Number of queries per result: {np.unique([len(result) for result in cleaned_results])}")

Number of results: 340
Number of queries per result: [5]


In [51]:
# Add queries to original products
product_samples.loc[:, ['query_1', 'query_2', 'query_3', 'query_4', 'query_5']] = cleaned_results

In [52]:
product_samples.head()

Unnamed: 0,id,title,description,label,query_1,query_2,query_3,query_4,query_5
0,B074CCYJ2P,Ibanez GRG 7 String Solid-Body Electric Guitar...,Product Description Patterned after the legend...,-1,electric guitar high gloss finish,metal monster guitar,7-string guitar jumbo frets,Ibanez guitar for metal rockers,affordable high-speed guitar
1,B081CCY3PT,"XIIVIO USB Microphone, Computer Microphone for...",USB Microphone Easy To StallUSB microphone for...,0,USB microphone for computer,clear sound quality mic,USB mic accessories,vocal USB microphone,microphone for singing and chatting
2,B07W17441F,Mendini By Cecilio Violin - MV500+92D - Size 4...,ELEGANT DESIGN As beautiful as most stringed m...,36,hand-carved violin for adults,violin learning kit,violin size for beginners,delicate violin fiddle kit,violin with accessories
3,B09F8T38PY,YONHAN 2 Pack Karaoke Microphone for Kids Fun ...,4in1 karaoke Bluetooth microphone These porta...,16,4-in-1 karaoke Bluetooth microphone,portable wireless microphone,microphone speaker recorder,voice changer microphone,karaoke machine with voice change
4,B07PPQGTWK,Kuyal Keyboard Bag Electric Piano Padded Case ...,Introductions Here is an easytouse Electronic ...,40,electronic keyboard bag,durable keyboard bag,fashionable keyboard case,soft fabric keyboard case,ergonomic keyboard bag handle


In [53]:
product_samples.to_csv('musical_instruments_with_sample_queries', index=False)