In [1]:
import os
import re
import time
from dotenv import load_dotenv
import numpy as np
import pandas as pd
from openai import OpenAI

In [2]:
products_df = pd.read_csv("labeled_cds_vinyl_products.csv")

In [3]:
products_df.head()

Unnamed: 0,id,title,description,label
0,B073LP9GRN,Urban Hymns,urban hymns is the 3rd studio album by english...,60
1,B097CFN5W4,Metallica Remastered,the black albumis one of the most commercially...,6
2,B00YB9BL7W,In Through the out Door,on its release in the summer of 1979 in throug...,58
3,B008L1PYQU,Destroyer: Resurrected,resurrected newly remixed from the original ma...,36
4,B000007WOP,Urban Hymns,urban hymns is the third studio album by engli...,50


In [4]:
label_counts = products_df['label'].value_counts()
label_counts.tail()

label
68    14
43    11
13     7
4      4
7      1
Name: count, dtype: int64

In [5]:
small_clusters = label_counts[label_counts < 3].index
filtered_products_df = products_df[~products_df['label'].isin(small_clusters)]
print(filtered_products_df['label'].value_counts().tail())  # last few labels after filtering, checks that label 7 is gone

label
35    15
68    14
43    11
13     7
4      4
Name: count, dtype: int64


In [6]:
# We will sample 4 products per group and create sample queries for each of them (around 300 products is the goal)
product_samples = filtered_products_df.groupby('label').sample(n=4, random_state=160).reset_index(drop=True)
product_samples.head(n=12)

Unnamed: 0,id,title,description,label
0,B00382X4X2,Mob Rules,digitally remastered and expanded deluxe two c...,0
1,B006OAB6EY,Innuendo,digitally remastered deluxe two cd edition of ...,0
2,B004ZNA5JO,A Momentary Lapse Of Reason,from 1987 a momentary lapse of reason is the f...,0
3,B0039TD7PY,Live At The Troubadour,note the product is a cd and a dvd and will ha...,0
4,B000X640OS,Legend: The Best Of Bob Marley And The Wailers...,audio cassette legend the best of bob marley a...,1
5,B09BNM5DC8,Love For Sale,celebrating 10 years since they first recorded...,1
6,B00006O0PT,Surrealistic Pillow,feed your head with the rare mono edition of t...,1
7,B00ZW048DI,Nathaniel Rateliff & The Night Sweats,nathaniel rateliff the night sweats practical...,1
8,B08P87XTFG,25,the third studio album from worldwide phenomen...,2
9,B08T43FGYJ,Chief Blue,eric churchs platinum certified album which wa...,2


In [7]:
# Shuffle the products so that different groups are passed together
product_samples = product_samples.sample(frac=1).reset_index(drop=True)
product_samples.head()

Unnamed: 0,id,title,description,label
0,B00BV9RZSO,Elephant,doulbe 180gm vinyl lp pressing of this 2003 al...,67
1,B0009ELZAG,The Beach Boys: Pet Sounds,the tracks are 1 wouldnt it be nice 2 you stil...,45
2,B00M889IDM,Eric Clapton & Friends: The Breeze,eric clapton has often stated that jj cale is ...,54
3,B00NQKWA6S,The Endless River,2014 release the 15th and final studio album f...,3
4,B08HGPZ1Q5,American Beauty,the crown jewel of the deads studio output fea...,63


In [8]:
load_dotenv()
openai_api_key = os.environ['OPENAI_API_KEY']

In [9]:
# List of target IDs
target_ids = ["B07FZ99HXL", "B084XTMZVS", "B075JKQT4N"]

# Filter DataFrame to only include rows with matching IDs and select the 'description' column
descriptions = products_df.loc[products_df['id'].isin(target_ids), ['id', 'title', 'description']]

# Print the full descriptions
for index, row in descriptions.iterrows():
    print(f"Title: {row['title']}")
    print(f"Description: {row['description']}")

Title: ASTROWORLD       Explicit Lyrics
Description: double vinyl lp pressing including digital download astroworld is the third studio album by rapper travis scott it was released on august 3 2018 by cactus jack records epic records and grand hustle records the album follows his second studio album birds in the trap sing mcknight 2016 and his collaborative album huncho jack jack huncho 2017 with quavo the album features guest vocals from kid cudi frank ocean drake the weeknd james blake swae lee gunna nav 21 savage quavo takeoff juice wrld sheck wes and don toliver among others production was handled by multiple producers including mike dean allen ritter hitboy wondagurl tay keith tame impala frank dukes sonny digital and thundercat
Title: After Hours       Explicit Lyrics
Description: the weeknds latest chapter via xorepublic records after hours features global hits like heartless and blinding lights filtering rb pop and hiphop through an ambitious widescreen lens quietly took over p

In [10]:
class QueryGenerator:
    def __init__(self, api_key):
        """Initialize the generator with API key and standard prompts"""
        self.client = OpenAI(api_key=api_key)
        
        # Store the system and example prompts
        self.system_prompt = """You are an expert in e-commerce search optimization. Given a product description, generate a list of 5 realistic search queries a customer might use to find the product, even if the customer does not know the exact name of the product they are looking for.

        Guidelines:
        - Keep queries short (2-10 words)
        - Avoid exact repetition of product specs; focus on what a customer would type
        - Include a mix of general, descriptive, and feature-focused queries
        - Consider different use cases for each product (e.g. baking soda can be used for cooking, cleaning, and removing bad odors).
        - Use casual, natural language
        - Return ONLY the list of 5 queries enclosed in [] and separated by ',' with no additional text"""

        self.example_prompt = """Here are some examples:

        Product Description: "double vinyl lp pressing including digital download astroworld is the third studio album by rapper travis scott it was released on august 3 2018 by cactus jack records epic records and grand hustle records the album follows his second studio album birds in the trap sing mcknight 2016 and his collaborative album huncho jack jack huncho 2017 with quavo the album features guest vocals from kid cudi frank ocean drake the weeknd james blake swae lee gunna nav 21 savage quavo takeoff juice wrld sheck wes and don toliver among others production was handled by multiple producers including mike dean allen ritter hitboy wondagurl tay keith tame impala frank dukes sonny digital and thundercat"
       
        Queries:
        ["travis scott vinyl album", "astroworld double lp", "hip hop records with drake", "popular rap albums on vinyl", "travis scott 2018 album"]

        Product Description: "the weeknds latest chapter via xorepublic records after hours features global hits like heartless and blinding lights filtering rb pop and hiphop through an ambitious widescreen lens quietly took over popular music and culture on his own terms as a result the multiplatinum twotime grammyr award winner has emerged as one of the most successful and significant artists of the 21st century double lp set housed in gatefold jacket packaging"
        
        Queries:
        ["the weeknd after hours vinyl", "blinding lights album record", "rb vinyl albums 2020", "double lp the weeknd", "popular weeknd albums on vinyl"]

        Product Description: "limited ocard packaging 2017 holiday release christmas with elvis and the royal philharmonic orchestra once again pairs the king of rock and roll with englands most beloved orchestra for a new symphonic take on elvis timeless christmas favorites with arrangements recorded at abbey road studios in london includes blue christmas here comes santa claus right down santa claus lane merry christmas baby silent night and more this deluxe edition also includes newlyorchestrated version of the four gospel hymns from elvis peace in the valley ep including therell be peace in the valley for me and take my hand precious lord"

        Queries:
        ["elvis christmas album vinyl", "holiday records with orchestra", "classic christmas music lp", "elvis presley festive vinyl", "remastered christmas records"]
        """

    def generate_single(self, product_description):
        """Generate queries for a single product description"""
        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": self.example_prompt},
                    {"role": "user", "content": f"Now generate queries for this product description: {product_description}"}
                ],
                temperature=0.7,
                max_tokens=50
            )
            
            res = response.choices[0].message.content
            return res.replace('[', '').replace(']', '').split(', ')
            
        except Exception as e:
            print(f"Error generating queries: {e}")
            return []

    def generate_batch(self, product_descriptions, batch_size: int = 5):
        """
        Generate queries for multiple products efficiently.
        Uses batching and includes rate limiting.
        """
        batch_results = []
        
        # Process in batches
        for i in range(0, len(product_descriptions), batch_size):
            batch = product_descriptions[i:i + batch_size]
            
            # Create a single prompt for the batch
            batch_prompt = "Generate queries for each of these products. For each product, start with 'Product X:' where X is the product number:\n\n"
            for idx, desc in enumerate(batch, 1):
                batch_prompt += f"Product {idx}: {desc}\n\n"
            
            try:
                response = self.client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": self.system_prompt},
                        {"role": "user", "content": self.example_prompt},
                        {"role": "user", "content": batch_prompt}
                    ],
                    temperature=0.7,
                    max_tokens=50 * len(batch)  # Adjust tokens based on batch size
                )
                
                # Parse the batch response
                response_text = response.choices[0].message.content
                response_list = re.sub("Product \d: ", "", response_text).replace('[', '').replace(']', '').split('\n')
                batch_results.extend([text.split(', ') for text in response_list])
                                
                # Rate limiting - wait 1 second between batches
                if i + batch_size < len(product_descriptions):
                    time.sleep(5)
                    
            except Exception as e:
                print(f"Error processing batch starting at index {i}: {e}")
                continue
        
        return batch_results

In [11]:
generator = QueryGenerator(openai_api_key)

In [12]:
# Number of products for which we are generating queries
product_samples['description'].shape

(276,)

In [13]:
single_result = generator.generate_single(product_samples.loc[0, 'description'])

In [14]:
single_result # analyze the results

['"white stripes vinyl lp"',
 '"garage rock revival albums"',
 '"2003 rock music records"',
 '"jack white band on vinyl"',
 '"low-fi garage rock music"']

In [15]:
# create example queries for all descriptions
results = generator.generate_batch(product_samples['description'])

In [16]:
print(len(results))
print(results)

414
[[' "white stripes vinyl LP 2003"', '"garage rock revival music"', '"analog recording equipment bands"', '"Detroit indie music albums"', '"raw simplicity music composition" '], [''], [' "Pet Sounds bonus tracks vinyl"', '"Beach Boys album 1966"', '"Brian Wilson music compositions"', '"California pop music LP"', '"melodic pop rock records" '], [''], [' "Eric Clapton tribute album JJ Cale"', '"rock history appreciation music"', '"famous musicians collaborations LP"', '"JJ Cale covers by artists"', '"1972 single call me the breeze" '], [''], [' "David Gilmour final studio album"', '"British rock band unreleased tracks"', '"Pink Floyd leftovers project"', '"veteran rock musicians LP"', '"coproduced by Phil Manzanera" '], [''], [' "Grateful Dead crown jewel LP"', '"50th-anniversary remastered vinyl"', '"psychedelic rock classics"', '"180-gram vinyl reissue"', '"tracklist box of rain" '], ['"Willie Nelson 2020 album"', '"celebratory country album 2020"', '"new songs by Willie Nelson"', '

We have more results than products, so let's go ahead and parse through the results so that we correctly extract the right information.

In [17]:
[result for result in results if len(result[0]) <= 0]

[[''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 ['']]

The above results are just empty strings, so we can filter them out

In [18]:
cleaned_results = [result for result in results if len(result[0]) > 0]
len(cleaned_results)

281

We now only have one more result than we need.

In [19]:
np.unique([len(result) for result in cleaned_results])

array([1, 4, 5])

It looks like we have a variable number of queries in some of the results. Let's examine each of the results that don't have the desired number of queries (5).

In [20]:
# Indices of results that don't have 5 queries
inspect_index = [i for i in range(len(cleaned_results)) if len(cleaned_results[i]) != 5]
inspect_index  # shows indices of problematic entries

[75, 77, 79, 81, 83, 89]

In [21]:
[cleaned_results[i] for i in inspect_index]


[['Product 1:'],
 ['Product 2:'],
 ['Product 3:'],
 ['Product 4:'],
 ['Product 5:'],
 ['"supertramp crime of the century vinyl"',
  '"limited edition grey vinyl lp"',
  '"supertramp 1974 album remastered"',
  '"supertr']]

Remove the 'Product X: '

In [30]:
cleaned_results = [result for result in cleaned_results if not (len(result) == 1 and result[0].startswith("Product"))]
len(cleaned_results)

276

In [31]:
inspect_index = [i for i in range(len(cleaned_results)) if len(cleaned_results[i]) != 5]
print(inspect_index) 

[84]


In [32]:
[cleaned_results[i] for i in inspect_index]

[['"supertramp crime of the century vinyl"',
  '"limited edition grey vinyl lp"',
  '"supertramp 1974 album remastered"',
  '"supertr']]

In [35]:
cleaned_results[84]

['"supertramp crime of the century vinyl"',
 '"limited edition grey vinyl lp"',
 '"supertramp 1974 album remastered"',
 '"supertr']

In [36]:
extra_query = '"Supertramp progressive rock album"'
cleaned_results[84].append(extra_query)
cleaned_results[84]

['"supertramp crime of the century vinyl"',
 '"limited edition grey vinyl lp"',
 '"supertramp 1974 album remastered"',
 '"supertr',
 '"Supertramp progressive rock album"']

In [37]:
inspect_index = [i for i in range(len(cleaned_results)) if len(cleaned_results[i]) != 5]
print(inspect_index) 

[]


In [38]:
print(f"Number of results: {len(cleaned_results)}")
print(f"Number of queries per result: {np.unique([len(result) for result in cleaned_results])}")

Number of results: 276
Number of queries per result: [5]


In [39]:
# Add queries to original products
product_samples.loc[:, ['query_1', 'query_2', 'query_3', 'query_4', 'query_5']] = cleaned_results

In [42]:
product_samples.head()

Unnamed: 0,id,title,description,label,query_1,query_2,query_3,query_4,query_5
0,B00BV9RZSO,Elephant,doulbe 180gm vinyl lp pressing of this 2003 al...,67,"""white stripes vinyl LP 2003""","""garage rock revival music""","""analog recording equipment bands""","""Detroit indie music albums""","""raw simplicity music composition"""
1,B0009ELZAG,The Beach Boys: Pet Sounds,the tracks are 1 wouldnt it be nice 2 you stil...,45,"""Pet Sounds bonus tracks vinyl""","""Beach Boys album 1966""","""Brian Wilson music compositions""","""California pop music LP""","""melodic pop rock records"""
2,B00M889IDM,Eric Clapton & Friends: The Breeze,eric clapton has often stated that jj cale is ...,54,"""Eric Clapton tribute album JJ Cale""","""rock history appreciation music""","""famous musicians collaborations LP""","""JJ Cale covers by artists""","""1972 single call me the breeze"""
3,B00NQKWA6S,The Endless River,2014 release the 15th and final studio album f...,3,"""David Gilmour final studio album""","""British rock band unreleased tracks""","""Pink Floyd leftovers project""","""veteran rock musicians LP""","""coproduced by Phil Manzanera"""
4,B08HGPZ1Q5,American Beauty,the crown jewel of the deads studio output fea...,63,"""Grateful Dead crown jewel LP""","""50th-anniversary remastered vinyl""","""psychedelic rock classics""","""180-gram vinyl reissue""","""tracklist box of rain"""


In [43]:
product_samples.to_csv('cd_vinyl_products_with_sample_queries', index=False)