In [None]:
%env OPENAI_API_KEY=<PUT_YOUR_API_KEY_HERE>


In [None]:
from openai import OpenAI
import os

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
COMPLETION_MODEL = "gpt-3.5-turbo"


def generate_data_by_prompt(prompt):
    response = client.chat.completions.create(
        model=COMPLETION_MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5,
        max_tokens=2048,
        top_p=1,
    )
    message = response.choices[0].message.content
    return message


prompt = """Please generate 50 product titles for items on Amazon, each about 30 words long, in the category of children's toys. The titles often include some promotional information. One title per line."""
data = generate_data_by_prompt(prompt)


In [None]:
import pandas as pd

product_names = data.strip().split('\n')
df = pd.DataFrame({'product_name': product_names})
df.head()


In [6]:
df.product_name = df.product_name.apply(lambda x: x.split('.')[1].strip())
df.head()


Unnamed: 0,product_name
0,Fun Educational Wooden Puzzle Set for Kids - D...
1,Soft Plush Puppy Dog Toy - Perfect for Kids to...
2,Colorful Building Blocks Set - Encourages Crea...
3,Classic Wooden Pull-Along Toy - Teaches Cause ...
4,Magnetic Drawing Board - Enhances Writing and ...


In [7]:
clothes_prompt = """Please generate 50 product titles for digital technology products on Amazon, each about 30 words long. The titles often include promotional information. List one title per line."""
clothes_data = generate_data_by_prompt(clothes_prompt)
clothes_product_names = clothes_data.strip().split('\n')
clothes_df = pd.DataFrame({'product_name': clothes_product_names})
clothes_df.product_name = clothes_df.product_name.apply(lambda x: x.split('.')[1].strip())
clothes_df.head()


Unnamed: 0,product_name
0,Wireless Bluetooth Headphones – High Quality S...
1,Portable Digital Camera with HD Video Recordin...
2,USB Charging Station – Fast Charging for All Y...
3,"Smart TV Box – Stream Movies, TV Shows, Music ..."
4,Wireless Home Security Camera System – 24/7 Pr...


In [8]:
df = pd.concat([df, clothes_df], axis=0)
df = df.reset_index(drop=True)
display(df)


Unnamed: 0,product_name
0,Fun Educational Wooden Puzzle Set for Kids - D...
1,Soft Plush Puppy Dog Toy - Perfect for Kids to...
2,Colorful Building Blocks Set - Encourages Crea...
3,Classic Wooden Pull-Along Toy - Teaches Cause ...
4,Magnetic Drawing Board - Enhances Writing and ...
...,...
95,Digital Photo Frame – Display Your Favourite P...
96,"Streaming Media Player – Stream Movies, TV Sho..."
97,Wi-Fi Camera – Monitor & Record Your Home or O...
98,Bluetooth Stereo – Enjoy High Quality Audio On...


In [None]:
import numpy as np


def get_embeddings(text, model):
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


In [11]:
import openai, os, backoff

openai.api_key = os.environ.get("OPENAI_API_KEY")
embedding_model = "text-embedding-ada-002"


In [9]:


batch_size = 100

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def get_embeddings_with_backoff(prompts, engine):
    embeddings = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i+batch_size]
        embeddings += get_embeddings(list_of_text=batch, engine=engine)
    return embeddings

prompts = df.product_name.tolist()
prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]

embeddings = []
for batch in prompt_batches:
    batch_embeddings = get_embeddings_with_backoff(prompts=batch, engine=embedding_model)
    embeddings += batch_embeddings

df["embedding"] = embeddings
df.to_parquet("./data/amazon_product_titles.parquet", index=False)


  if _pandas_api.is_sparse(col):


In [10]:
# search through the reviews for a specific product
def search_product(df, query, n=3, pprint=True):
    product_embedding = get_embeddings(
        query,
        engine=embedding_model
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .product_name
    )
    if pprint:
        for r in results:
            print(r)
    return results

results = search_product(df, "Xbox One X LE Bundle - CyberPunk", n=3)


Virtual Reality Headset – Experience the Future of Entertainment
Gaming Headset – Immersive Audio & Comfort-Fit Design
Smart TV Box – Stream Movies, TV Shows, Music & More


In [14]:
def recommend_product(df, product_name, n=3, pprint=True):
    product_embedding = df[df['product_name'] == product_name].iloc[0].embedding
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .product_name
    )
    if pprint:
        for r in results:
            print(r)
    return results

results = recommend_product(df, "Wireless Keyboard & Mouse Set – Perfect for Home & Office Use", n=3)


Wireless Keyboard & Mouse Set – Perfect for Home & Office Use
Bluetooth Keyboard – Type Comfortably & Easily On the Go
Wireless Game Controller – Play Your Favourite Games with Ease


In [15]:
%conda install -c conda-forge faiss


done
Solving environment: | 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - conda-forge/noarch::gradio-client==0.7.0=pyhd8ed1ab_0
  - conda-forge/noarch::jupyterlab_server==2.25.0=pyhd8ed1ab_0
  - conda-forge/noarch::fastapi==0.99.1=pyhd8ed1ab_0
  - conda-forge/noarch::notebook-shim==0.2.3=pyhd8ed1ab_0
  - conda-forge/noarch::jupyter_server==2.10.0=pyhd8ed1ab_0
  - conda-forge/noarch::gradio==3.24.1=pyhd8ed1ab_0
  - conda-forge/noarch::jupyterlab==4.0.8=pyhd8ed1ab_0
  - conda-forge/noarch::starlette==0.27.0=pyhd8ed1ab_0
  - conda-forge/noarch::httpx==0.25.1=pyhd8ed1ab_0
  - conda-forge/noarch::jupyter-lsp==2.2.0=pyhd8ed1ab_0
  - conda-forge/noarch::httpcore==1.0.2=pyhd8ed1ab_0
done


  current version: 23.5.2
  latest version: 23.10.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install c

In [3]:
import pandas as pd

file_path = './data/amazon_product_titles.parquet'

df = pd.read_parquet(file_path)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print(df.head())


                                        product_name  \
0  Fun Educational Wooden Puzzle Set for Kids - D...   
1  Soft Plush Puppy Dog Toy - Perfect for Kids to...   
2  Colorful Building Blocks Set - Encourages Crea...   
3  Classic Wooden Pull-Along Toy - Teaches Cause ...   
4  Magnetic Drawing Board - Enhances Writing and ...   

                                           embedding  
0  [-0.002125626662746072, 0.011036969721317291, ...  
1  [-0.0030388429295271635, 0.009870966896414757,...  
2  [-0.02469971589744091, 0.011756803840398788, -...  
3  [-0.005898129194974899, -0.0008016020874492824...  
4  [-0.026974711567163467, 0.010626200586557388, ...  


In [3]:
import faiss
import numpy as np

def load_embeddings_to_faiss(df):
    embeddings = np.array(df['embedding'].tolist()).astype('float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index

index = load_embeddings_to_faiss(df)


In [12]:
def search_index(index, df, query, k=5):
    query_vector = np.array(get_embeddings(query, engine=embedding_model)).reshape(1, -1).astype('float32')
    distances, indexes = index.search(query_vector, k)

    results = []
    for i in range(len(indexes)):
        product_names = df.iloc[indexes[i]]['product_name'].values.tolist()
        results.append((distances[i], product_names))
    return results

products = search_index(index, df, "Apple iPhone 15 Pro (1 TB) - Blue Titanium", k=3)

for distances, product_names in products:
    for i in range(len(distances)):
        print(product_names[i], distances[i])


USB Microphone – Professional Quality Audio Recording On the Go 0.40481743
Bluetooth Keyboard – Type Comfortably & Easily On the Go 0.41031334
Bluetooth Stereo – Enjoy High Quality Audio On the Go 0.41114482
