In [1]:
# imports
import pandas as pd
import pickle
import os
import openai
import numpy as np
from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)
from tenacity import retry, wait_random_exponential, stop_after_attempt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Put the directory two levels above the working directory on sys.path so that the
# project-level `constants` module can be imported from this notebook.
import sys
sys.path.append('../../')
from constants import (
    RANDOM_STATE, 
    OPENAI_API_KEY,
    EMBEDDING_MODEL
)
# Hand the key loaded from `constants` to the openai client (module-level setting,
# so it applies to every later openai call in this kernel).
openai.api_key = OPENAI_API_KEY

# Absolute path of the data directory. It is derived by resolving a file path inside
# that directory against the working directory and dropping the file name; the file
# itself is never opened.
_data_anchor = os.path.abspath("../../data/amazon-beauty/parse_and_clean_meta_data.ipynb")
current_dir = os.path.split(_data_anchor)[0]

# Same derivation for the directory that holds the embedding model artefacts.
_model_anchor = os.path.abspath("../../models/embedding/parse_and_clean_meta_data.ipynb")
embedding_model_current_dir = os.path.split(_model_anchor)[0]

print(f"current directory: {current_dir}")
print(f"embedding model current directory: {embedding_model_current_dir}")

# Number of rows shown whenever the notebook previews the dataset.
n_examples = 5

current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty
embedding model current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/models/embedding


In [2]:
# Path of the merged data CSV inside the data directory; `dataset_path` is a second
# name for the same path and is the one handed to read_csv below.
data_path = os.path.join(current_dir, 'large_merged_data.csv')
dataset_path = data_path
print(f'data path: {data_path}')

# Read the whole CSV into a DataFrame.
# NOTE(review): the provenance of large_merged_data.csv is not recorded in this
# notebook — TODO document where the file comes from and how it was produced.
df = pd.read_csv(dataset_path)

# Preview the first `n_examples` rows as the cell output.
df.head(n_examples)

data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/large_merged_data.csv


Unnamed: 0,rating,reviewerID,asin,reviewText,summary,category,title
0,1.0,A2RYSCZOPEXOCQ,9790787006,"I use a lot of perfume, I go through a new bot...",This is not going to be my favorite scent.,[],Jenna Jameson Heartbreaker Perfume for women 3...
1,5.0,A141OPVE376YFI,B000050B65,"First, a little background. I've switched bet...","Finally, a razor that lives up to the ads",[],Norelco 6885XL Deluxe Quadra Action Cord/Cordl...
2,5.0,A141OPVE376YFI,B000050B65,"First, a little background. I've switched bet...","Finally, a razor that lives up to the ads",[],Norelco 6885XL Deluxe Quadra Action Cord/Cordl...
3,5.0,A1TVTDKNMSQ7XU,B000050B6B,I've had many Norelco razors in my 50 years of...,Just like new.....,[],Philips Norelco HQ5 Shaving Heads
4,5.0,A1TVTDKNMSQ7XU,B000050B6B,I've had many Norelco razors in my 50 years of...,Just like new.....,[],Philips Norelco HQ5 Shaving Heads


In [3]:
# Show the title, full review text and rating of the first `n_examples` rows,
# one field per line, with an empty line before each record.
preview_rows = df.head(n_examples)
for title, review, rating in zip(
    preview_rows["title"], preview_rows["reviewText"], preview_rows["rating"]
):
    print("")
    print(f"Title: {title}")
    print(f"Review: {review}")
    print(f"Rating: {rating}")


Title: Jenna Jameson Heartbreaker Perfume for women 3.4 oz Eau De Parfum Spray
Review: I use a lot of perfume, I go through a new bottle every couple of weeks, and I never bought the same scent twice. I`m still looking for my favorite scent. This is not going to be it. I`m going to use it, but definitely not purchase again. Someone else may like it, it just does not lure my scents. Bottle is a very pretty, red glass, and it smells classier, than the name suggests.

I got this for evaluation, 50% off, in order that I might provide this review.

* I originally gave this 3 stars, but as I tried to use this further, I found the smell being just completely wrong. I asked three friends separately, what they think of it, and each one said, that this smells manly. And that was the exact word I was looking for. First two girls did not want to take this from me for free, the third one took it, but stated that really just a little at a time can be used. I downgraded this to 1 star, as it was in 

# Build cache to save embeddings (OpenAI API)

+ Save our embeddings so we can re-use them later.
+ The cache is a dictionary that maps tuples of `(text, model)` to an embedding, which is a list of floats. The cache is saved as a Python pickle file.
+ The embedded vectors are a numerical representation of the input text's meaning, capturing both its inherent semantics and its context within the provided input. 
+ OpenAI embeddings are normalized to length 1, which means that:
    + Cosine similarity can be computed slightly faster using just a dot product
    + Cosine similarity and Euclidean distance will result in identical rankings
+ How the model aggregates its input into a single embedding vector is not documented


In [4]:
# Embedding cache: a dict keyed by (text, model) whose values are embeddings.
# It lives in a pickle file next to the dataset so embeddings already fetched
# are not requested again in later sessions.
embedding_cache_path = os.path.join(current_dir, 'amazon_embeddings_cache.pkl')

# Start from the cache on disk when there is one, otherwise from an empty dict.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}

# Write the cache straight back, which also creates the file on a first run.
with open(embedding_cache_path, "wb") as cache_file:
    pickle.dump(embedding_cache, cache_file)

# Retrieve an embedding from the cache when present, otherwise request it via the API
def embedding_from_string(
    string: str,
    model: str = EMBEDDING_MODEL,
    embedding_cache=embedding_cache
) -> list:
    """Return the embedding of `string`, using a cache to avoid recomputing.

    Args:
        string: Text to embed.
        model: Embedding model name; together with `string` it forms the cache key.
        embedding_cache: Dict mapping (text, model) to an embedding. Defaults to the
            notebook-level cache, which is updated in place on a cache miss.

    Returns:
        The cached or newly fetched embedding for `(string, model)`.

    Side effects:
        On a cache miss the embedding is requested through `get_embedding` and the
        whole cache is re-pickled to `embedding_cache_path`.
    """
    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        # Dump to a sibling temp file and swap it into place, so that an interrupted
        # run cannot leave a truncated pickle that fails to load next session.
        tmp_cache_path = f"{embedding_cache_path}.tmp"
        with open(tmp_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
        os.replace(tmp_cache_path, embedding_cache_path)
    return embedding_cache[cache_key]

In [5]:
# Worked example: embed the title of the first row in the dataset.
example_string = df["title"].iloc[0]
print(f"\nExample string: {example_string}")

# Fetch the embedding (through the cache) and show only its first 10 components.
example_embedding = embedding_from_string(example_string)
print(f"\nExample embedding: {example_embedding[:10]}...")


Example string: Jenna Jameson Heartbreaker Perfume for women 3.4 oz Eau De Parfum Spray

Example embedding: [-0.018199129030108452, 0.007970279082655907, -0.027546744793653488, -0.020601309835910797, 0.006847520358860493, 0.015914447605609894, -0.008583879098296165, -0.018721342086791992, -0.011932571418583393, -0.014726411551237106]...


# Recommend similar products based on embeddings

+ Get the embeddings of all the product titles
+ Calculate the distance between a source title and all other products
+ Print out the other products closest to the source title

In [7]:
def print_recommendations_from_strings(
    strings: list[str],
    index_of_source_string: int,
    k_nearest_neighbors: int = 1,
    model=EMBEDDING_MODEL,
) -> list[int]:
    """Print the k nearest distinct neighbors of one string among `strings`.

    Args:
        strings: Candidate strings; every one is embedded via `embedding_from_string`.
        index_of_source_string: Position in `strings` of the query string.
        k_nearest_neighbors: Maximum number of recommendations to print.
        model: Embedding model name passed through to `embedding_from_string`.

    Returns:
        The indices into `strings` ordered by increasing cosine distance to the
        source, exactly as produced by `indices_of_nearest_neighbors_from_distances`
        (this includes the source itself and any duplicates).
    """
    # get embeddings for all strings
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # get the embedding of the source string
    query_embedding = embeddings[index_of_source_string]
    # get distances between the source embedding and other embeddings (function from embeddings_utils.py)
    distances = distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine")
    # get indices of nearest neighbors (function from embeddings_utils.py)
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    # print out source string
    query_string = strings[index_of_source_string]
    print(f"Source string: {query_string}")

    # Strings that must not be printed (again). Seeded with the source so exact copies
    # of it are skipped; each printed string is added so that repeated entries in
    # `strings` cannot fill several of the k slots with the same text.
    already_shown = {query_string}
    k_counter = 0
    for i in indices_of_nearest_neighbors:
        # stop after printing out k products
        if k_counter >= k_nearest_neighbors:
            break
        if strings[i] in already_shown:
            continue
        already_shown.add(strings[i])
        k_counter += 1

        # print out the similar strings and their distances
        print(
            f"""
        --- Recommendation #{k_counter} (nearest neighbor {k_counter} of {k_nearest_neighbors}) ---
        String: {strings[i]}
        Distance: {distances[i]:0.3f}"""
        )

    return indices_of_nearest_neighbors

# One title per dataset row, in row order (repeated titles are kept as-is).
product_titles = df["title"].tolist()

# Print the 5 titles closest to the first title in the list and keep the returned
# neighbor ordering.
# NOTE(review): the name `tony_blair_products` does not describe product data —
# presumably inherited from a template; consider renaming once no other cell uses it.
tony_blair_products = print_recommendations_from_strings(
    k_nearest_neighbors=5,      # how many similar products to print
    index_of_source_string=0,   # query with the first title in the list
    strings=product_titles,     # similarity is based on the product title
)

In [None]:
# calculate RMSE and MAE manually
def calculate_rmse_and_mae(actual_ratings, predicted_ratings):
    """Compute root-mean-squared error and mean absolute error between two rating sequences.

    Args:
        actual_ratings: Iterable of true numeric ratings.
        predicted_ratings: Iterable of predicted numeric ratings, aligned with
            `actual_ratings` element by element.

    Returns:
        A `(rmse, mae)` tuple.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    # Materialise first so lengths can be compared and one-shot iterables still work.
    actual = list(actual_ratings)
    predicted = list(predicted_ratings)

    # zip() would silently drop the unmatched tail, giving a metric over fewer pairs.
    if len(actual) != len(predicted):
        raise ValueError(
            f"actual_ratings and predicted_ratings differ in length: "
            f"{len(actual)} != {len(predicted)}"
        )
    if not actual:
        raise ValueError("cannot compute RMSE/MAE for empty inputs")

    errors = [a - p for a, p in zip(actual, predicted)]
    n_pairs = len(errors)

    # RMSE: square root of the mean squared error
    rmse = (sum(err ** 2 for err in errors) / n_pairs) ** 0.5

    # MAE: mean of the absolute errors
    mae = sum(abs(err) for err in errors) / n_pairs

    return rmse, mae

# Regression using embedding

+ Obtain embeddings for each product title and for each unique user ID.
+ For each data point, concatenate the title embedding with the user embedding to form a combined feature vector.
+ Split the dataset into training and test sets.
+ Train the model on the combined embeddings and predict the test set.
+ Evaluate using RMSE and MAE metrics.

In [None]:
%%time

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embeddings(texts: list[str], model="text-embedding-ada-002") -> list[list[float]]:
    """Embed a batch of texts with a single API request.

    The call is retried with random exponential backoff (1-20 s waits, at most
    6 attempts) by the `retry` decorator.

    Args:
        texts: Strings to embed in one request.
        model: Name of the embedding model to use.

    Returns:
        One embedding per input text, in the same order as `texts`.
    """
    response = openai.Embedding.create(input=texts, model=model)
    # Each returned item carries the position of its input in `index`; sort on it so
    # the output order matches `texts` rather than depending on the response order.
    items = sorted(response["data"], key=lambda item: item["index"])
    return [item["embedding"] for item in items]

# Number of texts sent per embedding request. The embeddings endpoint limits how many
# inputs one request may carry (2048 per the API reference — TODO confirm for the
# model in use), so both titles and user IDs are sent in batches.
batch_size = 100

# Embed the title of every row, in row order
titles = df['title'].tolist()
title_embeddings = []
for start in range(0, len(titles), batch_size):
    title_embeddings.extend(get_embeddings(titles[start:start + batch_size]))
assert len(title_embeddings) == len(df), "expected one title embedding per row"

# Embed each distinct reviewer ID once (batched as well, so a large number of users
# does not end up in a single oversized request)
unique_users = df['reviewerID'].unique().tolist()
user_embeddings = []
for start in range(0, len(unique_users), batch_size):
    user_embeddings.extend(get_embeddings(unique_users[start:start + batch_size]))
user_embeddings_dict = dict(zip(unique_users, user_embeddings))

# Combined feature vector per row: title embedding followed by the reviewer's embedding
# (list concatenation). Rows are paired by position, so this does not depend on the
# DataFrame's index labels.
combined_embeddings = [
    title_embedding + user_embeddings_dict[reviewer_id]
    for title_embedding, reviewer_id in zip(title_embeddings, df['reviewerID'])
]

X_openai = np.array(combined_embeddings)

# Hold out 20% of the rows for testing; the project-wide seed makes the split repeatable
X_train_openai, X_test_openai, y_train, y_test = train_test_split(
    X_openai, df['rating'], test_size=0.2, random_state=RANDOM_STATE
)

# Train RandomForest on OpenAI embeddings (seeded so the reported metrics can be reproduced)
rfr_openai = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
rfr_openai.fit(X_train_openai, y_train)

# Predict and evaluate
preds_openai = rfr_openai.predict(X_test_openai)
rmse_openai = np.sqrt(mean_squared_error(y_test, preds_openai))  # RMSE = sqrt(MSE)
mae_openai = mean_absolute_error(y_test, preds_openai)

print(f"OpenAI embedding performance: rmse={rmse_openai:.4f}, mae={mae_openai:.4f}")

OpenAI embedding performance: rmse=1.59, mae=1.13
CPU times: user 370 ms, sys: 13.5 ms, total: 383 ms
Wall time: 3.7 s


# References

+ https://cookbook.openai.com/examples/recommendation_using_embeddings
+ https://github.com/openai/openai-python/blob/main/openai/embeddings_utils.py
+ https://help.openai.com/en/products/6824809-embeddings-frequently-asked-questions
+ https://platform.openai.com/docs/guides/embeddings/use-cases