In [8]:
# imports
import pandas as pd
import pickle
import os
import openai
import numpy as np
from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)
from tenacity import retry, wait_random_exponential, stop_after_attempt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Add the path to the constants file to the system path
import sys
sys.path.append('../../')
from constants import (
    RANDOM_STATE, 
    OPENAI_API_KEY,
    N_ESTIMATORS,
    EMBEDDING_MODEL
)
# OpenAI API Key
openai.api_key = OPENAI_API_KEY

# Get the current directory of the notebook
current_dir = os.path.dirname(os.path.abspath("../../data/amazon-beauty/parse_and_clean_meta_data.ipynb"))
# # Get the current directory of the notebook
embedding_model_current_dir = os.path.dirname(os.path.abspath("../../models/embedding/parse_and_clean_meta_data.ipynb"))
print(f"current directory: {current_dir}")
print(f"embedding model current directory: {embedding_model_current_dir}")

n_examples = 5

current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty
embedding model current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/models/embedding


In [10]:
# Construct the path to data file
data_path = os.path.join(current_dir, 'merged_data.csv')
print(f'data path: {data_path}')
# load data (full dataset available at http://groups.di.unipi.it/~gulli/AG_corpus_of_news_products.html)
dataset_path = data_path
df = pd.read_csv(dataset_path)

df.head(n_examples)

data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/merged_data.csv


Unnamed: 0,rating,reviewerID,asin,reviewText,summary,title
0,5.0,ANV9L0JU6BNL,B000052YAN,best floss i've used. does not break as easily...,best floss i've used,Reach Dentotape Waxed Dental Floss with Extra ...
1,5.0,ANV9L0JU6BNL,B000052YAN,best floss i've used. does not break as easily...,best floss i've used,Reach Dentotape Waxed Dental Floss with Extra ...
2,2.0,A2TU781PWGS09X,B00006L9LC,Doesnt smell,Two Stars,Citre Shine Moisture Burst Shampoo - 16 fl oz
3,2.0,A2TU781PWGS09X,B00006L9LC,Doesnt smell,Two Stars,Citre Shine Moisture Burst Shampoo - 16 fl oz
4,5.0,A3A8F2URN7MEPR,B00006L9LC,My favorite powder!,Five Stars,Citre Shine Moisture Burst Shampoo - 16 fl oz


In [11]:
# print the title, reviewText, and rating of each example
for idx, row in df.head(n_examples).iterrows():
    print("")
    print(f"Title: {row['title']}")
    print(f"Review: {row['reviewText']}")
    print(f"Rating: {row['rating']}")


Title: Reach Dentotape Waxed Dental Floss with Extra Wide Cleaning Surface for Large Spaces between Teeth, Unflavored, 100 Yards
Review: best floss i've used. does not break as easily as others, and i have tight teeth.
Rating: 5.0

Title: Reach Dentotape Waxed Dental Floss with Extra Wide Cleaning Surface for Large Spaces between Teeth, Unflavored, 100 Yards
Review: best floss i've used. does not break as easily as others, and i have tight teeth.
Rating: 5.0

Title: Citre Shine Moisture Burst Shampoo - 16 fl oz
Review: Doesnt smell
Rating: 2.0

Title: Citre Shine Moisture Burst Shampoo - 16 fl oz
Review: Doesnt smell
Rating: 2.0

Title: Citre Shine Moisture Burst Shampoo - 16 fl oz
Review: My favorite powder!
Rating: 5.0


# Build cache to save embeddings

+ Save our embeddings so we can re-use them later.
+ The cache is a dictionary that maps tuples of `(text, model)` to an embedding, which is a list of floats. The cache is saved as a Python pickle file.
+ The embedded vectors are a numerical representation of the input text's meaning, capturing both its inherent semantics and its context within the provided input. 
+ OpenAI embeddings are normalized to length 1, which means that:
    + Cosine similarity can be computed slightly faster using just a dot product
    + Cosine similarity and Euclidean distance will result in the identical rankings
+ Aggregation process of embedding is not documented


In [12]:
# establish a cache of embeddings to avoid recomputing
# cache is a dict of tuples (text, model) -> embedding, saved as a pickle file

# set path to embedding cache
# Construct the path to data file
embedding_cache_path = os.path.join(embedding_model_current_dir, 'amazon_embeddings_cache.pkl')

# load the cache if it exists, and save a copy to disk
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(
    string: str,
    model: str = EMBEDDING_MODEL,
    embedding_cache=embedding_cache
) -> list:
    """Return embedding of given string, using a cache to avoid recomputing."""
    if (string, model) not in embedding_cache.keys():
        embedding_cache[(string, model)] = get_embedding(string, model)
        with open(embedding_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
    return embedding_cache[(string, model)]

In [13]:
# as an example, take the first title from the dataset
example_string = df["title"].values[0]
print(f"\nExample string: {example_string}")

# print the first 10 dimensions of the embedding
example_embedding = embedding_from_string(example_string)
print(f"\nExample embedding: {example_embedding[:10]}...")


Example string: Reach Dentotape Waxed Dental Floss with Extra Wide Cleaning Surface for Large Spaces between Teeth, Unflavored, 100 Yards

Example embedding: [0.008420577272772789, 0.004031274002045393, 0.006344694644212723, -0.005752569064497948, 0.0015130186220631003, 0.003907340578734875, 0.001457076519727707, -0.006193220615386963, -0.007332718465477228, 0.0009880235884338617]...


# Recommend similar products based on embeddings

+ Get the similarity embeddings of all the product title
+ Calculate the distance between a source title and all other products
+ Print out the other products closest to the source title

In [14]:
def print_recommendations_from_strings(
    strings: list[str],
    index_of_source_string: int,
    k_nearest_neighbors: int = 1,
    model=EMBEDDING_MODEL,
) -> list[int]:
    """Print out the k nearest neighbors of a given string."""
    # get embeddings for all strings
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # get the embedding of the source string
    query_embedding = embeddings[index_of_source_string]
    # get distances between the source embedding and other embeddings (function from embeddings_utils.py)
    distances = distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine")
    # get indices of nearest neighbors (function from embeddings_utils.py)
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    # print out source string
    query_string = strings[index_of_source_string]
    print(f"Source string: {query_string}")
    # print out its k nearest neighbors
    k_counter = 0
    for i in indices_of_nearest_neighbors:
        # skip any strings that are identical matches to the starting string
        if query_string == strings[i]:
            continue
        # stop after printing out k products
        if k_counter >= k_nearest_neighbors:
            break
        k_counter += 1

        # print out the similar strings and their distances
        print(
            f"""
        --- Recommendation #{k_counter} (nearest neighbor {k_counter} of {k_nearest_neighbors}) ---
        String: {strings[i]}
        Distance: {distances[i]:0.3f}"""
        )

    return indices_of_nearest_neighbors

product_titles = df["title"].tolist()

tony_blair_products = print_recommendations_from_strings(
    strings=product_titles,  # let's base similarity off of the product title
    index_of_source_string=0,  # let's look at products similar to the first one about
    k_nearest_neighbors=5,  # let's look at the 5 most similar products
)

Source string: Reach Dentotape Waxed Dental Floss with Extra Wide Cleaning Surface for Large Spaces between Teeth, Unflavored, 100 Yards

        --- Recommendation #1 (nearest neighbor 1 of 5) ---
        String: Astra Platinum Double Edge Safety Razor Blades ,100 Blades (20 x 5)
        Distance: 0.204

        --- Recommendation #2 (nearest neighbor 2 of 5) ---
        String: Zapzyt Maximum Strength 10% Benzoyl Peroxide Acne Treatment Gel, 1 Ounce
        Distance: 0.215

        --- Recommendation #3 (nearest neighbor 3 of 5) ---
        String: Zapzyt Maximum Strength 10% Benzoyl Peroxide Acne Treatment Gel, 1 Ounce
        Distance: 0.215

        --- Recommendation #4 (nearest neighbor 4 of 5) ---
        String: Avalon Grapefruit and Geranium Smoothing Shampoo, 11 Ounce
        Distance: 0.233

        --- Recommendation #5 (nearest neighbor 5 of 5) ---
        String: Avalon Grapefruit and Geranium Smoothing Shampoo, 11 Ounce
        Distance: 0.233


In [15]:
# calculate RMSE and MAE manually
def calculate_rmse_and_mae(actual_ratings, predicted_ratings):
    differences = [actual - predicted for actual, predicted in zip(actual_ratings, predicted_ratings)]
    
    # RMSE
    squared_differences = [diff ** 2 for diff in differences]
    mean_squared_difference = sum(squared_differences) / len(squared_differences)
    rmse = mean_squared_difference ** 0.5

    # MAE
    absolute_differences = [abs(diff) for diff in differences]
    mae = sum(absolute_differences) / len(absolute_differences)

    return rmse, mae

# Slit train and test set

In [18]:
%%time

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embeddings(texts: list[str], model=EMBEDDING_MODEL) -> list[list[float]]:
    return [item["embedding"] for item in openai.Embedding.create(input=texts, model=model)["data"]]

# Get embeddings for titles in batches
batch_size = 100
title_embeddings = []

for i in range(0, len(df['title']), batch_size):
    batch_texts = df['title'].iloc[i:i+batch_size].tolist()
    title_embeddings.extend(get_embeddings(batch_texts))

# Get embeddings for unique users
unique_users = df['reviewerID'].unique().tolist()
user_embeddings_dict = {}
user_embeddings = get_embeddings(unique_users)
for user, embedding in zip(unique_users, user_embeddings):
    user_embeddings_dict[user] = embedding

# Create combined embeddings: title_embedding + user_embedding
combined_embeddings = []
for idx, row in df.iterrows():
    combined_embedding = title_embeddings[idx] + user_embeddings_dict[row['reviewerID']]
    combined_embeddings.append(combined_embedding)

X_openai = np.array(combined_embeddings)

# Splitting the dataset
X_train_openai, X_test_openai, y_train, y_test = train_test_split(X_openai, df['rating'], test_size=0.2, random_state=42)

CPU times: user 23.4 ms, sys: 4.21 ms, total: 27.6 ms
Wall time: 1.97 s


In [23]:
# Create a new column to store embeddings
df['embedding'] = combined_embeddings
embeddings_csv_path = os.path.join(current_dir, 'merged_data_with_embeddings.csv')
df.to_csv(embeddings_csv_path, index=False)
print(f"Embeddings saved to {embeddings_csv_path}")

Embeddings saved to /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/merged_data_with_embeddings.csv


# Regression using embedding

+ Obtain embeddings for each unique user ID.
+ For each data point, concatenate the title embedding with the user embedding to form a combined feature vector.
+ Split the dataset into training and test sets.
+ Train the model on the combined embeddings and predict the test set.
+ Evaluate using RMSE and MAE metrics.

In [19]:
%%time

# Train RandomForest on OpenAI embeddings
rfr_openai = RandomForestRegressor(n_estimators=N_ESTIMATORS)
rfr_openai.fit(X_train_openai, y_train)

# Predict and evaluate
preds_openai = rfr_openai.predict(X_test_openai)
rmse_openai = np.sqrt(mean_squared_error(y_test, preds_openai))  # Calculating RMSE
mae_openai = mean_absolute_error(y_test, preds_openai)

print(f"OpenAI embedding performance: rmse={rmse_openai:.2f}, mae={mae_openai:.2f}")

OpenAI embedding performance: rmse=1.60, mae=1.14
CPU times: user 40 ms, sys: 2.61 ms, total: 42.6 ms
Wall time: 46.1 ms


# Classification using the embedding features

In [20]:
%%time

# load embeddings from disk embedding_cache_path = os.path.join(embedding_model_current_dir, 'amazon_embeddings_cache.pkl')
with open(embedding_cache_path, "rb") as embedding_cache_file:
    embedding_cache = pickle.load(embedding_cache_file)


CPU times: user 1.54 ms, sys: 1.47 ms, total: 3.01 ms
Wall time: 3.34 ms


In [22]:
%%time
#  RandomForest for classification:

# Train RandomForest on OpenAI embeddings
rfr_openai = RandomForestClassifier(n_estimators=N_ESTIMATORS)
rfr_openai.fit(X_train_openai, y_train)

# Predict and evaluate
preds_openai = rfr_openai.predict(X_test_openai)
print(classification_report(y_test, preds_openai))


              precision    recall  f1-score   support

         2.0       1.00      0.50      0.67         2
         3.0       0.00      0.00      0.00         2
         4.0       0.00      0.00      0.00         1
         5.0       0.33      1.00      0.50         2

    accuracy                           0.43         7
   macro avg       0.33      0.38      0.29         7
weighted avg       0.38      0.43      0.33         7

CPU times: user 18.9 ms, sys: 2.39 ms, total: 21.3 ms
Wall time: 23.2 ms


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# References

+ https://cookbook.openai.com/examples/recommendation_using_embeddings
+ https://github.com/openai/openai-python/blob/main/openai/embeddings_utils.py
+ https://help.openai.com/en/products/6824809-embeddings-frequently-asked-questions