In [4]:
# imports
import pandas as pd
import pickle
import os
import openai

from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)
# Add the path to the constants file to the system path
import sys
sys.path.append('../../')
from constants import RANDOM_STATE, OPENAI_API_KEY
# Configure the OpenAI client with the key imported from the project-level
# `constants` module.
# NOTE(review): `constants` is not visible from this notebook — confirm it reads
# the key from the environment (or is git-ignored) rather than hard-coding it.
openai.api_key = OPENAI_API_KEY
# Name of the embedding model; used as the default `model` argument of
# `embedding_from_string` and as part of every cache key.
EMBEDDING_MODEL = "text-embedding-ada-002"

# Absolute paths of the project folders used below, resolved relative to the
# kernel's current working directory.
# `current_dir` is the Amazon Beauty data folder (it is NOT the notebook's own
# directory, despite the name); `embedding_model_current_dir` is the folder for
# embedding models.
current_dir = os.path.abspath("../../data/amazon-beauty")
embedding_model_current_dir = os.path.abspath("../../models/embedding")
print(
    f"current directory: {current_dir}",
    f"embedding model current directory: {embedding_model_current_dir}",
    sep="\n",
)

# Number of rows shown when previewing the dataset.
n_examples = 5

current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty
embedding model current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/models/embedding


In [2]:
# Locate the merged review data inside the Amazon Beauty data folder and load it.
# `data_path` and `dataset_path` are two names for the same file path.
# NOTE(review): the preview below shows an "Unnamed: 0" column, so the CSV
# appears to contain a saved index — confirm whether it should be dropped.
dataset_path = data_path = os.path.join(current_dir, 'merged_data.csv')
print(f'data path: {data_path}')
df = pd.read_csv(dataset_path)

# Preview the first `n_examples` rows.
df.head(n_examples)

data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/merged_data.csv


Unnamed: 0,rating,reviewerID,asin,reviewText,summary,title
0,5.0,ANV9L0JU6BNL,B000052YAN,best floss i've used. does not break as easily...,best floss i've used,Reach Dentotape Waxed Dental Floss with Extra ...
1,5.0,ANV9L0JU6BNL,B000052YAN,best floss i've used. does not break as easily...,best floss i've used,Reach Dentotape Waxed Dental Floss with Extra ...
2,2.0,A2TU781PWGS09X,B00006L9LC,Doesnt smell,Two Stars,Citre Shine Moisture Burst Shampoo - 16 fl oz
3,2.0,A2TU781PWGS09X,B00006L9LC,Doesnt smell,Two Stars,Citre Shine Moisture Burst Shampoo - 16 fl oz
4,5.0,A3A8F2URN7MEPR,B00006L9LC,My favorite powder!,Five Stars,Citre Shine Moisture Burst Shampoo - 16 fl oz


In [3]:
# Show the product title, review text and rating of the first `n_examples` rows,
# one blank line before each record.
preview_columns = ["title", "reviewText", "rating"]
for title, review, rating in df[preview_columns].head(n_examples).itertuples(index=False, name=None):
    print("")
    print(f"Title: {title}")
    print(f"Review: {review}")
    print(f"Rating: {rating}")


Title: Reach Dentotape Waxed Dental Floss with Extra Wide Cleaning Surface for Large Spaces between Teeth, Unflavored, 100 Yards
Review: best floss i've used. does not break as easily as others, and i have tight teeth.
Rating: 5.0

Title: Reach Dentotape Waxed Dental Floss with Extra Wide Cleaning Surface for Large Spaces between Teeth, Unflavored, 100 Yards
Review: best floss i've used. does not break as easily as others, and i have tight teeth.
Rating: 5.0

Title: Citre Shine Moisture Burst Shampoo - 16 fl oz
Review: Doesnt smell
Rating: 2.0

Title: Citre Shine Moisture Burst Shampoo - 16 fl oz
Review: Doesnt smell
Rating: 2.0

Title: Citre Shine Moisture Burst Shampoo - 16 fl oz
Review: My favorite powder!
Rating: 5.0


# Build cache to save embeddings

+ Save our embeddings so we can re-use them later.
+ The cache is a dictionary that maps tuples of `(text, model)` to an embedding, which is a list of floats. The cache is saved as a Python pickle file.



In [5]:
# establish a cache of embeddings to avoid recomputing
# cache is a dict of tuples (text, model) -> embedding, saved as a pickle file

# Path of the on-disk embedding cache: a pickle file stored in the data folder.
embedding_cache_path = os.path.join(current_dir, 'amazon_embeddings_cache.pkl')

# Load the cache if the file exists; otherwise start with an empty cache and
# create the file so later code can rely on it being present.
# The cache is read with the same `pickle` module that writes it, and an
# existing file is no longer rewritten on every run of this cell.
# NOTE: unpickling can execute arbitrary code — only load a cache file that was
# produced by this notebook, never one obtained from an untrusted source.
try:
    with open(embedding_cache_path, "rb") as embedding_cache_file:
        embedding_cache = pickle.load(embedding_cache_file)
except FileNotFoundError:
    embedding_cache = {}
    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)

# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(
    string: str,
    model: str = EMBEDDING_MODEL,
    embedding_cache=embedding_cache
) -> list:
    """Return the embedding of ``string``, using a cache to avoid recomputing.

    Args:
        string: Text to embed.
        model: Name of the embedding model; defaults to ``EMBEDDING_MODEL``.
        embedding_cache: Dict mapping ``(text, model)`` tuples to embeddings.
            The default is the module-level cache as it existed when this
            function was defined; the dict is updated in place on a miss.

    Returns:
        The embedding stored in the cache for ``(string, model)``.

    Side effects:
        On a cache miss the embedding is obtained via ``get_embedding`` and the
        whole cache is pickled to ``embedding_cache_path`` (always that path,
        even when a different ``embedding_cache`` dict is passed in).
    """
    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        # Write to a temporary file first and then swap it into place, so an
        # interrupted kernel cannot leave a truncated, unreadable cache file.
        temp_cache_path = f"{embedding_cache_path}.tmp"
        with open(temp_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
        os.replace(temp_cache_path, embedding_cache_path)
    return embedding_cache[cache_key]

In [7]:
# Demonstrate the cached lookup on a single item: the first product title in
# the dataset.
example_string = df["title"].iloc[0]
print(f"\nExample string: {example_string}")

# Fetch (or reuse) its embedding and show only the first 10 dimensions.
example_embedding = embedding_from_string(example_string)
print(f"\nExample embedding: {example_embedding[:10]}...")


Example string: Reach Dentotape Waxed Dental Floss with Extra Wide Cleaning Surface for Large Spaces between Teeth, Unflavored, 100 Yards

Example embedding: [0.008420577272772789, 0.004031274002045393, 0.006344694644212723, -0.005752569064497948, 0.0015130186220631003, 0.003907340578734875, 0.001457076519727707, -0.006193220615386963, -0.007332718465477228, 0.0009880235884338617]...


# References

+ https://cookbook.openai.com/examples/recommendation_using_embeddings