In [18]:

# imports
import pandas as pd
import os
import tiktoken
import openai
from dotenv import load_dotenv

from openai.embeddings_utils import get_embedding

In [19]:
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

ModuleNotFoundError: No module named 'plotly'

In [50]:
!pip install python-dotenv

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple, https://pypi.ngc.nvidia.com
Collecting python-dotenv
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/44/2f/62ea1c8b593f4e093cc1a7768f0d46112107e790c3e478532329e434f00b/python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0


In [8]:
!pip install matplotlib scipy scikit-learn

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple, https://pypi.ngc.nvidia.com
Collecting scipy
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/ec/e3/b06ac3738bf365e89710205a471abe7dceec672a51c244b469bc5d1291c7/scipy-1.10.1-cp310-cp310-win_amd64.whl (42.5 MB)
                                              0.0/42.5 MB ? eta -:--:--
                                              0.1/42.5 MB 2.2 MB/s eta 0:00:20
                                              0.2/42.5 MB 2.0 MB/s eta 0:00:22
                                              0.4/42.5 MB 2.5 MB/s eta 0:00:17
                                              0.5/42.5 MB 2.7 MB/s eta 0:00:16
                                              0.6/42.5 MB 2.7 MB/s eta 0:00:16
                                              0.8/42.5 MB 3.0 MB/s eta 0:00:14
                                              1.0/42.5 MB 3.2 MB/s eta 0:00:13
     -                                        1.2/42.5 MB 3.4 MB/s eta 0:00:13
     -          

In [10]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

In [11]:
# load & inspect dataset
input_datapath = "data/Reviews.csv"  # to save space, we provide a pre-filtered dataset
df = pd.read_csv(input_datapath, index_col=0)
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()
df["combined"] = (
    "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)
df.head(2)

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [12]:

# subsample to 1k most recent reviews and remove samples that are too long
top_n = 1000
df = df.sort_values("Time").tail(top_n * 2)  # first cut to first 2k entries, assuming less than half will be filtered out
df.drop("Time", axis=1, inplace=True)

encoding = tiktoken.get_encoding(embedding_encoding)

# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)
len(df)

1000

In [20]:
# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage

# This may take a few minutes
df["embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))
df.to_csv("data/fine_food_reviews_with_embeddings_1k.csv")

In [21]:
response = openai.Embedding.create(
    input="My cat loves it",
    model="text-embedding-ada-002"
)
embeddings = response['data'][0]['embedding']

In [27]:
df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, query_embedding))

In [23]:
query_embedding = embeddings

In [25]:
from openai.embeddings_utils import cosine_similarity

In [28]:
df["similarity"]

Id
284932    0.729518
220697    0.740588
107908    0.730831
107800    0.734972
205313    0.779441
            ...   
7178      0.765406
401972    0.781411
462088    0.761610
267549    0.738286
542497    0.753610
Name: similarity, Length: 1000, dtype: float64

In [29]:
df_similarity = df.sort_values(by="similarity", ascending=False)

In [33]:
df_similarity.head()

Unnamed: 0_level_0,ProductId,UserId,Score,Summary,Text,combined,n_tokens,embedding,similarity
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
454773,B003194PBC,A2FSDQY5AI6TNX,5,My furbabies LOVE these!,Shake the container and they come running. Eve...,Title: My furbabies LOVE these!; Content: Shak...,47,"[-0.009749102406203747, -0.0068712360225617886...",0.858803
271218,B0009ET7TC,A2FSDQY5AI6TNX,5,My furbabies LOVE these!,Shake the container and they come running. Eve...,Title: My furbabies LOVE these!; Content: Shak...,47,"[-0.009749102406203747, -0.0068712360225617886...",0.858803
281914,B002OVO5EK,A2FSDQY5AI6TNX,5,My furbabies LOVE these!,Shake the container and they come running. Eve...,Title: My furbabies LOVE these!; Content: Shak...,47,"[-0.009692925959825516, -0.006840511690825224,...",0.858724
336872,B0012KB4U2,AGQBI6601XH2R,5,Both cats love these!,They only like this brand and flavor of treat....,Title: Both cats love these!; Content: They on...,69,"[-0.01054252777248621, -0.018018875271081924, ...",0.852078
194114,B003J9HAU2,A23WYVBCNE75X1,3,it's alright,My kitten prefers Kitten Chow or Iams. He lea...,Title: it's alright; Content: My kitten prefer...,35,"[0.002302631502971053, 0.010298383422195911, 0...",0.847203


In [49]:
df_similarity.iloc[0]

ProductId                                            B003194PBC
UserId                                           A2FSDQY5AI6TNX
Score                                                         5
Summary                                My furbabies LOVE these!
Text          Shake the container and they come running. Eve...
combined      Title: My furbabies LOVE these!; Content: Shak...
n_tokens                                                     47
embedding     [-0.009749102406203747, -0.0068712360225617886...
similarity                                             0.858803
Name: 454773, dtype: object

In [51]:
from dotenv import load_dotenv

In [52]:
import os

In [55]:
print(os.getenv("OPENAI_API_KEY"))

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


In [54]:
load_dotenv()

True

In [None]:
from openai.embeddings_utils import cosine_similarity