In [None]:
# needed installs for embedding
! pip install pandas
! pip install tiktoken
! pip install openai
! pip install matplotlib
! pip install plotly
! pip install scipy
! pip install scikit-learn
! pip install python-dotenv


In [None]:
#imports
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding
from dotenv import load_dotenv
import os
import openai

In [None]:
# Embedding configuration shared by the cells below.

# Token budget per review; kept below the 8191-token maximum of text-embedding-ada-002.
max_tokens = 8000

# Tokenizer encoding that matches text-embedding-ada-002.
embedding_encoding = "cl100k_base"

# Model used to compute the review embeddings.
embedding_model = "text-embedding-ada-002"

In [36]:
# Load and inspect dataset
input_datapath = "C:/Users/Raj/repos/openai-stackhack-2023/playground/redis/reviews_1k.csv"
df = pd.read_csv(input_datapath, index_col=0)

# Keep only the columns used in this notebook.
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]

# dropna() returns a new frame, so the result must be assigned back.
# Otherwise rows with a missing Summary/Text survive, their "combined" value
# becomes NaN, and the tokenizer in the next cell fails on a non-string input.
df = df.dropna()

# Single text field per review: this is what gets tokenized and embedded later.
df["combined"] = ("Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip())
df.head(2)

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [37]:
# Keep only the most recent reviews and discard any that exceed the token budget.
# top_n was originally 1000; it is reduced to 25 for testing because of the
# 60 requests/minute rate limit hit when embedding.
top_n = 25

# Sort by time and keep the newest 2 * top_n rows as candidates, on the assumption
# that fewer than half of them are dropped by the length filter below.
# "Time" is only needed for the ordering, so it is removed right after.
df = df.sort_values("Time").tail(top_n * 2).drop(columns="Time")

encoding = tiktoken.get_encoding(embedding_encoding)

# Token count of each combined review text under the embedding model's encoding.
df["n_tokens"] = [len(encoding.encode(text)) for text in df.combined]

# Drop reviews over the budget, then keep the top_n most recent of what remains.
within_budget = df["n_tokens"] <= max_tokens
df = df[within_budget].tail(top_n)
len(df)

25

### 2. Get embeddings and save them for future reuse

In [38]:
# The API key must be available as OPENAI_API_KEY, see the README:
# https://github.com/openai/openai-python#usage
# load_dotenv() is called first so a key kept in a local .env file is picked up.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Embed every review's combined text (one get_embedding call per row) and store
# the frame on disk so the search engine can reuse it without calling the API again.
df["embedding"] = df["combined"].map(
    lambda review_text: get_embedding(review_text, engine=embedding_model)
)
output_datapath = "C:/Users/Raj/repos/openai-stackhack-2023/playground/redis/reviews_1k_embeddings.csv"
df.to_csv(output_datapath)