In [29]:
import os

import pandas as pd
import tiktoken
from openai import OpenAI

In [30]:
# Load the review CSV, keep only the columns used below, drop incomplete rows,
# and build one "combined" string per review from its summary and body text.
input_data = "data/fine_food_reviews_1k.csv"
kept_columns = ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]
df = (
    pd.read_csv(input_data, index_col=0)[kept_columns]
    .dropna()
    .assign(
        combined=lambda frame: (
            "Title: " + frame.Summary.str.strip() + "; Content: " + frame.Text.str.strip()
        )
    )
)
df.head(2)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,combined
0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...
1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title: Arrived in pieces; Content: Not pleased...


In [31]:
# Preview the text that will be sent to the embedding model.
df.combined

0      Title: where does one  start...and stop... wit...
1      Title: Arrived in pieces; Content: Not pleased...
2      Title: It isn't blanc mange, but isn't bad . ....
3      Title: These also have SALT and it's not sea s...
4      Title: Happy with the product; Content: My dog...
                             ...                        
995    Title: Delicious!; Content: I have ordered the...
996    Title: Good Training Treat; Content: My dog wi...
997    Title: Jamica Me Crazy Coffee; Content: Wolfga...
998    Title: Party Peanuts; Content: Great product f...
999    Title: I love Maui Coffee!; Content: My first ...
Name: combined, Length: 1000, dtype: object

In [32]:
# Embedding configuration used by the cells below.
# Model name passed to the embeddings endpoint.
embedding_model = "text-embedding-v3"
# tiktoken encoding used to count tokens per review.
# NOTE(review): cl100k_base is a tiktoken encoding name; the counts may not match
# the tokenizer of the model served at the configured base_url - confirm.
embedding_encoding = "cl100k_base"
# Reviews whose token count exceeds this are filtered out before embedding.
max_tokens = 8_000

In [33]:
# Keep the most recent reviews that fit within the token budget.
top_n = 1000
encoding = tiktoken.get_encoding(embedding_encoding)
df = (
    df.sort_values("Time")
    .tail(top_n * 2)  # take twice as many as needed, since some may be filtered out below
    .drop(columns="Time")
    .assign(
        # Number of tokens in each combined text under the chosen encoding.
        n_tokens=lambda frame: frame.combined.map(lambda text: len(encoding.encode(text)))
    )
    .loc[lambda frame: frame.n_tokens <= max_tokens]
    .tail(top_n)
)
len(df)

1000

In [34]:
# Client for the OpenAI-compatible DashScope endpoint.
# The API key is read from the DASHSCOPE_API_KEY environment variable so that it is
# never stored in the notebook or committed to version control; a missing variable
# raises KeyError here instead of failing later on the first request.
# NOTE(review): the key that was previously hardcoded in this cell should be revoked.
client = OpenAI(
    api_key=os.environ["DASHSCOPE_API_KEY"],
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

In [35]:
# Smoke test: embed a short string and print the returned vector.
res = client.embeddings.create(
    model=embedding_model,
    input="abc",
)
print(res.data[0].embedding)

[-0.025253048166632652, 0.0386127233505249, -0.074822336435318, 0.0174836628139019, -0.05559743568301201, -0.04675886780023575, 0.010936198756098747, 0.010355786420404911, 0.034234173595905304, 0.029183562844991684, 0.005595382768660784, 0.0032228182535618544, -0.0071889725513756275, -0.03162740543484688, 0.019856227561831474, -0.031586673110723495, -0.016679231077432632, -0.004559295251965523, 0.004027250222861767, -0.05478282272815704, -0.04488525539636612, 0.030079638585448265, -0.01996823586523533, 0.009642998687922955, 0.01001975778490305, 0.09710203856229782, -0.08256117254495621, -0.030548041686415672, -0.02999817579984665, -0.10606279969215393, 0.008945484645664692, -0.005992507562041283, 0.010681631974875927, -0.03103681094944477, 0.007214429322630167, -0.007525000721216202, -0.008044317364692688, -0.013746618293225765, -0.12642815709114075, 0.005284811370074749, 0.008303975686430931, 0.008848749101161957, 0.013094927184283733, -0.03492659330368042, 0.017351288348436356, -0.08

In [36]:
def embedding_text(text, model="text-embedding-v3"):
    """Return the embedding for ``text``.

    Sends ``text`` to the embeddings endpoint of the module-level ``client``
    using ``model`` and returns the ``embedding`` attribute of the first
    entry in the response's ``data``.
    """
    response = client.embeddings.create(model=model, input=text)
    first_entry = response.data[0]
    return first_entry.embedding

In [37]:
# Embed every review (one call to embedding_text per row) and save the result to CSV.
output_datapath = "data/fine_food_reviews_with_embeddings_1k_330.csv"
df["embedding"] = df.combined.map(embedding_text)
df.to_csv(output_datapath)

In [38]:
# Look at the embedding stored for the row whose index label is 0.
e0 = df.loc[0, "embedding"]
e0

[-0.0028806296177208424,
 0.06060419976711273,
 -0.036652032285928726,
 0.0022823079489171505,
 -0.053694549947977066,
 -0.039412032812833786,
 0.002441538730636239,
 0.08090853691101074,
 -0.04763412848114967,
 -0.0402226597070694,
 0.04485482722520828,
 0.041689515113830566,
 -0.043928395956754684,
 -0.02779300883412361,
 0.05975497141480446,
 -0.01697496697306633,
 0.009901260025799274,
 0.00867566466331482,
 -0.0929521769285202,
 -0.08824280649423599,
 -0.042615946382284164,
 0.007652727887034416,
 -0.04697790741920471,
 0.06342210620641708,
 0.02779300883412361,
 0.02956867404282093,
 -0.02225370891392231,
 -0.012265595607459545,
 -0.017746994271874428,
 -0.05307692661881447,
 0.002834790386259556,
 0.012477902695536613,
 0.03487636521458626,
 -0.02746489830315113,
 0.056589655578136444,
 -0.011976084671914577,
 -0.0025935317389667034,
 -0.00755622424185276,
 -0.04500923305749893,
 0.00961657427251339,
 0.014234267175197601,
 0.021057065576314926,
 0.009732378646731377,
 -0.063460

In [None]:
# Reload the embeddings written by the earlier to_csv call. The path must match the
# file name used when saving ("fine_food_...", not "find_food_..."); the previous
# spelling pointed at a file that is never created.
# Note: the "embedding" column comes back from CSV as text, not as Python lists.
embedding_datapath = "data/fine_food_reviews_with_embeddings_1k_330.csv"
df_embedding = pd.read_csv(embedding_datapath)