# Embeddings

- [Embedding 모델 Leaderboard](https://huggingface.co/spaces/mteb/leaderboard)

In [1]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; presumably this
# supplies the API key that the argument-less OpenAI() client relies on.
load_dotenv()


True

In [3]:
from openai import OpenAI

# API client; texts_to_embedding() further down reuses this same object.
client = OpenAI()
text = "짜장면, 짬뽕, 탕수육, 탕탕후루후루"

# Embed the single sample sentence. The optional request arguments
# (encoding_format, dimensions, user) are deliberately left at their defaults.
response = client.embeddings.create(input=[text], model="text-embedding-3-small")

In [4]:
# Display the raw response object; its repr includes the full embedding vector.
response

CreateEmbeddingResponse(data=[Embedding(embedding=[0.014717625454068184, 0.0007785032503306866, -0.007428399752825499, 0.017071980983018875, -0.035257354378700256, -0.014323299750685692, 0.039873283356428146, 0.040082044899463654, -0.012676410377025604, -0.012467649765312672, 0.016236938536167145, -0.015946993604302406, -0.009823348373174667, 0.013453464023768902, -0.03697383031249046, -0.013128724880516529, -0.006332406308501959, 0.0028849560767412186, 0.02802031673491001, 0.06012306734919548, 0.0217227041721344, -0.008246045559644699, -0.0508912056684494, 0.01598178781569004, 0.03927019611001015, 0.045231472700834274, -0.011081710457801819, 0.0514942929148674, 0.023334801197052002, -0.01620214618742466, -0.02876257710158825, -0.03558209165930748, -0.045463427901268005, -0.08034965395927429, 0.015390298329293728, 0.002076008589938283, -0.000739723036531359, 0.04801494628190994, -0.013070736080408096, -0.02551518939435482, -0.03310016170144081, 0.047272689640522, -0.015216330997645855,

In [7]:
# Show the embedding vector followed by its length, one item per line.
print(response.data[0].embedding, len(response.data[0].embedding), sep="\n")

[0.014717625454068184, 0.0007785032503306866, -0.007428399752825499, 0.017071980983018875, -0.035257354378700256, -0.014323299750685692, 0.039873283356428146, 0.040082044899463654, -0.012676410377025604, -0.012467649765312672, 0.016236938536167145, -0.015946993604302406, -0.009823348373174667, 0.013453464023768902, -0.03697383031249046, -0.013128724880516529, -0.006332406308501959, 0.0028849560767412186, 0.02802031673491001, 0.06012306734919548, 0.0217227041721344, -0.008246045559644699, -0.0508912056684494, 0.01598178781569004, 0.03927019611001015, 0.045231472700834274, -0.011081710457801819, 0.0514942929148674, 0.023334801197052002, -0.01620214618742466, -0.02876257710158825, -0.03558209165930748, -0.045463427901268005, -0.08034965395927429, 0.015390298329293728, 0.002076008589938283, -0.000739723036531359, 0.04801494628190994, -0.013070736080408096, -0.02551518939435482, -0.03310016170144081, 0.047272689640522, -0.015216330997645855, -0.03210274875164032, 0.03504859283566475, 0.0348

### 음식 리뷰 데이터 활용

In [10]:
import pandas as pd

# Load the food-review sample; the path is relative to the current working directory.
df = pd.read_csv("fine_food_reviews_1k.csv")
# Preview the first rows, then confirm the overall (rows, columns) size.
display(df.head())
print(df.shape)

Unnamed: 0.1,Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text
0,0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...
1,1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos..."
2,2,1351123200,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...
3,3,1351123200,B004AHGBX4,A2UY46X0OSNVUQ,3,These also have SALT and it's not sea salt.,I like the fact that you can see what you're g...
4,4,1351123200,B001BORBHO,A1AFOYZ9HSM2CZ,5,Happy with the product,My dog was suffering with itchy skin. He had ...


(1000, 7)


In [14]:
import tiktoken

# Tokenizer that matches the embedding model, so the counts reflect how that
# model splits each review into tokens.
embedding_encoding = tiktoken.encoding_for_model("text-embedding-3-small")
# Backward-compatible alias: the original name suggested GPT-4o, but this
# encoding is the one requested for text-embedding-3-small.
gpt4o_encoding = embedding_encoding

# Number of tokens in each review text.
df['n_tokens'] = df['Text'].apply(lambda review: len(embedding_encoding.encode(review)))
df[['Text', 'n_tokens']].head()

Unnamed: 0,Text,n_tokens
0,Wanted to save some to bring to my Chicago fam...,34
1,"Not pleased at all. When I opened the box, mos...",26
2,I'm not sure that custard is really custard wi...,249
3,I like the fact that you can see what you're g...,223
4,My dog was suffering with itchy skin. He had ...,77


In [15]:
# Summary statistics of the per-review token counts.
df['n_tokens'].describe()

count    1000.000000
mean       85.637000
std        73.562903
min        22.000000
25%        38.000000
50%        60.000000
75%       106.500000
max       623.000000
Name: n_tokens, dtype: float64

In [16]:
# Embedding helper
def texts_to_embedding(texts, model="text-embedding-3-small", batch_size=1000):
    """Return one embedding vector per input text.

    Args:
        texts: A single string or an iterable of strings. A bare string is
            treated as one text rather than being iterated character by
            character.
        model: Name of the embedding model passed to the API.
        batch_size: Maximum number of texts sent in one request. The
            embeddings endpoint limits how much a single request may carry,
            so longer inputs are split into consecutive chunks.

    Returns:
        A list with one embedding per text, collected chunk by chunk in the
        order the API returns them (expected to follow the input order).
        An empty input yields an empty list without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string would otherwise be iterated one character at a time.
    if isinstance(texts, str):
        texts = [texts]

    # Preprocessing: replace newlines with spaces before sending.
    cleaned = [text.replace("\n", " ") for text in texts]

    embeddings = []
    for start in range(0, len(cleaned), batch_size):
        response = client.embeddings.create(
            model=model,
            input=cleaned[start:start + batch_size],
        )
        embeddings.extend(data.embedding for data in response.data)

    return embeddings

In [17]:
# Embed every review text (this calls the embeddings API) and store the vectors in a new column.
df['embedding'] = texts_to_embedding(df['Text'].tolist())

In [18]:
# Inspect the new column: one embedding vector per review row.
df['embedding']

0      [0.01677853614091873, -0.008555943146348, -0.0...
1      [-0.005216312129050493, 0.040469057857990265, ...
2      [0.005564768798649311, -0.012970144860446453, ...
3      [-0.016292475163936615, 0.008886804804205894, ...
4      [-0.004322985652834177, -0.06378211826086044, ...
                             ...                        
995    [0.01907997578382492, -0.037228021770715714, -...
996    [-0.037051208317279816, -0.013656404800713062,...
997    [-0.04671481251716614, -0.07131096720695496, -...
998    [-0.014545817859470844, -0.03374110162258148, ...
999    [0.014047509990632534, -0.013696948997676373, ...
Name: embedding, Length: 1000, dtype: object

In [19]:
# Lookup table for similarity search: a single 'embedding' column indexed by the review text.
embed_df = df[['embedding']].set_index(df['Text'])
embed_df

Unnamed: 0_level_0,embedding
Text,Unnamed: 1_level_1
Wanted to save some to bring to my Chicago family but my North Carolina family ate all 4 boxes before I could pack. These are excellent...could serve to anyone,"[0.01677853614091873, -0.008555943146348, -0.0..."
"Not pleased at all. When I opened the box, most of the rings were broken in pieces. A total waste of money.","[-0.005216312129050493, 0.040469057857990265, ..."
"I'm not sure that custard is really custard without eggs. But this comes close. I got it for use in a ""Vegan pancake"" recipe. We were having houseguests who were Vegan and I wanted to make some special breakfasts while they were here. One of the cooking/recipe sites had a recipe using this and there were lots of great reviews. I tried the recipe and it turned out like wallpaper paste -- yuck!<br />However, the so-called custard isn't so bad. I think it's probably just cornstarch and annatto (yellow coloring with a slight flavor). It's fun playing with it. You could dress it up with fruit. Seems to come out on the thin side when you make it as directed, so I use less milk because I like my custards to set firm. As a custard sauce it's fine. I would say it tastes something between a pudding and a custard.<br /><br />If you want a really good egg-free ""custard"" get an original recipe for ""blanc mange."" It takes a lot longer to make, but it's certainly worth the difference.","[0.005564768798649311, -0.012970144860446453, ..."
"I like the fact that you can see what you're getting and that there are no bones or dark meat. There are 7 nice big chunks in every jar.<br /><br />These taste like tuna in a can but, because they're preserved in glass, you don't have to worry about either aluminum or BPA; BUT ... they are not just tuna and spring water.<br /><br />There is salt in there, too, and it's not healthy sea salt, it's toxic table salt.<br /><br />I am trying to contact Tonnino to confirm that. I might be wrong because the label states that the ingredients are ""tuna fish"" but the sticker on the top clarifies that it is the smaller (healthier) yellowfin, so the ""salt"" listed in the ingredients might be sea salt but, if it was, why don't they say so?<br /><br />Without confirmation, I will continue to look for a salt-free olive-oil free tuna preserved in glass.<br /><br />If you know of one, please contact me!","[-0.016292475163936615, 0.008886804804205894, ..."
My dog was suffering with itchy skin. He had been eating Natural Choice brand (cheaper) since he was a puppy. I was nervous to change foods. The vet suggested to change foods sand see if the skin issues cleared up. Wellness brand did the job. My dog seems to love the food and the skin issues cleared up within a few weeks.,"[-0.004322985652834177, -0.06378211826086044, ..."
...,...
I have ordered these raisins multiple times. They are always great and arrive timely. I can't go back to store bought chocolate covered raisins now! Love this product.,"[0.01907997578382492, -0.037228021770715714, -..."
My dog will come in from outside when I am training her and look at the cupboard waiting for her treat. When I use the clicker training method she comes because she knows she has something special.,"[-0.037051208317279816, -0.013656404800713062,..."
Wolfgang Puck's Jamaica Me Crazy is that wonderful blend of island flavors in a coffee. Have loved it from the first time tasting. Great product.,"[-0.04671481251716614, -0.07131096720695496, -..."
Great product for the price. Mix with the Asian rice crackers for an excellent snack. Big container lasts a long time. Only lightly slighted. Peanuts are whole and large.,"[-0.014545817859470844, -0.03374110162258148, ..."


- 코사인 유사도 측정

In [20]:
import numpy as np

def cos_sim(a, b):
    """Cosine similarity of ``a`` and ``b``: their dot product divided by the product of their norms."""
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

def get_similar_texts(query, embed_df, top_n=5):
    """Return the ``top_n`` rows of ``embed_df`` most similar to ``query``.

    Args:
        query: The search text, given either as a plain string or as a list
            whose first element is the search text.
        embed_df: DataFrame with an 'embedding' column holding one vector
            per row. It is not modified.
        top_n: Number of best-matching rows to return.

    Returns:
        A new DataFrame containing the ``top_n`` rows with the highest cosine
        similarity to the query, sorted in descending order, with the score
        in a 'cos_sim' column.
    """
    # A bare string would otherwise be embedded character by character and
    # only its first character would be used as the query.
    if isinstance(query, str):
        query = [query]
    query_embed = texts_to_embedding(query)[0]

    similarities = embed_df['embedding'].apply(lambda vec: cos_sim(vec, query_embed))

    # assign() builds a new frame, so the caller's DataFrame does not pick up
    # a 'cos_sim' column left over from this query.
    ranked = embed_df.assign(cos_sim=similarities)
    return ranked.sort_values('cos_sim', ascending=False).head(top_n)

In [23]:
# Retrieve the five reviews (the default top_n) whose embeddings are closest to the query "pizza".
get_similar_texts(["pizza"], embed_df)

Unnamed: 0_level_0,embedding,cos_sim
Text,Unnamed: 1_level_1,Unnamed: 2_level_1
Makes very good break sticks.. Also can be used for a pizza crust.<br /><br />My wife is a celiac so we both enjoy this either as bread sticks or pizza crust.,"[-0.0031160328071564436, 0.0032433217857033014...",0.415317
The Barilla Mezze Penne with spicy marinara sauce is easy to prepare and tastes better than similar products. The sauce is not as spicy as I expected it to be but does have a flavorful tomato-y taste. The sauce is separate from the pasta and must be combined after cooking.,"[-0.023196613416075706, -0.03185791149735451, ...",0.3558
"Love these packs. I have made pretzel dogs, bites and sticks. I have even used the pretzel mix to make calzones and pizza. Both came out great.","[0.021732978522777557, -0.02915913611650467, -...",0.35085
try it & we shared with the familys/all han thumbs up!!!!!!!!!!cut it with good lite olive oil /sherry or whatever you like/on meat/chicken & fish $ pork!!,"[0.016222583130002022, -0.04311098903417587, -...",0.341087
"About time, this is the best organic gluten free basil pasta sauce I have ever had. My wife is Italian and said she has never had better.","[-0.05429341644048691, -0.0073051839135587215,...",0.339322
