In [1]:
from IPython.display import display, HTML

# Inject page-wide CSS overrides (container width, fonts, prompt width,
# table font size). Purely cosmetic: nothing later in the notebook depends on it.
# Fixes relative to the previous version of this cell:
#   * `fontsize` is not a CSS property (the rule was ignored) -> `font-size`
#   * removed the stray `))` that trailed the table.dataframe rule
display(HTML("""<style>
div.container{width:86% !important;}
div.cell.code_cell.rendered{width:100%;}
div.CodeMirror {font-family:Consolas; font-size:12pt;}
div.output {font-size:12pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:12pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:12pt;padding:5px;}
table.dataframe{font-size:12px;}
</style>
"""))

# <span style="color:red">ch09 01 vector Embedding Model 성능비교 </span>

# 문장 → 벡터(1차원 숫자 배열 [8.1,9.1, 2, 5, 4, 3....])

- OpenAI API : https://platform.openai.com/ 에서 발급받은 키(OPENAI_API_KEY)를 .env에 등록
- Upstage : https://console.upstage.ai/ 에서 발급받은 키(UPSTAGE_API_KEY)를 .env에 등록

# 1. 환경변수 load

In [6]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so the API
# clients created later can read their keys (OPENAI_API_KEY, UPSTAGE_API_KEY).
# The call is the cell's last expression, so its return value is displayed.
load_dotenv()

True

# 2. 유사도 계산하는 방법 : https://www.pinecone.io/learn/vector-similarity

    1. 유클리드 거리 : 두 벡터 간의 거리가 얼마나 가까운지
    2. 코사인 유사도 : 두 벡터 간의 방향이 얼마나 유사한지
    3. dot product(내적) : 두 벡터의 내적을 사용하여 크기와 방향을 모두 고려

In [7]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (any array-like accepted by NumPy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two Euclidean norms,
        or ``0.0`` when either vector has zero length (the angle is undefined).
    """
    inner = np.dot(vec1, vec2)
    # Euclidean length of each input, in argument order.
    lengths = [np.linalg.norm(v) for v in (vec1, vec2)]
    if any(length == 0 for length in lengths):
        return 0.0
    return inner / (lengths[0] * lengths[1])

# 3. openAI API의 embedding model 사용

In [8]:
from openai import OpenAI

# Client used for every OpenAI embedding request below. No api_key is passed
# here, so the client must obtain its credentials from its own defaults
# (presumably OPENAI_API_KEY loaded from .env earlier — confirm).
openai_client = OpenAI()

In [18]:
# Request an embedding for the word "king" from the text-embedding-3-large model.
response = openai_client.embeddings.create(
    model="text-embedding-3-large",
    input="king",
)

In [19]:
import numpy as np

# Turn the first (only) embedding in the response into a NumPy array,
# then show its shape followed by its values.
king_vector = np.array(response.data[0].embedding)
print(king_vector.shape, king_vector, sep="\n")

(3072,)
[ 0.01040417  0.02499519 -0.0014776  ...  0.00835009  0.01049861
 -0.00254005]


In [21]:
# Request an embedding for the word "queen" with the same OpenAI model.
queen_response = openai_client.embeddings.create(
    model="text-embedding-3-large",
    input="queen",
)

In [22]:
# Convert the "queen" embedding to a NumPy array; print the values first,
# then the shape.
queen_vector = np.array(queen_response.data[0].embedding)
print(queen_vector, queen_vector.shape, sep="\n")

[-0.01385735  0.0008602  -0.0167823  ...  0.00017693  0.01159847
  0.00638929]
(3072,)


In [26]:
# Cosine similarity between the English "king" and "queen" embeddings.
king_queen_similarity = cosine_similarity(vec1=king_vector, vec2=queen_vector)
print("king과 queen의 유사도 :", king_queen_similarity)

king과 queen의 유사도 : 0.5552268369726675


In [25]:
# Embed the word "slave" as a contrast term for "king", then show the
# resulting array's shape and values.
slave_response = openai_client.embeddings.create(
    model="text-embedding-3-large",
    input="slave",
)
slave_vector = np.array(slave_response.data[0].embedding)
print(slave_vector.shape, slave_vector, sep="\n")

(3072,)
[-0.01999537  0.00620363  0.01191717 ...  0.00094749 -0.02679118
 -0.0058524 ]


In [27]:
# Cosine similarity between the English "king" and "slave" embeddings.
king_slave_similarity = cosine_similarity(vec1=king_vector, vec2=slave_vector)
print("king과 slave의 유사도 :", king_slave_similarity)

king과 slave유사도 : 0.2947745074537996


In [28]:
# Hypothesis: even when Korean text is embedded, the similarity scores should come out about the same.

In [29]:
# Request an embedding for the Korean word "왕" (king) from the OpenAI model.
kor_king_response = openai_client.embeddings.create(
    model="text-embedding-3-large",
    input="왕",
)

In [30]:
# Convert the "왕" embedding to a NumPy array and report its shape.
kor_king_vector = np.array(
    kor_king_response.data[0].embedding
)
print(kor_king_vector.shape)

(3072,)


In [31]:
# Embed the Korean word "여왕" (queen) and report the vector's shape.
kor_queen_response = openai_client.embeddings.create(
    model="text-embedding-3-large",
    input="여왕",
)
kor_queen_vector = np.array(
    kor_queen_response.data[0].embedding
)
print(kor_queen_vector.shape)

(3072,)


In [32]:
# Cosine similarity between the Korean "왕" and "여왕" embeddings.
kor_king_queen_similarity = cosine_similarity(
    vec1=kor_king_vector,
    vec2=kor_queen_vector,
)
print("왕과 여왕의 유사도 :", kor_king_queen_similarity)

np.float64(0.48753581462091106)

In [33]:
# Embed the Korean contrast word and report the vector's shape.
# NOTE(review): the input "거지" means "beggar", while the variables are named
# *_slave to mirror the English "slave" cells — confirm this is intentional.
kor_slave_response = openai_client.embeddings.create(
    model="text-embedding-3-large",
    input="거지",
)
kor_slave_vector = np.array(
    kor_slave_response.data[0].embedding
)
print(kor_slave_vector.shape)

(3072,)


In [34]:
# Cosine similarity between the Korean "왕" and "거지" embeddings.
kor_king_slave_similarity = cosine_similarity(
    vec1=kor_king_vector,
    vec2=kor_slave_vector,
)
print("왕과 거지의 유사도 :", kor_king_slave_similarity)

np.float64(0.2552452064791607)

In [36]:
# Cross-lingual check: English "king" against Korean "왕" (OpenAI embeddings).
kor_king_king_similarity = cosine_similarity(
    vec1=king_vector,
    vec2=kor_king_vector,
)
print("king과 왕의 유사도 :", kor_king_king_similarity)

np.float64(0.5474873912140233)

# 4. upstage의 embedding model 사용

- 한국어 embedding에서는 OpenAI보다 성능이 훨씬 좋다

In [39]:
import os

# Build a client for Upstage by pointing the OpenAI SDK at Upstage's base URL.
upstage_api_key = os.getenv("UPSTAGE_API_KEY")
if not upstage_api_key:
    # Fail fast. With api_key=None the OpenAI SDK falls back to OPENAI_API_KEY,
    # which would send the OpenAI credential to the Upstage endpoint.
    raise RuntimeError(
        "UPSTAGE_API_KEY is not set; add it to .env and re-run load_dotenv()."
    )
upstage_client = OpenAI(
    api_key=upstage_api_key,
    base_url="https://api.upstage.ai/v1"
)

In [41]:
# Request an embedding for "king" from Upstage's "embedding-query" model.
up_king_response = upstage_client.embeddings.create(
    model="embedding-query",
    input="king",
)

In [42]:
# Convert the Upstage "king" embedding to a NumPy array, then show its shape
# followed by its values.
up_king_vector = np.array(up_king_response.data[0].embedding)
print(up_king_vector.shape, up_king_vector, sep="\n")

(4096,)
[-0.01187134 -0.02062988 -0.00674057 ... -0.01081848  0.00247955
  0.01520538]


In [43]:
# Embed "queen" with the Upstage model and report the vector's shape.
up_queen_response = upstage_client.embeddings.create(
    model="embedding-query",
    input="queen",
)
up_queen_vector = np.array(
    up_queen_response.data[0].embedding
)
print(up_queen_vector.shape)

(4096,)


In [44]:
# Cosine similarity between the Upstage "king" and "queen" embeddings.
up_king_queen_similarity = cosine_similarity(
    vec1=up_king_vector,
    vec2=up_queen_vector,
)
print("king과 queen의 유사도 :", up_king_queen_similarity)

np.float64(0.6279103035110143)

In [45]:
# Embed the Korean word "왕" (king) with the Upstage model and report the
# vector's shape.
up_kor_king_response = upstage_client.embeddings.create(
    model="embedding-query",
    input="왕",
)
up_kor_king_vector = np.array(
    up_kor_king_response.data[0].embedding
)
print(up_kor_king_vector.shape)

(4096,)


In [46]:
# Cross-lingual check: English "king" against Korean "왕" (Upstage embeddings).
up_kor_king_king_similarity = cosine_similarity(
    vec1=up_king_vector,
    vec2=up_kor_king_vector,
)
print("king과 왕의 유사도 :", up_kor_king_king_similarity)

np.float64(0.8522292879902242)