# 문장 -> 벡터(1차원 숫자 배열)

- openAI API : https://platform.openai.com/ 의 키를 .env등록
- upstage : https://console.upstage.ai/docs/ 의 키를 .env등록

# 1. 환경변수 load

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

# 2. 유사도 계산하는 방법 : 
### 1. 유클리드 거리 : 두 벡터간의 거리가 가까운지
### 2. 코사인유사도 : 두 벡터간 방향이 유사한지
### 3. dot_porduct : 두 벡터간의 곱응 사용하여 벡터의 가중치를 계산

In [11]:
import numpy as np
def cosine_similarity(vec1, vec2):
    """두 백터 사이의 코사인 유사도 계산"""
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1) 
    norm_vec2 = np.linalg.norm(vec2)
    if norm_vec1==0 or norm_vec2==0:
        return 0.0
    return dot_product / (norm_vec1*norm_vec2)


# 3. openAI API의 embedding model 사용

In [None]:
from openai import OpenAI
import numpy as np

client = OpenAI() # api key는 .env 파일에 저장됨

In [21]:
# king 벡터 생성
response = client.embeddings.create(
    input="king",
    model="text-embedding-3-large"
)
king_vector = np.array(response.data[0].embedding)
print(king_vector.shape)
print(king_vector)


(3072,)
[ 0.00875099  0.02395172 -0.00106452 ...  0.00805435  0.00888405
 -0.00222297]


In [None]:
# queen 벡터 생성
queen_response = client.embeddings.create(
    input="queen",
    model="text-embedding-3-large"
)
queen_vector = np.array(queen_response.data[0].embedding)
print(king_vector.shape)
print(king_vector)

In [None]:
# king과 queen의 유사도 계산
king_queen_similarity = cosine_similarity(king_vector, queen_vector)
print(king_queen_similarity)

0.5552268369726675


In [None]:
# slave 벡터 생성
slave_response = client.embeddings.create(
    input="slave",
    model="text-embedding-3-large"
)
slave_vector = np.array(slave_response.data[0].embedding)

In [None]:
# king과 slave의 유사도 계산
king_queen_similarity = cosine_similarity(king_vector, slave_vector)
print(king_queen_similarity)

0.29483842539319616


In [None]:
# 한국어 문장을 벡터로 바꿔도 유사도는 비슷해야 할 듯

In [22]:
# 왕 벡터 생성
kor_king_response = client.embeddings.create(
    input="왕",
    model="text-embedding-3-large"
)
kor_king_vector = np.array(kor_king_response.data[0].embedding)
print(king_vector.shape)
print(king_vector)


(3072,)
[ 0.00875099  0.02395172 -0.00106452 ...  0.00805435  0.00888405
 -0.00222297]


In [18]:
# 여왕 벡터 생성
kor_queen_response = client.embeddings.create(
    input="여왕",
    model="text-embedding-3-large"
)
kor_queen_vector = np.array(kor_queen_response.data[0].embedding)
print(kor_queen_vector.shape)
print(kor_queen_vector)

(3072,)
[-0.01306162 -0.00922657 -0.00530037 ... -0.00485867 -0.00205424
  0.02038819]


In [None]:
# 한국어 king과 queen의 유사도 계산
kor_king_queen_similarity = cosine_similarity(kor_king_vector, kor_queen_vector)
print(kor_king_queen_similarity)

0.3103962084426481


In [23]:
# king과 왕의 유사도
print(cosine_similarity(king_vector, kor_king_vector))

0.5483076178519545


# 4. upstage의 embedding model 사용
- 한국어 embedding 에는 openai보다 성능이 훨씬 좋다

In [24]:
from openai import OpenAI # openai==1.52.2
import os 
upstage_api_key = os.environ.get("UPSTAGE_API_KEY")
upstage_client = OpenAI(
    api_key=upstage_api_key,
    base_url="https://api.upstage.ai/v1"
)
 
response = upstage_client.embeddings.create(
    input="King",
    model="embedding-query"
)
 
upstage_king_vector = np.array(response.data[0].embedding)
print(upstage_king_vector)
print(upstage_king_vector.shape)

[-0.01209259 -0.02642822 -0.00889587 ... -0.0089035   0.0037117
  0.01570129]
(4096,)


In [25]:
response = upstage_client.embeddings.create(
    input='왕',
    model='embedding-query'
)
 
kor_upstage_king_vector = np.array(response.data[0].embedding)
print(kor_upstage_king_vector)
print(kor_upstage_king_vector.shape)

[-0.01207733 -0.0224762  -0.01322937 ... -0.00020826  0.00362587
  0.01420593]
(4096,)


In [27]:
# 왕과 king의 유사도 출력
print(cosine_similarity(upstage_king_vector, kor_upstage_king_vector))

0.8423532330391306
