In [20]:
import numpy as np

def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray)->float:
  """Return the cosine similarity between two vectors.

  Args:
    vec_a: First vector (NumPy array or other array-like of numbers).
    vec_b: Second vector, same length as ``vec_a``.

  Returns:
    ``dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b))``. If either vector has
    zero norm the similarity is undefined, and 0.0 is returned instead of NaN.
  """
  dot = np.dot(vec_a, vec_b)
  denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
  if denom == 0:
    # Guard the division: a zero vector would otherwise produce NaN and a RuntimeWarning.
    return 0.0
  return dot / denom

# openai embedding

In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
from openai import OpenAI
openai_client = OpenAI()

In [16]:
def get_openai_embedding(text:str, model='text-embedding-3-small'):
  """Embed ``text`` through the module-level ``openai_client``.

  Args:
    text: Input string to embed.
    model: Name of the embedding model to request.

  Returns:
    The ``embedding`` attribute of the first item in the response data.
  """
  request_kwargs = {'input': text, 'model': model}
  result = openai_client.embeddings.create(**request_kwargs)
  first_item = result.data[0]
  return first_item.embedding

In [14]:
text_str = '안녕하세요'
emb_vector = get_openai_embedding(text_str)
# get_openai_embedding already returns `response.data[0].embedding`, so take the
# length of the returned value itself; there is nothing further to unwrap.
len(emb_vector)

1536

In [22]:
# Create the Upstage embedding client; the API key is read from the
# UPSTAGE_API_KEY environment variable.
import os
from langchain_upstage import UpstageEmbeddings

embeddings = UpstageEmbeddings(
  model='solar-embedding-1-large',
  api_key=os.getenv('UPSTAGE_API_KEY'),
)

In [23]:
# Embedding helper backed by the Upstage model
def get_upstage_embedding(text: str, is_query: bool = False) -> np.ndarray:
    """Embed ``text`` with the module-level ``embeddings`` client.

    Args:
        text: String to embed.
        is_query: When True the text is embedded with ``embed_query``;
            otherwise with ``embed_documents``.

    Returns:
        The embedding converted to a NumPy array.
    """
    if not is_query:
        # embed_documents takes a list, so wrap the text and keep the first (only) result.
        return np.array(embeddings.embed_documents([text])[0])
    return np.array(embeddings.embed_query(text))

In [10]:
# pip install -qU langchain-core langchain-upstage
import os

from langchain_upstage import UpstageEmbeddings

# Standalone demo of the "embedding-query" model. It is bound to its own name so
# that it does not overwrite the module-level `embeddings` client that
# get_upstage_embedding() reads when the notebook is run top to bottom.
query_embeddings = UpstageEmbeddings(
    api_key=os.getenv('UPSTAGE_API_KEY'),
    model="embedding-query"
)

# Embed two sample documents and print the resulting vectors.
doc_result = query_embeddings.embed_documents(
    ["Sam is a teacher.", "This is another document"]
)
print(doc_result)

# Embed a single query string and print its vector.
query_result = query_embeddings.embed_query("What does Sam do?")
print(query_result)

[[0.016357421875, 0.0171356201171875, -0.0077972412109375, 0.024078369140625, 0.00342559814453125, -0.00682830810546875, -0.01465606689453125, -0.01067352294921875, -0.015625, 0.00673675537109375, 0.0183258056640625, 0.00507354736328125, 0.007244110107421875, 0.01177978515625, 0.02777099609375, 0.02191162109375, -0.0213623046875, -0.0012531280517578125, -0.0030117034912109375, -0.0167388916015625, -0.02410888671875, -0.0090789794921875, -0.01122283935546875, -0.005924224853515625, -0.01036834716796875, 0.0162811279296875, 0.002777099609375, -0.01197052001953125, 0.00518798828125, 0.02154541015625, 0.0056610107421875, 0.01412200927734375, 0.0020542144775390625, -0.0094146728515625, 0.01070404052734375, -0.01226806640625, -0.003635406494140625, 0.02374267578125, -0.012420654296875, 0.01287078857421875, -0.0112152099609375, -0.02001953125, -0.01410675048828125, 8.320808410644531e-05, -0.016510009765625, 0.012786865234375, 0.00691986083984375, -0.0008373260498046875, 0.004840850830078125, 

In [11]:
len(query_result)

4096

# OpenAI embedding 모델로 임베딩

In [15]:
# Words to embed and compare; '왕' is the Korean word for 'king' (cross-lingual probe).
texts = ['king', 'queen', 'slave', '왕']

In [None]:
# Map each word in `texts` to its OpenAI embedding (one get_openai_embedding call per word).
openai_embeddings = {txt: get_openai_embedding(txt) for txt in texts}


In [21]:
# Similarity of 'queen' and 'king' under the OpenAI embeddings.
cosine_similarity(openai_embeddings['queen'], openai_embeddings['king'])

np.float64(0.5906015302396912)

# 업스테이지 임베딩 모델로 임베딩

In [24]:
# Map each word in `texts` to its Upstage embedding (document mode, since is_query defaults to False).
upstage_embeddings = {txt: get_upstage_embedding(txt) for txt in texts}

In [25]:
# Similarity of 'queen' and 'king' under the Upstage embeddings.
cosine_similarity(upstage_embeddings['queen'], upstage_embeddings['king'])

np.float64(0.6445563269944867)

# 왕의 비교

In [26]:
# Cross-lingual check: Korean '왕' vs. English 'king' under the OpenAI embeddings.
cosine_similarity(openai_embeddings['왕'], openai_embeddings['king'])

np.float64(0.5040406331683573)

In [27]:
# Cross-lingual check: Korean '왕' vs. English 'king' under the Upstage embeddings.
cosine_similarity(upstage_embeddings['왕'], upstage_embeddings['king'])

np.float64(0.6961081410307075)

젬마2 모델로 임베딩 비교

# 올라마 임베딩 > huggingface 의 임베딩 모델

In [None]:
!pip install sentence-transformers

In [None]:
from langchain_community.chat_models import ChatOllama
# Chat-model wrapper configured for the 'gemma2' model.
llm_ollama = ChatOllama(model='gemma2')
# Smoke test: the Korean prompt asks the model to introduce itself in two lines.
response = llm_ollama.invoke('안녕? 네 소개를 2줄로 작성해')
# Bare last expression so the notebook displays the reply's content.
response.content

In [None]:
from sentence_transformers import SentenceTransformer
# Load the 'BAAI/bge-multilingual-gemma2' model through sentence-transformers.
model = SentenceTransformer('BAAI/bge-multilingual-gemma2')
# Encode every word in `texts`; presumably yields one vector per input — TODO confirm shape.
gemma_embedding = model.encode(texts)
# Bare last expression so the notebook displays the encoded result.
gemma_embedding