In [1]:
import numpy as np

def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray)->float:
    """Return the cosine similarity of two vectors.

    Computes ``dot(vec_a, vec_b) / (||vec_a|| * ||vec_b||)``.

    Args:
        vec_a: First vector (NumPy array or any array-like such as a list
            of floats).
        vec_b: Second vector, same length as ``vec_a``.

    Returns:
        The cosine similarity. If either vector has zero norm the ratio is
        undefined, so ``0.0`` is returned instead of dividing by zero.
    """
    a = np.asarray(vec_a)
    b = np.asarray(vec_b)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the zero-vector case: 0/0 would yield nan and a RuntimeWarning.
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

In [None]:
# openai embedding

In [11]:
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv (presumably from a local .env
# file), then read the Upstage key; the value is None when the variable is unset.
load_dotenv()
upstage_api_key = os.environ.get('UPSTAGE_API_KEY')

# OpenAI

In [8]:
from openai import OpenAI
# Shared OpenAI client used by get_openai_embedding.
# NOTE(review): no api_key is passed here - presumably the SDK reads it from
# the environment (e.g. OPENAI_API_KEY); confirm it is set before this cell runs.
openai_client = OpenAI()

In [17]:
def get_openai_embedding(text:str, model='text-embedding-3-small'):
    """Embed ``text`` with an OpenAI embedding model.

    Sends ``text`` to ``openai_client.embeddings.create`` using ``model`` and
    returns the ``embedding`` attribute of the first item in the response's
    ``data``.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The embedding of the first (and, for a single input string, only)
        result item.
    """
    result = openai_client.embeddings.create(model=model, input=text)
    first_item = result.data[0]
    return first_item.embedding

In [18]:
# Smoke test: embed a short Korean greeting with the default OpenAI model.
text_str = '안녕하세요'
emb_vector = get_openai_embedding(text_str)
# Bare last expression so the notebook displays the returned vector.
emb_vector

[-0.002531265141442418,
 -0.06127675995230675,
 -0.008443817496299744,
 0.031540773808956146,
 0.031089577823877335,
 -0.04993239790201187,
 -0.059171177446842194,
 0.03244316577911377,
 -0.014116000384092331,
 -0.06140567362308502,
 -0.020722804591059685,
 0.006832401733845472,
 0.009195811115205288,
 -0.020067494362592697,
 -0.011484021320939064,
 0.035214800387620926,
 -0.04653768241405487,
 0.004017795901745558,
 -0.0141482288017869,
 0.027394063770771027,
 0.05706559494137764,
 -0.017306603491306305,
 -0.031368888914585114,
 -0.020185666158795357,
 0.04413130134344101,
 0.06239400804042816,
 0.056936681270599365,
 0.0005176672129891813,
 0.017166946083307266,
 -0.05358493700623512,
 0.0314333438873291,
 -0.02758743427693844,
 -0.01743551529943943,
 0.0011414192849770188,
 -0.002343266736716032,
 0.02537442371249199,
 0.02264576032757759,
 -0.04391644522547722,
 -0.011752590537071228,
 -0.034204982221126556,
 -0.013546633534133434,
 -0.018176767975091934,
 -0.00301334704272449,
 0.

In [15]:
# get_openai_embedding already returns the embedding itself, so take the
# length of the vector directly (there is no `.embedding` attribute on it).
len(emb_vector)

1536

# upstage

In [40]:
# Create the Upstage embedding model
from langchain_upstage import UpstageEmbeddings
embeddings = UpstageEmbeddings(model='solar-embedding-1-large', api_key=upstage_api_key)

In [41]:
# Embedding helper backed by the Upstage model
def get_upstage_embedding(text: str, is_query: bool = False) -> np.ndarray:
    """Embed ``text`` with the global Upstage ``embeddings`` object.

    Args:
        text: The string to embed.
        is_query: When True, embed via ``embed_query``; otherwise embed via
            ``embed_documents`` as a one-element batch.

    Returns:
        The embedding converted to a NumPy array.
    """
    if not is_query:
        # embed_documents takes a list, so wrap the text and keep only the first result
        raw = embeddings.embed_documents([text])[0]
    else:
        raw = embeddings.embed_query(text)
    return np.array(raw)

In [13]:
# pip install -qU langchain-core langchain-upstage

from langchain_upstage import UpstageEmbeddings

# Standalone demo of the "embedding-query" model. It deliberately gets its own
# name: get_upstage_embedding() looks up the global `embeddings` at call time,
# so rebinding that name here would silently change the model behind every
# later similarity comparison when the notebook is run top to bottom.
demo_embeddings = UpstageEmbeddings(
    api_key=upstage_api_key,
    model="embedding-query"
)

# Embed a small batch of documents (one vector per input string).
doc_result = demo_embeddings.embed_documents(
    ["안녕하세요", "나의 이름은 홍길동입니다."]
)
print(doc_result)

# Embed a single query string.
query_result = demo_embeddings.embed_query("What does Sam do?")
print(query_result)

[[0.029086124151945114, -0.028972061350941658, -0.01569998823106289, 0.024360647425055504, 0.015903674066066742, 0.006725656799972057, -0.007968131452798843, -0.01723984256386757, 0.00945095345377922, -0.007169688586145639, 0.009744258597493172, 0.008750279434025288, -0.005430224351584911, 0.012058112770318985, 0.018559716641902924, 0.011039691045880318, -0.008562889881432056, 0.00011520895350258797, -0.015390388667583466, -0.028662459924817085, -0.01388312503695488, 0.005548361223191023, -0.00030196201987564564, 0.0018932459643110633, 0.003287465311586857, -0.006167561747133732, -0.012074408121407032, -0.017305022105574608, 0.015040051192045212, 0.010070153512060642, 0.014298640191555023, -0.010224954225122929, 0.022568223997950554, -0.0034239336382597685, -0.00299823353998363, -0.01664508506655693, -0.019358159974217415, 0.025810878723859787, 0.020629150792956352, 0.009222826920449734, 0.016376221552491188, 0.004220339469611645, -0.01968405395746231, -0.004570676479488611, -0.0061431

In [16]:
# Number of dimensions in the query embedding
len(query_result)

4096

# 유사도 비교 (openai)

In [34]:
# Words whose embeddings are compared; '왕' is Korean for "king"
texts = ['king', 'queen', 'slave', '왕']

In [35]:
# Map each word to its OpenAI embedding (one get_openai_embedding call per word)
openai_embeddings = {txt: get_openai_embedding(txt) for txt in texts}

In [36]:
# Similarity between 'queen' and 'king' under the OpenAI embeddings
cosine_similarity(openai_embeddings['queen'],openai_embeddings['king'])

np.float64(0.590601530239691)

# 유사도 비교 (upstage)

In [42]:
# Same word list as in the OpenAI comparison, re-declared for this section
texts = ['king', 'queen', 'slave', '왕']

In [43]:
# Map each word to its Upstage embedding; is_query is left at its default (False),
# so every word goes through the embed_documents branch of get_upstage_embedding
upstage_embeddings = {txt: get_upstage_embedding(txt) for txt in texts}

In [44]:
# Similarity between 'queen' and 'king' under the Upstage embeddings
cosine_similarity(upstage_embeddings['queen'],upstage_embeddings['king'])

np.float64(0.6446770680612558)

# 왕의 비교

In [45]:
# OpenAI embedding model: similarity between '왕' (Korean for "king") and 'king'
cosine_similarity(openai_embeddings['왕'],openai_embeddings['king'])

np.float64(0.5040406331683572)

In [46]:
 # Upstage embedding model: similarity between '왕' (Korean for "king") and 'king'
cosine_similarity(upstage_embeddings['왕'],upstage_embeddings['king'])

np.float64(0.6964350838223727)

In [None]:
# Compare embeddings using the Gemma2 model

# Ollama embeddings > Hugging Face embedding model
# pip install sentence-transformers

In [None]:
from langchain_community.chat_models import ChatOllama
# Sanity check of the 'gemma2' chat model through Ollama: the Korean prompt
# asks the model to introduce itself in two lines.
llm_Ollama = ChatOllama(model='gemma2')
response = llm_Ollama.invoke('안녕? 네 소개를 2줄로 작성해')
# Bare last expression so the notebook displays the reply text
response.content

In [47]:
# Same word list as in the earlier comparisons, re-declared for this section
texts = ['king', 'queen', 'slave', '왕']

In [None]:
# Use an open-source (OS) embedding model
# Use a Hugging Face embedding model - transformers library, GPU-based PyTorch
# Run inside a virtual environment with torch + CUDA

In [3]:
from sentence_transformers import SentenceTransformer
import torch
# Alternative model, left disabled:
# model = SentenceTransformer('BAAI/bge-multilingual-gemma2')
MODEL = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare last expression so the notebook displays the selected device
device


  from .autonotebook import tqdm as notebook_tqdm


'cuda'

In [4]:
# Move the loaded model to the selected device; `model` is the handle used for encoding
model = MODEL.to(device)

In [None]:
# Same word list as in the earlier comparisons, re-declared so this section runs on its own
texts = ['king', 'queen', 'slave', '왕']

In [7]:
# Cap the model's maximum sequence length at 256 before encoding
model.max_seq_length=256
# Encode every word in a single call, requesting normalized embeddings returned as a NumPy array.
# NOTE(review): despite the name `gemma_embedding`, the model loaded in this notebook is
# paraphrase-multilingual-MiniLM-L12-v2 (the bge-multilingual-gemma2 line is commented out).
gemma_embedding = model.encode(texts, batch_size=64, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True)
# Bare last expression so the notebook displays the embedding matrix (one row per word)
gemma_embedding

Batches: 100%|██████████| 1/1 [00:00<00:00, 28.57it/s]


array([[-0.00158092,  0.10178689, -0.01695296, ..., -0.05791416,
        -0.04557933, -0.04427628],
       [ 0.06440259, -0.00555207,  0.02317658, ..., -0.05347718,
         0.00656335, -0.04465976],
       [-0.04929696,  0.07381593, -0.00779096, ..., -0.08466548,
        -0.04895946, -0.06428   ],
       [ 0.00818061,  0.07960664, -0.0231488 , ..., -0.04931469,
        -0.04339727, -0.00256804]], dtype=float32)