<a href="https://colab.research.google.com/github/ychoi-kr/llm-api-prog/blob/main/6_upstage/solar_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 솔라 임베딩 API

In [1]:
!pip install openai

Collecting openai
  Downloading openai-1.52.0-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.52.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.9/386.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)
[2K   [90m━━

In [2]:
import os

try:
    # In Colab, the key is read from the notebook secret named UPSTAGE_API_KEY.
    from google.colab import userdata
except ImportError:
    # Not running in Colab: fall back to an environment variable of the same
    # name so the notebook can also run locally. The value is None when the
    # variable is not set.
    api_key = os.environ.get('UPSTAGE_API_KEY')
else:
    api_key = userdata.get('UPSTAGE_API_KEY')

In [3]:
from openai import OpenAI
# The OpenAI SDK client is pointed at Upstage's Solar endpoint through base_url,
# authenticating with the key loaded earlier in the notebook.
client = OpenAI(base_url="https://api.upstage.ai/v1/solar", api_key=api_key)

## 임베딩

In [4]:
# Request an embedding for the single word "왕" from the passage model.
response = client.embeddings.create(
    model="embedding-passage",  # formerly named solar-embedding-1-large-passage
    input="왕",
)

# Keep the embedding vector of the first item in the response.
king = response.data[0].embedding

In [None]:
# Display the raw embedding vector obtained for "왕".
king

[0.0102386474609375,
 0.005893707275390625,
 -0.0134735107421875,
 0.0223388671875,
 -0.006168365478515625,
 -0.0030460357666015625,
 -0.016510009765625,
 -0.01409912109375,
 0.0032958984375,
 -0.0008378028869628906,
 0.0103759765625,
 0.01184844970703125,
 -0.00788116455078125,
 -0.022064208984375,
 0.0016984939575195312,
 0.020172119140625,
 -0.0196685791015625,
 0.01030731201171875,
 0.00798797607421875,
 -0.0192718505859375,
 -0.01294708251953125,
 0.00860595703125,
 -0.00777435302734375,
 0.0139312744140625,
 -0.0031070709228515625,
 -0.0041046142578125,
 0.0086669921875,
 0.0038852691650390625,
 0.012786865234375,
 0.00812530517578125,
 0.0138702392578125,
 -0.00823974609375,
 0.00960540771484375,
 -0.0112152099609375,
 -0.007717132568359375,
 -0.0078582763671875,
 -0.01251983642578125,
 0.006229400634765625,
 -0.008514404296875,
 -0.0080413818359375,
 0.0142364501953125,
 0.00820159912109375,
 -0.0267181396484375,
 -0.002376556396484375,
 -0.01194000244140625,
 -0.0135498046875,

In [None]:
# Number of components in the embedding vector.
len(king)

4096

## 임베딩 함수 정의

In [5]:
def embedding_passage(input):
    """Embed `input` with the "embedding-passage" model.

    Sends `input` to the embeddings endpoint of the module-level `client`
    and returns the `embedding` attribute of the first item in the
    response's `data` list.
    """
    return client.embeddings.create(
        model="embedding-passage",  # formerly named "solar-embedding-1-large-passage"
        input=input,
    ).data[0].embedding

def embedding_query(input):
    """Embed `input` with the "embedding-query" model.

    Sends `input` to the embeddings endpoint of the module-level `client`
    and returns the `embedding` attribute of the first item in the
    response's `data` list.
    """
    return client.embeddings.create(
        model="embedding-query",  # formerly named "solar-embedding-1-large-query"
        input=input,
    ).data[0].embedding

## 코사인 유사도

In [6]:
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine similarity between vectors `a` and `b`.

    Computed as dot(a, b) / (||a|| * ||b||). When either vector has zero
    norm the ratio is undefined; 0.0 is returned in that case instead of
    dividing by zero (which would yield NaN and a RuntimeWarning).

    Args:
        a: Sequence or array of numbers.
        b: Sequence or array of numbers with the same length as `a`.

    Returns:
        The cosine similarity as a float, or 0.0 if either vector has zero norm.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

## 비슷한 단어

In [7]:
import pandas as pd

# Build the candidate-word table and attach one passage embedding per word.
df = pd.DataFrame(
    {'단어': ['공부', '글', '도서', '문서', '영화', '책상', '화장품']}
).assign(
    embedding=lambda frame: frame['단어'].apply(embedding_passage)
)
df

Unnamed: 0,단어,embedding
0,공부,"[0.020843505859375, 0.014373779296875, -0.0106..."
1,글,"[0.03106689453125, -0.00457763671875, -0.00252..."
2,도서,"[0.0014448165893554688, -0.0022983551025390625..."
3,문서,"[0.0208587646484375, -0.0011606216430664062, -..."
4,영화,"[0.006229400634765625, -0.01091766357421875, -..."
5,책상,"[0.01120758056640625, -0.0303497314453125, 0.0..."
6,화장품,"[0.005054473876953125, -0.002826690673828125, ..."


In [8]:
# Embed the query once up front: calling embedding_query inside the lambda
# would send the same API request again for every row.
query_embedding = embedding_query('책')

df['cosine_similarity'] = df['embedding'].apply(
    lambda x: cosine_similarity(x, query_embedding)
)
# Most similar words first.
df.sort_values(by='cosine_similarity', ascending=False)

Unnamed: 0,단어,embedding,cosine_similarity
2,도서,"[0.0014448165893554688, -0.0022983551025390625...",0.292534
5,책상,"[0.01120758056640625, -0.0303497314453125, 0.0...",0.230325
1,글,"[0.03106689453125, -0.00457763671875, -0.00252...",0.20302
0,공부,"[0.020843505859375, 0.014373779296875, -0.0106...",0.200271
4,영화,"[0.006229400634765625, -0.01091766357421875, -...",0.16215
3,문서,"[0.0208587646484375, -0.0011606216430664062, -...",0.161588
6,화장품,"[0.005054473876953125, -0.002826690673828125, ...",0.105394


## 비슷한 속담 찾기

In [9]:
import pandas as pd

# Korean proverbs that are embedded and used as the candidates to compare against.
proverbs = [
    "가는 말이 고와야 오는 말이 곱다",
    "가는 날이 장날이다",
    "개구리 올챙이 적 생각 못 한다",
    "고래 싸움에 새우 등 터진다",
    "공든 탑이 무너지랴",
    "구슬이 서 말이라도 꿰어야 보배",
    "그림의 떡",
    "김칫국부터 마신다",
    "낫 놓고 기역 자도 모른다",
    "누워서 떡 먹기",
    "등잔 밑이 어둡다",
    "뛰는 놈 위에 나는 놈 있다",
    "말 한마디로 천 냥 빚 갚는다",
    "매도 먼저 맞는 게 낫다",
    "모로 가도 서울만 가면 된다",
    "물이 깊을수록 소리가 없다",
    "바늘 도둑이 소 도둑 된다",
    "배보다 배꼽이 크다",
    "백지장도 맞들면 낫다",
    "보기 좋은 떡이 먹기도 좋다",
    "콩 심은 데 콩 나고 팥 심은 데 팥 난다",
    "티끌 모아 태산",
    "시작이 반이다",
]

# One row per proverb, with its passage embedding stored alongside it.
df = pd.DataFrame({'속담': proverbs}).assign(
    embedding=lambda frame: frame['속담'].apply(embedding_passage)
)
df

Unnamed: 0,속담,embedding
0,가는 말이 고와야 오는 말이 곱다,"[-0.00742340087890625, 0.0096282958984375, -0...."
1,가는 날이 장날이다,"[-0.00678253173828125, -0.00921630859375, -0.0..."
2,개구리 올챙이 적 생각 못 한다,"[-0.0093536376953125, 0.0005087852478027344, -..."
3,고래 싸움에 새우 등 터진다,"[0.0036449432373046875, -0.00701141357421875, ..."
4,공든 탑이 무너지랴,"[-0.0182342529296875, -0.0170745849609375, -0...."
5,구슬이 서 말이라도 꿰어야 보배,"[-0.0123443603515625, 0.0128173828125, -0.0065..."
6,그림의 떡,"[0.0017614364624023438, 0.0024318695068359375,..."
7,김칫국부터 마신다,"[-0.0004596710205078125, -0.0107269287109375, ..."
8,낫 놓고 기역 자도 모른다,"[0.01177215576171875, 0.0017156600952148438, 0..."
9,누워서 떡 먹기,"[-0.006183624267578125, -0.0021457672119140625..."


In [10]:
# Compute the cosine similarity between the query and every proverb in df,
# then sort in descending order.

# Embed the query once up front: calling embedding_query inside the lambda
# would send the same API request again for every row.
query_embedding = embedding_query('아니 땐 굴뚝에 연기 나랴?')

df['cosine_similarity'] = df['embedding'].apply(
    lambda x: cosine_similarity(x, query_embedding)
)
df.sort_values(by='cosine_similarity', ascending=False)


Unnamed: 0,속담,embedding,cosine_similarity
4,공든 탑이 무너지랴,"[-0.0182342529296875, -0.0170745849609375, -0....",0.248347
20,콩 심은 데 콩 나고 팥 심은 데 팥 난다,"[-0.010040283203125, -0.007080078125, -0.00022...",0.233737
1,가는 날이 장날이다,"[-0.00678253173828125, -0.00921630859375, -0.0...",0.213273
18,백지장도 맞들면 낫다,"[-0.0265655517578125, 0.01502227783203125, -0....",0.208328
16,바늘 도둑이 소 도둑 된다,"[-0.00290679931640625, -0.01053619384765625, -...",0.203372
14,모로 가도 서울만 가면 된다,"[-0.0211334228515625, -0.0003561973571777344, ...",0.20299
3,고래 싸움에 새우 등 터진다,"[0.0036449432373046875, -0.00701141357421875, ...",0.188546
11,뛰는 놈 위에 나는 놈 있다,"[0.0003533363342285156, -0.0019626617431640625...",0.187714
13,매도 먼저 맞는 게 낫다,"[-0.01378631591796875, 0.0036773681640625, 0.0...",0.1842
2,개구리 올챙이 적 생각 못 한다,"[-0.0093536376953125, 0.0005087852478027344, -...",0.183713


In [11]:
# Embed the query once up front: calling embedding_query inside the lambda
# would send the same API request again for every row.
query_embedding = embedding_query('천 리 길도 한 걸음부터')

# Overwrites the previous similarity column with scores for this query.
df['cosine_similarity'] = df['embedding'].apply(
    lambda x: cosine_similarity(x, query_embedding)
)
df.sort_values(by='cosine_similarity', ascending=False)

Unnamed: 0,속담,embedding,cosine_similarity
22,시작이 반이다,"[0.00649261474609375, 0.021087646484375, -0.00...",0.251173
12,말 한마디로 천 냥 빚 갚는다,"[-0.006175994873046875, 0.0087738037109375, 0....",0.243227
21,티끌 모아 태산,"[0.0052032470703125, 0.01265716552734375, -0.0...",0.221995
14,모로 가도 서울만 가면 된다,"[-0.0211334228515625, -0.0003561973571777344, ...",0.214862
5,구슬이 서 말이라도 꿰어야 보배,"[-0.0123443603515625, 0.0128173828125, -0.0065...",0.196779
16,바늘 도둑이 소 도둑 된다,"[-0.00290679931640625, -0.01053619384765625, -...",0.174624
18,백지장도 맞들면 낫다,"[-0.0265655517578125, 0.01502227783203125, -0....",0.174423
1,가는 날이 장날이다,"[-0.00678253173828125, -0.00921630859375, -0.0...",0.171325
13,매도 먼저 맞는 게 낫다,"[-0.01378631591796875, 0.0036773681640625, 0.0...",0.170217
7,김칫국부터 마신다,"[-0.0004596710205078125, -0.0107269287109375, ...",0.16879


In [12]:
# Embed the query once up front: calling embedding_query inside the lambda
# would send the same API request again for every row.
query_embedding = embedding_query('이왕이면 다홍 치마')

# Overwrites the previous similarity column with scores for this query.
df['cosine_similarity'] = df['embedding'].apply(
    lambda x: cosine_similarity(x, query_embedding)
)
df.sort_values(by='cosine_similarity', ascending=False)

Unnamed: 0,속담,embedding,cosine_similarity
19,보기 좋은 떡이 먹기도 좋다,"[-0.00951385498046875, 0.015869140625, -0.0053...",0.223491
13,매도 먼저 맞는 게 낫다,"[-0.01378631591796875, 0.0036773681640625, 0.0...",0.206668
0,가는 말이 고와야 오는 말이 곱다,"[-0.00742340087890625, 0.0096282958984375, -0....",0.193762
17,배보다 배꼽이 크다,"[0.01528167724609375, -0.01018524169921875, -0...",0.185962
14,모로 가도 서울만 가면 된다,"[-0.0211334228515625, -0.0003561973571777344, ...",0.18148
1,가는 날이 장날이다,"[-0.00678253173828125, -0.00921630859375, -0.0...",0.17882
5,구슬이 서 말이라도 꿰어야 보배,"[-0.0123443603515625, 0.0128173828125, -0.0065...",0.163159
12,말 한마디로 천 냥 빚 갚는다,"[-0.006175994873046875, 0.0087738037109375, 0....",0.160988
7,김칫국부터 마신다,"[-0.0004596710205078125, -0.0107269287109375, ...",0.160666
18,백지장도 맞들면 낫다,"[-0.0265655517578125, 0.01502227783203125, -0....",0.157344
