# OpenAI Embedding API로 문서 임베딩하고 검색하기

In [1]:
!pip install tiktoken

Collecting tiktoken
  Obtaining dependency information for tiktoken from https://files.pythonhosted.org/packages/2a/ad/d1c81988ca81bbf5faa79656b86fa9a9d08cd7f8c74b73775d29579a6da0/tiktoken-0.6.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata
  Downloading tiktoken-0.6.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.6.0-cp311-cp311-macosx_10_9_x86_64.whl (999 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m999.8/999.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.6.0


In [2]:
import math
import os
from glob import glob
import tiktoken

## Tiktoken?

Tiktoken은 OpenAI에서 측정는 텍스트의 토큰 갯수를 확인 할 수 있는 방법입니다.

In [3]:
encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode("tiktoken is great!"))

In [4]:
num_tokens

6

In [5]:
encoding.encode("tiktoken is great!")

[83, 1609, 5963, 374, 2294, 0]

In [6]:
encoding.decode(encoding.encode("tiktoken is great!"))

'tiktoken is great!'

## 긴 문서 chunking 하기

In [8]:
def long_text_into_chunks(text, n_slide=400, max_size=800): # n_slide=4000, max_size=8000, MAX_TOKENS = 8191
    text_chunk_list = []

    encoding = tiktoken.get_encoding("cl100k_base")
    token_ids = encoding.encode(text)

    n_chunk = int(math.ceil(len(token_ids) / n_slide))

    for chunk_i in range(n_chunk):
        token_ids_chunk = token_ids[chunk_i * n_slide:chunk_i * n_slide + max_size]
        curr_text_chunk = encoding.decode(token_ids_chunk)
        text_chunk_list.append(curr_text_chunk)
    return text_chunk_list

In [9]:
# get all rst files
def get_all_doc_data(dir_path, doc_id_start=0):
    md_path_list = glob(f"{dir_path}/**/*.rst", recursive=True)
    doc_list = []
    # make docs
    for doc_id, md_path in enumerate(md_path_list, start=doc_id_start):
        with open(md_path, "rt") as f:
            text = f.read()
        doc_list.append({"docId": str(doc_id), "src": md_path, "text": text})

    psg_list = []

    for doc in doc_list:
        text_chunk_list = long_text_into_chunks(doc['text'])
        for psg_id, text_chunk in enumerate(text_chunk_list):

            psg_list.append(text_chunk)

    return psg_list

In [10]:
dir_path = "./konlpy"
psg_list = get_all_doc_data(dir_path, doc_id_start=0)

In [12]:
len(psg_list)

56

In [16]:
print(psg_list[1])

명해주세요. 같은 상황을 겪고 있는 사람들이 많이 모일수록 문제는 빠르게 해결될 수 있습니다.
3. 같은 이슈가 아직 제기되지 않았다면, "New Issue" 버튼을 눌러 이슈를 새로 생성해주시면 됩니다. 이슈를 새로 생성하시는 경우에는 사용하는 OS나 패키지 버젼 등을 같이 적어주시면 문제를 빠르게 해결하는데 도움이 됩니다.


3. 이슈 제안/해결하기
---------------------

- `깃헙 이슈 <https://github.com/konlpy/konlpy/issues>`_ 에 코드를 개선할 수 있는 방법을 제안하거나, 제안된 이슈에 대해 토론/해결하실 수 있습니다.
- 코드를 작성할 때는 다음에 유의해주세요.
    1. 탭 대신 공백 4개 사용
    2. 문서에서 특별히 언급되지 않은 사항은 일단 코드의 다른 부분들을 참고해서 작성 (+ 다른 분들의 편의를 위해 이 문서를 업데이트 해주세요)
    3. 커밋 로그는 설명력 있게 작성
- 코드 작성을 완료한 후 코드가 모든 테스트를 통과하는지 확인해주세요.
    1. 자바 코드를 수정한 경우::

        # Install `Apache Ant <http://ant.apache.org/manual/install.html>`_
        make java

    1. 코드를 단 한 줄이라도 수정한 모든 경우::

        pip install -r requirements-dev.txt
        pip3 install -r requirements-dev.txt
        make build      # create tar.gz
        make check      # check code styles
        make testall    # run tests

- PR을 보내기 전 다음을 확인해주세요.
    1. PR을 보내면 해당 코드는 KoNLPy의 오픈소스 라이센스를 따름
    1. PR를 보낸 후 코드의 일부를 변경하도록 요청될 경우, ``git c

## 문서 Embedding 하기

In [25]:
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt
from openai import OpenAI
client = OpenAI()

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [17]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def encode(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002")
    return np.array(response.data[0].embedding, dtype=np.float32)

In [18]:
psg_embs = np.array([encode(psg) for psg in psg_list], dtype=np.float32)

In [19]:
psg_embs

array([[ 0.0096548 , -0.00663032,  0.0192404 , ...,  0.00558179,
        -0.00019498, -0.03147674],
       [ 0.00919929, -0.01486862,  0.01937467, ...,  0.01465468,
        -0.00289149, -0.01357163],
       [ 0.01713589, -0.00342784,  0.0237734 , ...,  0.02025631,
        -0.00056649, -0.00144121],
       ...,
       [-0.01171179, -0.00286324,  0.01626278, ..., -0.00817453,
        -0.00545687, -0.04026152],
       [-0.02808339,  0.00442881,  0.01717625, ..., -0.00433307,
         0.00235978, -0.02092071],
       [-0.00584578,  0.01443437,  0.02386796, ...,  0.00840158,
        -0.02129139, -0.02983841]], dtype=float32)

In [20]:
psg_embs.shape

(56, 1536)

## Faiss Index

### Faiss Index 빌드하기

In [22]:
!pip install faiss-cpu



In [23]:
import faiss

In [24]:
# Calculate the L2 norms for each vector (row)
norms = np.linalg.norm(psg_embs, axis=1, keepdims=True)

# Divide by the norm to normalize
psg_embs_normed = psg_embs / norms

NameError: name 'np' is not defined

In [24]:
psg_embs_normed.shape

(56, 1536)

In [25]:
index = faiss.IndexFlatIP(psg_embs_normed.shape[1])  # 1536

In [26]:
index.add(psg_embs_normed)

### Faiss Index에서 검색하기

In [34]:
query = "Standing on the shoulders of giants"  # Ubuntu install, Standing on the shoulders of giants
query_emb = encode(query)
query_emb_normed =  query_emb / np.linalg.norm(query_emb)

In [35]:
query_emb_normed

array([-0.00665539, -0.03375141, -0.00335975, ..., -0.00604948,
       -0.01418277, -0.00806597], dtype=float32)

In [36]:
dist_list_list, psg_idx_list_list = index.search(query_emb.reshape(1, -1), k=3)  # (1, dim 1536)

In [37]:
dist_list_list

array([[0.7544828, 0.7140408, 0.7096298]], dtype=float32)

In [38]:
psg_idx_list_list

array([[13, 36, 49]])

In [39]:
dist_list, psg_idx_list = dist_list_list[0], psg_idx_list_list[0]

In [40]:
for psg_idx in psg_idx_list:
    print(psg_list[psg_idx])
    print("*"*30)

고는 실행환경, 에러메세지와함께 설명을 최대한상세히!^^'))
    [(오류, NNG),
     (보고, NNG),
     (는, JX),
     (실행, NNG),
     (환경, NNG),
     (,, SP),
     (에러, NNG),
     (메세지, NNG),
     (와, JKM),
     (함께, MAG),
     (설명, NNG),
     (을, JKO),
     (최대한, NNG),
     (상세히, MAG),
     (!, SF),
     (^^, EMO)]


Standing on the shoulders of giants
-----------------------------------

Korean, the `13th most widely spoken language in the world <http://www.koreatimes.co.kr/www/news/nation/2014/05/116_157214.html>`_, is a beautiful, yet complex language.
Myriad :ref:`engines` were built by numerous researchers, to computationally extract meaningful features from the labyrinthine text.

KoNLPy is not just to create another, but to unify and build upon their shoulders, and see one step further.
It is built particularly in the `Python (programming) language <http://python.org>`_, not only because of the language's simplicity and elegance, but also the powerful string processing modules and applicability to various tas

In [None]:
psg_idx_list_list