In [2]:
import os
import numpy as np
import numpy.linalg as norm
import pandas as pd

from langchain_openai import OpenAIEmbeddings

from dotenv import load_dotenv

# Load environment variables from the .env file.
load_dotenv()

# Read the API key from the environment into a variable.
# NOTE(review): this variable is not passed to OpenAIEmbeddings below —
# presumably the client picks OPENAI_API_KEY up from the environment itself;
# confirm against the library documentation.
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Create the OpenAIEmbeddings object, embed one sample sentence, and print
# the resulting vector.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)
query_result = embeddings.embed_query("저는 배가 고파요")
print("Query Embedding:", query_result)

# Sample sentences that make up the corpus to search over.
data = [
    '저는 배가 고파요',
    '저기 배가 지나가네요',
    '굶머서 허기가 지네요',
    '허기 워기라는 게임이 있는데 즐거워',
    '스팀에서 재밌는 거 해야지',
    '스팀에어프라이어로 연어구이 해먹을 거야',
]

# One row per sentence, stored in a single 'text' column.
df = pd.DataFrame({'text': data})

def get_embedding(text):
    """
    Return the embedding of a single piece of text.

    Delegates to ``embed_query`` on the module-level ``embeddings`` object.

    Parameters:
    text (str): The text to embed.

    Returns:
    Whatever ``embeddings.embed_query`` returns for ``text``.
    """
    vector = embeddings.embed_query(text)
    return vector

# Embed every sentence and store the vector next to its text. Only the 'text'
# column is needed, so apply over that Series directly instead of building a
# row object for every row of the frame (still one get_embedding call per row).
df['embedding'] = df['text'].apply(get_embedding)

import numpy as np
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a, b):
    """
    Compute the cosine similarity between two vectors.

    Parameters:
    a (array-like): First vector.
    b (array-like): Second vector.

    Returns:
    float: dot(a, b) / (norm(a) * norm(b)), or 0.0 when either vector has
    zero norm (the ratio would otherwise divide by zero).
    """
    # Compute each norm once; both the zero guard and the ratio need them.
    norm_a = norm(a)
    norm_b = norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot(a, b) / (norm_a * norm_b)

def return_answer_candidate(df, query, top_k=3):
    """
    Rank the rows of the DataFrame by cosine similarity to the query.

    Parameters:
    df (DataFrame): DataFrame with an 'embedding' column holding one vector
        per row. A 'similarity' column is written to this frame in place
        (added, or overwritten if it already exists).
    query (str): The query string to compare against the DataFrame.
    top_k (int): Number of best-matching rows to return. Defaults to 3.

    Returns:
    DataFrame: The top_k rows with the highest similarity, best match first.
    """
    # Embed the query and convert it to an array once, not once per row.
    query_vector = np.array(get_embedding(query))
    df['similarity'] = df.embedding.apply(lambda x: cosine_similarity(x, query_vector))

    return df.sort_values(by='similarity', ascending=False).head(top_k)

# Retrieve the rows of df that are most similar to the query sentence
# (return_answer_candidate returns the top three rows, not a single text).
sim_result = return_answer_candidate(df, "아무것도 안 먹었더니 꼬르륵 소리가 나네")
print("Top 3 Similar Documents:")
# Bare expression: presumably left as the notebook cell's final value so the
# resulting frame is rendered as the cell output.
sim_result

Query Embedding: [-0.01663736067712307, -0.02178889885544777, 0.015218060463666916, -0.02722955122590065, -0.03682297468185425, 0.011774941347539425, -0.0345626063644886, -0.006715396419167519, -0.02399669960141182, -0.016821345314383507, -0.008134697563946247, 0.010822169482707977, -0.01055933628231287, -0.022669389843940735, 0.011308412067592144, -0.004934699274599552, 0.012070628814399242, -0.0029240227304399014, 0.008134697563946247, -0.016137978062033653, 0.00137576914858073, -0.014547835104167461, 0.018253788352012634, -0.012701429426670074, 0.0032525646965950727, 0.006432850379496813, 0.005342090968042612, -0.019016005098819733, -0.00918603129684925, -0.0017347012180835009, 0.034510038793087006, -0.016440236940979958, 0.00011386029655113816, 0.003149074036628008, 0.007372479885816574, -0.0054800789803266525, -0.006354000419378281, 0.0034726879093796015, -0.003784802509471774, -0.002250511897727847, -0.02243283949792385, -0.010362211614847183, 0.011354408226907253, -0.02473263442

Unnamed: 0,text,embedding,similarity
2,굶머서 허기가 지네요,"[-0.004118344280868769, -0.02247757278382778, ...",0.827823
5,스팀에어프라이어로 연어구이 해먹을 거야,"[-0.0034524151124060154, -0.030498342588543892...",0.820306
0,저는 배가 고파요,"[-0.016708848997950554, -0.021796436980366707,...",0.812257
