### from : 04.hyde_experiment.md

In [1]:
import os
from dotenv import load_dotenv

# .env 파일 로드
load_dotenv()

# 환경 변수 가져오기
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_LLM_MODEL = os.getenv("OPENAI_LLM_MODEL")
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_REGION = os.getenv("PINECONE_INDEX_REGION")
PINECONE_INDEX_CLOUD = os.getenv("PINECONE_INDEX_CLOUD")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_INDEX_METRIC = os.getenv("PINECONE_INDEX_METRIC")
PINECONE_INDEX_DIMENSION = int(os.getenv("PINECONE_INDEX_DIMENSION"))
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

print("환경 변수 로딩 완료")

환경 변수 로딩 완료


In [3]:
import pandas as pd

# 문서 및 질의 데이터 로드
documents_df = pd.read_csv("../../datas/documents.csv")
queries_df = pd.read_csv("../../datas/queries.csv")

print(f"문서 수: {len(documents_df)}")
print(f"질의 수: {len(queries_df)}")

문서 수: 30
질의 수: 30


In [4]:
from konlpy.tag import Okt
from rank_bm25 import BM25Okapi

# Mecab 형태소 분석기 초기화
mecab = Okt()
# 문서 토큰화
tokenized_docs = [mecab.morphs(content) for content in documents_df['content']]
bm25 = BM25Okapi(tokenized_docs)

def bm25_search(query, top_k=20):
    tokens = mecab.morphs(query)
    scores = bm25.get_scores(tokens)
    ranked_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    return [documents_df['doc_id'].iloc[i] for i in ranked_idx[:top_k]]

print("BM25 예시 상위 5:", bm25_search("제주도 관광 명소", top_k=5))

BM25 예시 상위 5: ['D1', 'D12', 'D2', 'D3', 'D4']


In [5]:
from pinecone import Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

# Pinecone 클라이언트 연결
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)

# 임베딩 모델 생성
embedding_model = OpenAIEmbeddings(model=OPENAI_EMBEDDING_MODEL, openai_api_key=OPENAI_API_KEY)
# Dense 벡터 스토어 설정
vector_store = PineconeVectorStore(index_name=PINECONE_INDEX_NAME, embedding=embedding_model)

print("Dense Retrieval 설정 완료")

  from .autonotebook import tqdm as notebook_tqdm


Dense Retrieval 설정 완료


In [6]:
import numpy as np

def parse_relevant(relevant_str):
    pairs = relevant_str.split(';')
    rel_dict = { }
    for pair in pairs:
        doc_id, grade = pair.split('=')
        rel_dict[doc_id] = int(grade)
    return rel_dict

def compute_metrics(predicted, relevant_dict, k=5):
    hits = sum(1 for doc in predicted[:k] if doc in relevant_dict)
    precision = hits / k
    total_relevant = len(relevant_dict)
    recall = hits / total_relevant if total_relevant > 0 else 0
    rr = 0
    for idx, doc in enumerate(predicted):
        if doc in relevant_dict:
            rr = 1 / (idx + 1)
            break
    num_correct = 0
    precisions = []
    for i, doc in enumerate(predicted[:k]):
        if doc in relevant_dict:
            num_correct += 1
            precisions.append(num_correct / (i + 1))
    ap = np.mean(precisions) if precisions else 0
    return precision, recall, rr, ap

def evaluate_all(results_dict, queries_df, k=5):
    prec_list, rec_list, rr_list, ap_list = [], [], [], []
    for idx, row in queries_df.iterrows():
        qid = row['query_id']
        relevant = parse_relevant(row['relevant_doc_ids'])
        predicted = results_dict[qid]
        p, r, rr, ap = compute_metrics(predicted, relevant, k)
        prec_list.append(p)
        rec_list.append(r)
        rr_list.append(rr)
        ap_list.append(ap)
    return {
        'P@5': np.mean(prec_list),
        'R@5': np.mean(rec_list),
        'MRR': np.mean(rr_list),
        'MAP': np.mean(ap_list)
    }

In [7]:
# BM25 상위 20 후보 생성
bm25_candidates = {}
for idx, row in queries_df.iterrows():
    qid = row['query_id']
    query_text = row['query_text']
    bm25_candidates[qid] = bm25_search(query_text, top_k=20)

# Dense Retrieval 상위 20 후보 생성
dense_candidates = {}
for idx, row in queries_df.iterrows():
    qid = row['query_id']
    query_text = row['query_text']
    docs = vector_store.similarity_search(query_text, k=20)
    dense_candidates[qid] = [doc.metadata['doc_id'] for doc in docs]

print("BM25 & Dense 후보 생성 완료")

BM25 & Dense 후보 생성 완료


In [8]:
import cohere
import time
from cohere import TooManyRequestsError

# Cohere 클라이언트 초기화
co = cohere.Client(COHERE_API_KEY)

def cohere_rerank(query, candidate_ids):
    texts = [
        documents_df.loc[documents_df['doc_id'] == cid, 'content'].values[0]
        for cid in candidate_ids
    ]
    try:
        response = co.rerank(
            model='rerank-multilingual-v3.0',
            query=query,
            documents=texts
        )
        ranked = sorted(response.results, key=lambda x: x.relevance_score, reverse=True)
        return [candidate_ids[r.index] for r in ranked]

    except TooManyRequestsError:
        # 429 에러 발생 시 잠깐 대기 후 재시도
        print("TooManyRequestsError 발생, 10초 대기 후 재시도합니다.")
        time.sleep(10)
        response = co.rerank(
            model='rerank-multilingual-v3.0',
            query=query,
            documents=texts
        )
        ranked = sorted(response.results, key=lambda x: x.relevance_score, reverse=True)
        return [candidate_ids[r.index] for r in ranked]

# Reranked results 저장 (상위 5)
rerank_results = {}
for idx, row in queries_df.iterrows():
    qid = row['query_id']
    candidates = list(dict.fromkeys(bm25_candidates[qid] + dense_candidates[qid]))
    query_text = row['query_text']
    rerank_results[qid] = cohere_rerank(query_text, candidates)[:5]
    time.sleep(6)  # 호출 간 최소 6초 대기 → 분당 약 10회 이하로 제한

print("Cohere Reranking 완료 (상위 5 저장됨)")

Cohere Reranking 완료 (상위 5 저장됨)


In [9]:
import pandas as pd

# BM25 상위 5, Dense 상위 5
bm25_results_5 = {qid: lst[:5] for qid, lst in bm25_candidates.items()}
dense_results_5 = {qid: lst[:5] for qid, lst in dense_candidates.items()}

# 평가
bm25_metrics = evaluate_all(bm25_results_5, queries_df, k=5)
dense_metrics = evaluate_all(dense_results_5, queries_df, k=5)
rerank_metrics = evaluate_all(rerank_results, queries_df, k=5)

df_metrics = pd.DataFrame({
    'Metric': ['P@5', 'R@5', 'MRR', 'MAP'],
    'BM25': [bm25_metrics['P@5'], bm25_metrics['R@5'], bm25_metrics['MRR'], bm25_metrics['MAP']],
    'Dense': [dense_metrics['P@5'], dense_metrics['R@5'], dense_metrics['MRR'], dense_metrics['MAP']],
    'Cohere': [rerank_metrics['P@5'], rerank_metrics['R@5'], rerank_metrics['MRR'], rerank_metrics['MAP']]
})
df_metrics

Unnamed: 0,Metric,BM25,Dense,Cohere
0,P@5,0.253333,0.466667,0.26
1,R@5,0.894444,1.716667,0.922222
2,MRR,0.966667,0.977778,0.966667
3,MAP,0.957407,0.978148,0.964074
