### from : 03.rrf_comparision.md

In [8]:
import os
from dotenv import load_dotenv

# .env 파일 로드
load_dotenv()

# 환경 변수 가져오기
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_LLM_MODEL = os.getenv("OPENAI_LLM_MODEL")
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_REGION = os.getenv("PINECONE_INDEX_REGION")
PINECONE_INDEX_CLOUD = os.getenv("PINECONE_INDEX_CLOUD")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_INDEX_METRIC = os.getenv("PINECONE_INDEX_METRIC")
PINECONE_INDEX_DIMENSION = int(os.getenv("PINECONE_INDEX_DIMENSION"))

print("환경 변수 로딩 완료")

환경 변수 로딩 완료


In [9]:
import pandas as pd

# 문서 및 질의 데이터 로드
documents_df = pd.read_csv("../../datas/documents.csv")
queries_df = pd.read_csv("../../datas/queries.csv")

print(f"문서 수: {len(documents_df)}")
print(f"질의 수: {len(queries_df)}")

문서 수: 30
질의 수: 30


In [10]:
from konlpy.tag import Okt
from rank_bm25 import BM25Okapi

# 예시: Mecab 형태소 분석기로 문서 토큰화
mecab = Okt()

tokenized_docs = [mecab.morphs(content) for content in documents_df['content']]
bm25 = BM25Okapi(tokenized_docs)

def bm25_search(query, top_k=5):
    query_tokens = mecab.morphs(query)
    scores = bm25.get_scores(query_tokens)
    ranked_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    return [documents_df['doc_id'].iloc[i] for i in ranked_idx[:top_k]]

print("Mecab 기반 BM25 검색:", bm25_search("제주도 관광 명소", top_k=5))

Mecab 기반 BM25 검색: ['D1', 'D12', 'D2', 'D3', 'D4']


In [11]:
from pinecone import Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(api_key=PINECONE_API_KEY)

index = pc.Index(PINECONE_INDEX_NAME)  

embedding_model = OpenAIEmbeddings(
    model=OPENAI_EMBEDDING_MODEL,
    openai_api_key=OPENAI_API_KEY
)

vector_store = PineconeVectorStore(
    index_name=PINECONE_INDEX_NAME,
    embedding=embedding_model
)

print("기존 'ir' 인덱스에 연결하여 Dense Retrieval 설정 완료")

기존 'ir' 인덱스에 연결하여 Dense Retrieval 설정 완료


In [12]:
import numpy as np
from sklearn.metrics import precision_score, recall_score

def parse_relevant(relevant_str):
    pairs = relevant_str.split(';')
    rel_dict = {}
    for pair in pairs:
        doc_id, grade = pair.split('=')
        rel_dict[doc_id] = int(grade)
    return rel_dict

def compute_metrics(predicted, relevant_dict, k=5):
    hits = sum(1 for doc in predicted[:k] if doc in relevant_dict)
    precision = hits / k
    total_relevant = len(relevant_dict)
    recall = hits / total_relevant if total_relevant > 0 else 0
    rr = 0
    for idx, doc in enumerate(predicted):
        if doc in relevant_dict:
            rr = 1 / (idx + 1)
            break
    num_correct = 0
    precisions = []
    for i, doc in enumerate(predicted[:k]):
        if doc in relevant_dict:
            num_correct += 1
            precisions.append(num_correct / (i + 1))
    ap = np.mean(precisions) if precisions else 0
    return precision, recall, rr, ap

def evaluate_all(method_results, queries_df, k=5):
    prec_list, rec_list, rr_list, ap_list = [], [], [], []
    for idx, row in queries_df.iterrows():
        qid = row['query_id']
        relevant_dict = parse_relevant(row['relevant_doc_ids'])
        predicted = method_results[qid]
        p, r, rr, ap = compute_metrics(predicted, relevant_dict, k)
        prec_list.append(p)
        rec_list.append(r)
        rr_list.append(rr)
        ap_list.append(ap)
    return {
        'P@5': np.mean(prec_list),
        'R@5': np.mean(rec_list),
        'MRR': np.mean(rr_list),
        'MAP': np.mean(ap_list)
    }

In [13]:
# RRF 결합 함수
def rrf_rank(bm25_list, dense_list, k=60):
    # bm25_list, dense_list: 상위 20개 문서 ID 리스트
    candidate_scores = {}
    for rank, doc in enumerate(bm25_list):
        candidate_scores[doc] = candidate_scores.get(doc, 0) + 1 / (rank + 1 + k)
    for rank, doc in enumerate(dense_list):
        candidate_scores[doc] = candidate_scores.get(doc, 0) + 1 / (rank + 1 + k)
    # 점수 정렬
    ranked = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)
    return [doc for doc, _ in ranked]

# BM25 상위 20, Dense 상위 20, RRF 상위 5 결과 생성
bm25_candidates = {}
dense_candidates = {}
rrf_results = {}
for idx, row in queries_df.iterrows():
    qid = row['query_id']
    query_text = row['query_text']
    bm25_top20 = bm25_search(query_text, top_k=20)
    dense_top20 = [doc.metadata['doc_id'] for doc in vector_store.similarity_search(query_text, k=20)]
    bm25_candidates[qid] = bm25_top20
    dense_candidates[qid] = dense_top20
    rrf_list = rrf_rank(bm25_top20, dense_top20, k=60)
    rrf_results[qid] = rrf_list[:5]

print("RRF 결합 완료 (상위 5개 저장됨)")
# 예시 확인
print("질의 Q1 RRF 상위 5:", rrf_results[queries_df.loc[0, 'query_id']])

RRF 결합 완료 (상위 5개 저장됨)
질의 Q1 RRF 상위 5: ['D1', 'D2', 'D12', 'D8', 'D17']


In [14]:
import pandas as pd

# BM25 상위 5 리스트 생성
bm25_results_5 = {qid: lst[:5] for qid, lst in bm25_candidates.items()}
# Dense 상위 5 리스트 생성
dense_results_5 = {qid: lst[:5] for qid, lst in dense_candidates.items()}

# 평가
bm25_metrics = evaluate_all(bm25_results_5, queries_df, k=5)
dense_metrics = evaluate_all(dense_results_5, queries_df, k=5)
rrf_metrics = evaluate_all(rrf_results, queries_df, k=5)

# 결과 테이블
df_metrics = pd.DataFrame({
    'Metric': ['P@5', 'R@5', 'MRR', 'MAP'],
    'BM25': [bm25_metrics['P@5'], bm25_metrics['R@5'], bm25_metrics['MRR'], bm25_metrics['MAP']],
    'Dense': [dense_metrics['P@5'], dense_metrics['R@5'], dense_metrics['MRR'], dense_metrics['MAP']],
    'RRF': [rrf_metrics['P@5'], rrf_metrics['R@5'], rrf_metrics['MRR'], rrf_metrics['MAP']]
})
df_metrics

Unnamed: 0,Metric,BM25,Dense,RRF
0,P@5,0.253333,0.466667,0.266667
1,R@5,0.894444,1.716667,0.927778
2,MRR,0.966667,0.977778,0.983333
3,MAP,0.957407,0.978148,0.955185
