### from : 02.bm25_dense_comparision.md

In [1]:
import os
from dotenv import load_dotenv

# Load the .env file.
load_dotenv()

# Read the settings from environment variables. os.getenv returns None for a
# variable that is not set, so any of the string settings below may be None.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_LLM_MODEL = os.getenv("OPENAI_LLM_MODEL")
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_REGION = os.getenv("PINECONE_INDEX_REGION")
PINECONE_INDEX_CLOUD = os.getenv("PINECONE_INDEX_CLOUD")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_INDEX_METRIC = os.getenv("PINECONE_INDEX_METRIC")

# The dimension is the only setting converted to int. Check for a missing value
# first so the failure names the variable instead of surfacing as the opaque
# "int() argument must be ... not 'NoneType'" error. The exception type
# (TypeError) is the same one int(None) raised before.
_pinecone_dimension_raw = os.getenv("PINECONE_INDEX_DIMENSION")
if _pinecone_dimension_raw is None:
    raise TypeError(
        "PINECONE_INDEX_DIMENSION is not set; define it as an integer in the "
        "environment or in the .env file"
    )
# A non-numeric value still raises ValueError from int(), as before.
PINECONE_INDEX_DIMENSION = int(_pinecone_dimension_raw)

print("환경 변수 로딩 완료")

환경 변수 로딩 완료


In [2]:
import pandas as pd

# Load the document and query data from the CSV files in the working directory.
documents_df, queries_df = map(pd.read_csv, ("documents.csv", "queries.csv"))

# Report how many rows each file contributed.
print(
    f"문서 수: {len(documents_df)}",
    f"질의 수: {len(queries_df)}",
    sep="\n",
)

문서 수: 30
질의 수: 30


In [1]:
%pip install mecab-python3

Collecting mecab-python3
  Downloading mecab_python3-1.0.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (6.2 kB)
Downloading mecab_python3-1.0.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (567 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 567.5/567.5 kB 12.7 MB/s eta 0:00:00
Installing collected packages: mecab-python3
Successfully installed mecab-python3-1.0.10
Note: you may need to restart the kernel to use updated packages.


In [3]:
from mecab import MeCab

# NOTE(review): the lowercase `mecab` module imported above appears to be the
# one shipped by `python-mecab-ko`; `mecab-python3` exposes a module named
# `MeCab` instead, which would explain the ModuleNotFoundError recorded for this
# cell. Confirm which package is installed in the kernel.
mecab = MeCab()
# Print the output of pos() for a sample sentence as a quick smoke test.
print(mecab.pos("한국어를 처리하는 예시입니다."))


ModuleNotFoundError: No module named 'mecab'

In [3]:
from eunjeon import Mecab
from rank_bm25 import BM25Okapi

# Example: tokenize the documents with the Mecab morphological analyzer.
# The dictionary path can be overridden with the MECAB_DIC_PATH environment
# variable; the previous hard-coded path stays as the default. The recorded run
# failed because that x86_64-specific directory did not exist on the machine.
# NOTE(review): `ipadic` is normally the Japanese MeCab dictionary; Korean text
# usually needs a Korean dictionary (e.g. mecab-ko-dic) - confirm the path.
mecab = Mecab(os.getenv("MECAB_DIC_PATH", '/usr/lib/x86_64-linux-gnu/mecab/dic/ipadic'))

# morphs() must be called on the analyzer instance. Calling it on the Mecab
# class binds the document text to `self` and fails before any tokenizing.
tokenized_docs = [mecab.morphs(content) for content in documents_df['content']]

# Build the BM25 index over the tokenized corpus (one token list per document).
bm25 = BM25Okapi(tokenized_docs)

def bm25_search_mecab(query, top_k=5):
    """Return the doc_ids of the ``top_k`` highest-scoring BM25 matches.

    Args:
        query: Query text; it is tokenized with the module-level ``mecab``.
        top_k: Maximum number of doc_ids to return.

    Returns:
        List of ``documents_df['doc_id']`` values ordered by descending BM25
        score. Documents with equal scores keep their corpus order.
    """
    scores = bm25.get_scores(mecab.morphs(query))
    # Pair each corpus position with its score; sorted() is stable, so ties
    # stay in ascending position order even with reverse=True.
    by_score = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    doc_ids = documents_df['doc_id']
    return [doc_ids.iloc[position] for position, _ in by_score[:top_k]]

# Smoke test: print the top-5 doc_ids that the BM25 search returns for a sample query.
print("Mecab 기반 BM25 검색:", bm25_search_mecab("제주도 관광 명소", top_k=5))

Exception: The MeCab dictionary does not exist at "/usr/lib/x86_64-linux-gnu/mecab/dic/ipadic". Is the dictionary correctly installed?
You can also try entering the dictionary path when initializing the Mecab class: "Mecab('/some/dic/path')"

In [None]:
from pinecone import Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

# Create the Pinecone client with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Handle to the index named by PINECONE_INDEX_NAME.
# NOTE(review): `index` is not referenced again in the visible cells; the vector
# store below is configured by index name rather than by this handle.
index = pc.Index(PINECONE_INDEX_NAME)  

# Embedding model used for Dense Retrieval queries.
embedding_model = OpenAIEmbeddings(
    model=OPENAI_EMBEDDING_MODEL,
    openai_api_key=OPENAI_API_KEY
)

# Vector store bound to the same index name.
# NOTE(review): no API key is passed here - presumably langchain_pinecone picks
# up PINECONE_API_KEY from the environment; confirm.
vector_store = PineconeVectorStore(
    index_name=PINECONE_INDEX_NAME,
    embedding=embedding_model
)

# NOTE(review): the message hard-codes the index name 'ir', while the actual
# name comes from PINECONE_INDEX_NAME.
print("기존 'ir' 인덱스에 연결하여 Dense Retrieval 설정 완료")

In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score

# Helper functions for handling multiple correct answers and relevance grades.
def parse_relevant(relevant_str):
    """Parse a graded relevance string into a ``{doc_id: grade}`` dict.

    The expected format is ``'D6=3;D14=2;D26=1'``: ``;``-separated
    ``doc_id=grade`` pairs with integer grades. Whitespace around the pairs,
    ids and grades is ignored, as are empty segments such as a trailing ``;``.

    Args:
        relevant_str: Relevance judgments in the format above. ``None``, NaN
            (what pandas produces for an empty CSV cell) and blank strings
            are treated as "no relevant documents".

    Returns:
        Dict mapping each doc_id string to its integer grade; empty when
        there are no judgments.

    Raises:
        ValueError: If a non-empty segment is not exactly one
            ``doc_id=grade`` pair, or the grade is not an integer.
    """
    # NaN is the only float that compares unequal to itself.
    is_nan = isinstance(relevant_str, float) and relevant_str != relevant_str
    if relevant_str is None or is_nan:
        return {}

    rel_dict = {}
    for pair in relevant_str.split(';'):
        pair = pair.strip()
        if not pair:
            # Empty input or a stray separator: nothing to record.
            continue
        doc_id, grade = pair.split('=')
        rel_dict[doc_id.strip()] = int(grade)
    return rel_dict

def compute_metrics(predicted, relevant_dict, k=5):
    """Compute Precision@k, Recall@k, reciprocal rank and AP@k for one query.

    Args:
        predicted: Ranked list of doc_ids, best match first.
        relevant_dict: ``{doc_id: grade}`` relevance judgments. A document
            counts as relevant only when its grade is >= 1.
        k: Rank cutoff; must be a positive integer.

    Returns:
        Tuple ``(precision, recall, rr, ap)``:
            precision: relevant documents in the top k, divided by k.
            recall: relevant documents in the top k, divided by the total
                number of relevant documents (0 when there are none).
            rr: 1 / rank of the first relevant document anywhere in
                ``predicted`` (not cut off at k), or 0 when none is found.
            ap: sum of precision-at-rank over the relevant documents in the
                top k, divided by the total number of relevant documents
                (0 when there are none).

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Enforce the grade >= 1 rule so zero-grade judgments are not counted as hits.
    relevant = {doc for doc, grade in relevant_dict.items() if grade >= 1}
    total_relevant = len(relevant)

    # One pass over the top k gives both the hit count and the AP numerator.
    hits = 0
    precision_sum = 0.0
    for rank, doc in enumerate(predicted[:k], start=1):
        if doc in relevant:
            hits += 1
            precision_sum += hits / rank

    precision = hits / k
    recall = hits / total_relevant if total_relevant > 0 else 0

    # Reciprocal rank of the first relevant document over the whole ranking.
    rr = next(
        (1 / rank for rank, doc in enumerate(predicted, start=1) if doc in relevant),
        0,
    )

    # Normalize by the number of relevant documents, not by the number that
    # happened to be retrieved; averaging over hits only would give AP = 1.0
    # to a ranking that finds a single relevant document at rank 1 and misses
    # all the others.
    ap = precision_sum / total_relevant if total_relevant > 0 else 0
    return precision, recall, rr, ap

def evaluate_all(method_results, queries_df, k=5):
    """Average the per-query metrics of one retrieval method.

    Args:
        method_results: Dict mapping each query_id to its ranked doc_id list.
        queries_df: DataFrame with ``query_id`` and ``relevant_doc_ids``
            columns; the latter is parsed with ``parse_relevant``.
        k: Rank cutoff forwarded to ``compute_metrics``.

    Returns:
        Dict with keys ``'P@k'``, ``'R@k'``, ``'MRR'`` and ``'MAP'``, each the
        mean over all queries of the matching ``compute_metrics`` value.
    """
    per_query = []
    for _, query_row in queries_df.iterrows():
        query_id = query_row['query_id']
        judgments = parse_relevant(query_row['relevant_doc_ids'])
        ranking = method_results[query_id]
        per_query.append(compute_metrics(ranking, judgments, k))

    # compute_metrics returns (precision, recall, rr, ap) in this order.
    labels = ('P@k', 'R@k', 'MRR', 'MAP')
    return {
        label: np.mean([scores[position] for scores in per_query])
        for position, label in enumerate(labels)
    }

In [None]:
# Store the BM25 results as {query_id: [doc_ids...]}.
bm25_results = {
    query_row['query_id']: bm25_search_mecab(query_row['query_text'], top_k=5)
    for _, query_row in queries_df.iterrows()
}

# Store the Dense Retrieval results in the same shape; the doc_id of each hit is
# read from the metadata of the returned documents.
dense_results = {}
for _, query_row in queries_df.iterrows():
    query_id = query_row['query_id']
    matches = vector_store.similarity_search(query_row['query_text'], k=5)
    dense_results[query_id] = [match.metadata['doc_id'] for match in matches]

print("BM25 & Dense Retrieval 결과 수집 완료")

In [None]:
# Evaluate BM25 and Dense Retrieval at the same cutoff.
bm25_metrics = evaluate_all(bm25_results, queries_df, k=5)
dense_metrics = evaluate_all(dense_results, queries_df, k=5)

import pandas as pd

# Keys returned by evaluate_all, paired with the labels shown in the table.
_metric_labels = {'P@k': 'P@5', 'R@k': 'R@5', 'MRR': 'MRR', 'MAP': 'MAP'}

# One row per metric, one column per retrieval method.
df_metrics = pd.DataFrame({
    'Metric': list(_metric_labels.values()),
    'BM25': [bm25_metrics[key] for key in _metric_labels],
    'Dense': [dense_metrics[key] for key in _metric_labels],
})

df_metrics