# Env

In [2]:
import argparse
import json
import numpy as np

from mecab import MeCab
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

In [5]:
# Notebook configuration: paths of the evaluation context/question JSON files and
# the model id later handed to SentenceTransformer.
args = argparse.Namespace(
    context="../../data/aihub/eval_context.json",
    question="../../data/aihub/eval_question.json",
    model_id="snunlp/KR-SBERT-V40K-klueNLI-augSTS",
)
args

Namespace(context='../../data/aihub/eval_context.json', question='../../data/aihub/eval_question.json', model_id='snunlp/KR-SBERT-V40K-klueNLI-augSTS')

In [9]:
# Load the context JSON file named by args.context.
# The encoding is pinned to UTF-8: without it open() uses the platform's locale
# encoding, which can fail or mis-decode non-ASCII text on non-UTF-8 systems.
with open(args.context, encoding="utf-8") as f:
    context = json.load(f)

In [10]:
# Load the question JSON file named by args.question.
# The encoding is pinned to UTF-8: without it open() uses the platform's locale
# encoding, which can fail or mis-decode non-ASCII text on non-UTF-8 systems.
with open(args.question, encoding="utf-8") as f:
    question = json.load(f)

In [11]:
# Sanity check: the question file and the context file must hold the same number of entries.
assert len(question) == len(context)

# BM25 (MRR@10: 0.9084407721630294)

In [12]:
# Tokenizer setup backed by a morphological analyzer (MeCab).
# Tokens whose POS tag is listed in EXCLUDE (particles and some other tags) are dropped.
# POS tag table: https://blog.naver.com/aramjo/221404488280
MECAB = MeCab()
EXCLUDE = {
    "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ",
    "JX", "JC",
    "EP", "EF", "EC",
    "ETN", "ETM",
    "SF", "SSC", "SSO", "SY",
}

In [13]:
def tokenizer(sent):
    """Tokenize ``sent`` and drop tokens whose POS tag is excluded.

    Calls ``MECAB.pos(sent)`` and keeps the surface form of every
    ``(surface, tag)`` pair whose tag is not in ``EXCLUDE``.

    Args:
        sent: Text to tokenize.

    Returns:
        list: The kept surface forms, in the order ``MECAB.pos`` produced them.
    """
    return [surface for surface, tag in MECAB.pos(sent) if tag not in EXCLUDE]

In [14]:
# Tokenize every context. context_keys preserves the dict order, so entry i of
# tokenized_contexts belongs to context_keys[i].
context_keys = np.array(list(context))
tokenized_contexts = list(map(tokenizer, context.values()))

In [15]:
# Build the BM25Okapi index from the tokenized contexts
bm25 = BM25Okapi(tokenized_contexts)

In [16]:
# Evaluation: MRR@10 of BM25 retrieval over all questions
score = 0.0
for key, value in tqdm(question.items()):
    # Tokenize the question with the same tokenizer used for the contexts,
    # then score it against every indexed context.
    scores = bm25.get_scores(tokenizer(value["question"]))
    # Keys of the 10 highest-scoring contexts, best first.
    rank_keys = context_keys[np.argsort(-scores)[:10]]
    # 0-based positions in the top 10 whose key equals this question's key.
    hit_positions = np.flatnonzero(rank_keys == key)
    assert hit_positions.size < 2
    if hit_positions.size == 1:
        # Reciprocal rank: convert the 0-based position to a 1-based rank.
        score += 1 / (hit_positions[0] + 1)
# Questions with no hit in the top 10 contribute 0 to the mean.
print(score / len(question))

  0%|          | 0/10002 [00:00<?, ?it/s]

0.9084407721630294


# Sentence BERT (MRR@10: 0.5799622218413428)

In [17]:
# Load the SentenceTransformer model identified by args.model_id
model = SentenceTransformer(args.model_id)



In [18]:
# Embed the whole corpus once, with normalize_embeddings=True.
# context_keys preserves the dict order, so embedding i belongs to context_keys[i].
context_keys = np.array(list(context))
context_values = list(context.values())
context_embeddings = model.encode(context_values, normalize_embeddings=True)

In [19]:
# Evaluation: MRR@10 of dense (Sentence-BERT) retrieval over all questions.
# All questions are encoded in one batched model.encode call up front; the
# previous version called model.encode once per question inside the loop.
# NOTE(review): batched embeddings should match per-query embeddings up to
# floating-point noise — confirm the reported score is unchanged after re-running.
question_keys = list(question)
question_embeddings = model.encode(
    [question[key]["question"] for key in question_keys],
    normalize_embeddings=True,
)

score = 0.0
for key, question_embedding in zip(tqdm(question_keys), question_embeddings):
    # Dot product of this question's embedding with every context embedding
    # (both sides were encoded with normalize_embeddings=True).
    scores = np.dot(context_embeddings, question_embedding)
    # Indices of the 10 highest-scoring contexts, best first.
    rank = np.argsort(-scores)[:10]  # top 10
    rank_keys = context_keys[rank]
    # 0-based positions in the top 10 whose key equals this question's key.
    result = np.where(rank_keys == key)
    assert len(result[0]) < 2
    if len(result[0]) == 1:
        # Reciprocal rank: convert the 0-based position to a 1-based rank.
        score += 1 / (result[0][0] + 1)
# Questions with no hit in the top 10 contribute 0 to the mean.
print(score / len(question))

  0%|          | 0/10002 [00:00<?, ?it/s]

0.5799622218413428
