In [None]:
# Toy corpus for the retrieval demo: five Japanese sentences and one English sentence.
documents = [
    "日本で一番高い山は富士山です。",
    "富士山は日本で最も大きい山です。",
    "Mt. Fuji is the highest mountain in Japan.",
    "富士山プリンは日本で一番おいしい。",
    "エベレストは世界で一番高い山です。",
    "アメリカには自由の女神があります。"
]

In [None]:
# Search query; the Japanese text asks what rank Mt. Fuji holds among Japan's highest mountains.
query = "富士山は日本で何番目に高い山ですか？"

In [None]:
# キーワード検索

import numpy as np
import faiss
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from janome.tokenizer import Tokenizer as JanomeTokenizer
from rank_bm25 import BM25Okapi

# Janome tokenizer instance created once at module level; tokenize() below reads it as a global.
janome_tokenizer = JanomeTokenizer()
def tokenize(text):
    """Tokenize ``text`` with Janome and keep only selected parts of speech.

    A token is kept when the first comma-separated field of its
    ``part_of_speech`` string is one of 名詞 (noun), 動詞 (verb),
    形容詞 (adjective) or 副詞 (adverb). Everything else is dropped.

    Args:
        text: The string to tokenize.

    Returns:
        list: Surface forms of the kept tokens, in order of appearance.
    """
    content_pos = ("名詞", "動詞", "形容詞", "副詞")
    return [
        morpheme.surface
        for morpheme in janome_tokenizer.tokenize(text)
        # Only the top-level category (text before the first comma) is compared.
        if morpheme.part_of_speech.partition(",")[0] in content_pos
    ]

# Keyword search: build a BM25 index over the tokenized corpus and score the query against it.
tokenized_docs = [tokenize(doc) for doc in documents]
bm25 = BM25Okapi(tokenized_docs)
tokenized_query = tokenize(query)
bm25_scores = bm25.get_scores(tokenized_query)

# Print the five best-scoring documents, highest score first. The cut is 5 so
# the listing matches its "top 5" heading and the top-5 candidate selection
# used for reranking (the previous [:6] slice printed every document).
print("=== キーワード検索上位5件 ===")
for i in np.argsort(bm25_scores)[::-1][:5]:
    print(f"{documents[i]} (score={bm25_scores[i]:.2f})")

In [None]:
# リランキング

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# Identifier passed to from_pretrained for the reranker, and the device to run it on
# (CUDA when torch reports it available, otherwise CPU).
RERANK_MODEL_DIR = "hotchpotch/japanese-bge-reranker-v2-m3-v1"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and sequence-classification model, then move the model to DEVICE.
tokenizer = AutoTokenizer.from_pretrained(RERANK_MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(RERANK_MODEL_DIR).to(DEVICE)

# Documents with the five highest keyword-search (BM25) scores, best first.
bm25_top5_idx = np.argsort(bm25_scores)[::-1][:5]
bm25_top5_docs = [documents[i] for i in bm25_top5_idx]

def rerank(query, candidate_docs):
    """Score every candidate against ``query`` and rank them by model score.

    Relies on the module-level ``tokenizer``, ``model`` and ``DEVICE``.

    Args:
        query: Query string; it is paired with each candidate document.
        candidate_docs: Candidate document strings. Any iterable is accepted
            and is materialized to a list before use.

    Returns:
        tuple: ``(rerank_order, scores)``. ``scores[i]`` is the model logit for
        ``candidate_docs[i]``; ``rerank_order`` holds candidate indices sorted
        from highest to lowest score. Both are empty NumPy arrays when there
        are no candidates.
    """
    candidate_docs = list(candidate_docs)

    # Nothing to score: return early rather than pushing an empty batch
    # through the tokenizer and the model.
    if not candidate_docs:
        return np.array([], dtype=np.intp), np.array([], dtype=np.float32)

    # Encode (query, document) pairs as one padded batch on the model's device.
    inputs = tokenizer(
        [query] * len(candidate_docs),
        candidate_docs,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(DEVICE)

    # Inference only; gradients are not needed.
    with torch.no_grad():
        outputs = model(**inputs)
        # Drop the trailing logit dimension so there is one score per candidate.
        scores = outputs.logits.squeeze(-1).cpu().numpy()

    # argsort is ascending; reverse it to put the best candidate first.
    rerank_order = np.argsort(scores)[::-1]
    return rerank_order, scores

# Rerank the keyword-search top-5 candidates against the query.
rerank_order, rerank_scores = rerank(query, bm25_top5_docs)

# Print candidates in reranked order with a 1-based rank and the score to four decimals.
print("=== リランキング結果 ===")
for i, idx in enumerate(rerank_order):
    print(f"{i+1}. {bm25_top5_docs[idx]} (score={rerank_scores[idx]:.4f})")