In [4]:
import math
from collections import Counter
from typing import List
import jieba
import json
from pathlib import Path
import os

import jieba
from tqdm import tqdm
from utils import BM25, filter_stop
import pandas as pd

In [6]:
class BM25:
    """Okapi-BM25 ranker over an in-memory list of text passages.

    Every passage is tokenised with ``jieba.lcut``, passed through
    ``filter_stop`` and stripped of single-space tokens. Per-document term
    counts, document lengths and an inverted index are built once in the
    constructor; ``query`` then scores every passage against a query string.

    Args:
        corpus: List of raw passage strings. An empty list is accepted and
            produces an index that ranks nothing.
        k1: Term-frequency saturation parameter.
        b: Document-length normalisation parameter.

    Raises:
        AssertionError: If ``corpus`` is not a list of strings.
    """

    def __init__(self, corpus: List[str], k1=1.5, b=0.75):
        assert isinstance(corpus, list), "Corpus must be a list of documents"
        assert all([isinstance(c, str) for c in corpus]), "Corpus must be a list of strings"

        self.k1 = k1
        self.b = b
        # doc_id -> original (untokenised) passage text.
        self.corpus = dict(enumerate(corpus))
        self.doc_count = len(corpus)

        tokenized = [self._tokenize_document(para) for para in corpus]
        self.doc_lengths = [len(tokens) for tokens in tokenized]
        # An empty corpus has no average length; 0.0 avoids a ZeroDivisionError here.
        self.avg_doc_length = sum(self.doc_lengths) / self.doc_count if self.doc_count else 0.0
        self.doc_term_freqs = [Counter(tokens) for tokens in tokenized]
        self.build_inverted_index()

    @staticmethod
    def _tokenize_document(text):
        """Split one passage into the index terms used for scoring."""
        # NOTE(review): filter_stop comes from utils and is assumed to yield str tokens - confirm.
        filtered = filter_stop(jieba.lcut(text))
        # query() lower-cases its terms, so indexed terms are lower-cased too;
        # otherwise any token containing an upper-case letter could never match.
        return [token.lower() for token in filtered if token != ' ']

    def build_inverted_index(self):
        """Populate ``self.inverted_index`` as term -> list of (doc_id, frequency)."""
        self.inverted_index = {}
        for doc_id, doc_term_freq in enumerate(self.doc_term_freqs):
            for term, freq in doc_term_freq.items():
                self.inverted_index.setdefault(term, []).append((doc_id, freq))

    def idf(self, term):
        """Return log((N - n + 0.5) / (n + 0.5) + 1) for ``term``, or 0 if it is not indexed.

        ``N`` is the number of documents and ``n`` the number of documents
        that contain ``term``.
        """
        doc_freq = len(self.inverted_index.get(term, []))
        if doc_freq == 0:
            return 0
        return math.log((self.doc_count - doc_freq + 0.5) / (doc_freq + 0.5) + 1.0)

    def bm25_score(self, query_terms, doc_id):
        """Return the BM25 score of document ``doc_id`` for ``query_terms``.

        Args:
            query_terms: Iterable of already-tokenised query terms.
            doc_id: Index of the document in the original corpus list.

        Returns:
            Sum over the query terms of ``idf * tf * (k1 + 1) / (tf + norm)``,
            where ``norm = k1 * (1 - b + b * doc_length / avg_doc_length)``.
            Terms that do not occur in the document contribute nothing.
        """
        term_freqs = self.doc_term_freqs[doc_id]
        doc_length = self.doc_lengths[doc_id]
        # avg_doc_length is 0 when every document tokenised to nothing; no term
        # can match in that case, so the ratio value is irrelevant.
        length_ratio = doc_length / self.avg_doc_length if self.avg_doc_length else 0.0
        # Loop-invariant part of the denominator.
        norm = self.k1 * (1 - self.b + self.b * length_ratio)

        score = 0.0
        for term in query_terms:
            tf = term_freqs.get(term, 0)
            if tf == 0:
                # A zero numerator adds nothing; skipping also avoids 0/0 when norm is 0.
                continue
            score += self.idf(term) * (tf * (self.k1 + 1) / (tf + norm))
        return score

    def query(self, query):
        """Rank every document against ``query``.

        Args:
            query: Raw query string; it is tokenised with ``jieba.cut`` and lower-cased.

        Returns:
            A list of ``(document_text, score)`` tuples covering the whole
            corpus, sorted by descending score.
        """
        query_terms = [w.lower() for w in jieba.cut(query) if w.lower() != ' ']
        docs_w_scores = [
            (text, self.bm25_score(query_terms, doc_id))
            for doc_id, text in self.corpus.items()
        ]
        return sorted(docs_w_scores, key=lambda pair: pair[1], reverse=True)

In [7]:
# Prompt sent to the LLM for each query. It is a str.format template with two
# named placeholders: {question} (the query text) and {content} (the retrieved
# passages joined into one string). Each placeholder is wrapped in a ``` fence.
prompt_template = '''You are a helpful assistant, please answer the following question based on the given content:

Question:
```
{question}
```

Content:
```
{content}
```

Just give a simple answer, do not include any additional information or explaination.
'''

# Alternative, more detailed QA prompt with one worked example. It is a
# str.format template taking {question} and {content}; literal braces in the
# JSON schema and the example are escaped as {{ }}.
# The "Context" slot is filled from {content} and the "Question" slot from
# {question}, matching the layout of the worked example below.
qa_instruction = '''
You are an expert in information evaluation and critical thinking. Your task is to find the answer to a given question from a passage of text. You must carefully read every word and think through each step without overlooking any details. Your output should contain two fields: `Reasoning` and `Response`. In `Reasoning`, document your logical thought process in a clear, concise manner. If you find the answer, write it in the `Response` field; if not, try your best to guess one. The `Reasoning` should end with '*' to indicate completion.

Objective: The task is to carefully analyze a passage of text to determine whether it contains the answer to a given question. The evaluation must be detailed, with clear reasoning, and identify the correct answer if present, or confirm its absence.

You are provided with the following inputs:

1. Context: {content}
2. Question: {question}

Based on these inputs, provide a step-by-step explanation to identify the correct answer from the content. If you cannot find the answer in the passage, try to guess the answer. Your response should only contain the answer itself. Do not explain, provide notes, or include any additional text, punctuation, or preposition (e.g., 'on', 'at'), or articles (e.g., 'a', 'an', 'the') unless absolutely necessary.

Output format: 

-----
SCHEMA
-----

{{
    "Reasoning": "Step-by-step reasoning explaining how the answer is inferenced to satisfy the question.",
    "Response": "The answer itself, as simple as possible."
}}

-----

1. Context: ```Pilotwings 64\nPilotwings 64 (Japanese: パイロットウイングス64, Hepburn: Pairottouingusu Rokujūyon) is a video game for the Nintendo 64, originally released in 1996 along with the debut of the console. The game was co-developed by Nintendo and the American visual technology group Paradigm Simulation. It was one of three launch titles for the Nintendo 64 in Japan as well as Europe and one of two launch titles in North America. Pilotwings 64 is a follow-up to Pilotwings for the Super Nintendo Entertainment System (SNES), which was a North American launch game for its respective console in 1991. Also like that game, Pilotwings 64 received production input from Nintendo producer Shigeru Miyamoto.```
2. Question: Who is a Japanese video game designer and producer, currently serving as the co-Representative Director of Nintendo, who gave production input to a video game for the Nintendo 64, originally released in 1996 along with the debut of the console?

-----

output:

{{
    "Reasoning": "The context mentions that 'Pilotwings 64' was a video game released in 1996 for the Nintendo 64. The game received production input from Nintendo producer Shigeru Miyamoto. This aligns with the question, which asks for a Japanese video game designer and producer who gave production input to a Nintendo 64 game released in 1996. Additionally, Shigeru Miyamoto is well known as a prominent figure at Nintendo and is currently serving as the co-Representative Director of the company. Therefore, the content fully supports that Shigeru Miyamoto is the correct answer to the question.*", 
    "Response": "Shigeru Miyamoto" 
}}

-----

'''

In [8]:
# Load the evaluation set; the generation loop reads data['query'] and data['answer'].
# The encoding is pinned because open() otherwise decodes with a
# platform-dependent default, which can mangle or reject non-ASCII text.
with open('../../data/LongBench_original.json', encoding='utf-8') as f:
    data = json.load(f)

In [9]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding model handed to FAISS.load_local below.
embed = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')
# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere until this is made relative or configurable.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading a
# serialized store; only safe if the store directory is trusted - confirm.
faiss_index = FAISS.load_local('/Users/wenjiazhai/Documents/GitHub/paper_analyze/store/vanilla_langchain', 
                               embed, allow_dangerous_deserialization=True)
# Pull the raw text of every stored document. This reads the docstore's
# private `_dict` attribute, so it may break on a library upgrade.
all_documents = [doc.page_content for doc in faiss_index.docstore._dict.values()]

  from tqdm.autonotebook import tqdm, trange


In [10]:
# Build the BM25 index over every passage text extracted from the FAISS docstore.
bm25 = BM25(all_documents)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/77/tngzlz3n44s1cm7spj20fy8m0000gn/T/jieba.cache
Loading model cost 0.386 seconds.
Prefix dict has been built successfully.


In [11]:
from langchain_ollama.llms import OllamaLLM
# LLM client used by the generation loop below (called through llm.invoke).
llm = OllamaLLM(model="qwen2.5:0.5b")

In [12]:
# Number of top-ranked BM25 passages concatenated into each prompt.
k = 6

queries, answers = data['query'], data['answer']
# zip() stops at the shorter input, so a length mismatch would silently drop
# evaluation examples; fail loudly instead.
assert len(queries) == len(answers), "data['query'] and data['answer'] differ in length"

# Parallel lists: res['predict'][i] is the model output for res['query'][i].
res = {'query': [], 'answer': [], 'predict': []}
# total= gives tqdm a real progress bar and ETA; a bare zip() has no length.
for q, a in tqdm(zip(queries, answers), total=len(queries)):
    # Keep only the passage text of the k best hits and join them into one context.
    retrieval = '\n'.join(doc for doc, _score in bm25.query(q)[:k])
    response = llm.invoke(prompt_template.format(question=q, content=retrieval))
    res['query'].append(q)
    res['answer'].append(a)
    res['predict'].append(response)

314it [03:54,  1.34it/s]


In [13]:
# Persist the query / answer / prediction lists as pretty-printed JSON.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open('../../output/bm25_output.json', 'w', encoding='utf-8') as f:
    json.dump(res, f, indent=4, ensure_ascii=False)

In [3]:
def remove_unparseable_json(directory):
    """Delete every ``*.json`` file under ``directory`` that fails to parse.

    The directory is searched recursively. A file is removed only when reading
    it raises a JSON or text-decoding error; other failures (for example an
    unreadable file or a keyboard interrupt) propagate without deleting
    anything. Directories whose names end in ``.json`` are skipped.

    Args:
        directory: Root folder to scan (``str`` or ``Path``).

    Returns:
        List of the ``Path`` objects that were removed, in discovery order.
    """
    removed = []
    for file in Path(directory).rglob('*.json'):
        if not file.is_file():
            continue
        try:
            with open(file, 'r') as f:
                # Parse only to validate; the content itself is not needed here.
                json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f'{file} error')
            os.remove(file)
            removed.append(file)
    return removed


# NOTE(review): absolute, machine-specific path - make it configurable before sharing.
path = Path('/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25')
removed_files = remove_unparseable_json(path)

/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/103.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/174.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/162.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/95.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/227.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/231.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/220.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/26.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/108.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/117.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/249.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/157.json error
/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25/9.json error


In [5]:
# NOTE(review): absolute, machine-specific paths - make them configurable before sharing.
path = Path('/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/BM25').rglob('*.json')

# One record per result file. The list is named `records` rather than `data`
# so it does not overwrite the dataset loaded earlier in the notebook.
records = []
skipped = []
for file in path:
    try:
        with open(file, 'r') as f:
            records.append(json.load(f))
    except (OSError, ValueError) as exc:
        # Still best-effort: unreadable or malformed files (JSON and text
        # decoding errors are ValueError subclasses) are skipped, but reported
        # instead of being dropped silently.
        skipped.append(file)
        print(f'skipped {file}: {exc!r}')

df = pd.DataFrame(records)
# NOTE(review): 'bh25' in the file name looks like a typo for 'bm25'; left as-is
# because other code may read this exact path - confirm before renaming.
df.to_csv('/Users/wenjiazhai/Documents/GitHub/paper_analyze/output/bh25_output_consistency.csv', index=False)

In [6]:
# Mean of the `consistency` column after casting every entry to int.
consistency = list(map(int, df['consistency']))
sum(consistency) / len(consistency)

0.19601328903654486