In [29]:
import os
import json
from pathlib import Path
from tqdm import tqdm
import pandas as pd

# vanilla pipeline

In [31]:
# Build the evaluation set (query / reference answer / context) from the
# LongBench subsets listed in `include`.
# The dataset directory can be overridden with the LONGBENCH_DIR environment
# variable; the original local path is kept as the default.
path = Path(os.environ.get('LONGBENCH_DIR', '/Users/wenjiazhai/Documents/GitHub/Data/LongBench'))
data = {'query': [], 'answer': [], 'context': []}
# `exclude` is not read by this cell; presumably kept as a record of the
# subsets that were deliberately left out.
exclude = ['dureader.jsonl', 'lsht.jsonl', 'multifieldqa_zh.jsonl', 'passage_retrieval_zh.jsonl', 'repobench-p.jsonl', 'vcsum.jsonl', 'trec.jsonl', 'lcc.jsonl', 'samsum.jsonl', 'gov_report.jsonl', 'triviaqa.jsonl', 'qmsum.jsonl', 'passage_count.jsonl']
include = ['multifieldqa_en.jsonl', 'qasper.jsonl', '2wikimqa.jsonl', 'hotpotqa.jsonl', 'narrativeqa.jsonl', 'musique.jsonl']

# Records whose context has more than this fraction of newline characters are dropped.
MAX_NEWLINE_RATIO = 0.01


def _keep_record(record):
    """Return True when a LongBench record passes all filters.

    A record is rejected when:
      * its context is empty (also avoids dividing by zero below),
      * its question (`input`) is empty,
      * newlines make up more than MAX_NEWLINE_RATIO of the context,
      * the context contains the substring 'Passage'
        (NOTE(review): presumably targets "Passage N:"-style concatenated
        contexts - confirm the intent).
    """
    context = record['context']
    if not context or not record['input']:
        return False
    if context.count('\n') / len(context) > MAX_NEWLINE_RATIO:
        return False
    return 'Passage' not in context


for file in include:
    # Explicit UTF-8: do not depend on the platform's default encoding.
    with open(path.joinpath(file), 'r', encoding='utf-8') as f:
        # Iterate lazily instead of materialising the whole file with readlines().
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the .jsonl file
            d = json.loads(line)
            if not _keep_record(d):
                continue
            data['query'].append(d['input'])
            # All reference answers are concatenated without a separator.
            data['answer'].append(''.join(d['answers']))
            data['context'].append(d['context'])

In [32]:
# Persist the filtered dataset. The payload is JSON even though the file is
# named *.csv; the name is kept because a later cell loads this exact file.
os.makedirs('../data', exist_ok=True)
# ensure_ascii=False emits raw non-ASCII characters, so the file must be
# opened as UTF-8 explicitly rather than with the platform default encoding.
with open('../data/LongBench.csv', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

In [4]:
from datasets import Dataset

# Load the dataset written by the dump cell. The relative path matches the one
# used when writing, so the notebook does not depend on one user's home directory.
# (In-memory alternative: Dataset.from_dict(data).)
ds = Dataset.from_json('../data/LongBench.csv')

Generating train split: 314 examples [00:00, 5761.33 examples/s]


In [40]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core import documents

# Split every context into overlapping character chunks for indexing.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=32,
    chunk_size=512,
)
chunks = splitter.create_documents(texts=ds['context'])

In [41]:
# Make sure the vector-store directory exists (no error if it already does).
Path('../store').mkdir(parents=True, exist_ok=True)

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embed = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')

# Reuse the persisted FAISS index when it exists; otherwise build it from
# `chunks` and save it, so the notebook also runs on a fresh checkout.
# The directory name (including its spelling) is kept so existing indexes still load.
index_dir = Path('../store/vinilla_langchain')
if (index_dir / 'index.faiss').exists():
    # NOTE(security): load_local unpickles the stored docstore, which can run
    # arbitrary code. Only load index directories you created yourself.
    vector_db = FAISS.load_local(str(index_dir), embed, allow_dangerous_deserialization=True)
else:
    vector_db = FAISS.from_documents(chunks, embed)
    vector_db.save_local(str(index_dir))
# Retrieve the 6 nearest chunks per query.
retriever = vector_db.as_retriever(search_kwargs={'k':6})

In [7]:
from langchain_ollama.llms import OllamaLLM
# LLM used to generate answers in the evaluation loop (model tag "qwen2.5:1.5b").
# NOTE(review): presumably needs a running local Ollama server with this model pulled - confirm.
llm = OllamaLLM(model="qwen2.5:1.5b")

In [21]:
# Minimal answer-generation prompt. Filled with str.format using two
# placeholders: {question} (the user query) and {content} (the retrieved text).
prompt_template = '''You are a helpful assistant, please answer the following question based on the given content:

Question:
```
{question}
```

Content:
```
{content}
```

Just give a simple answer, do not include any additional information or explaination.
'''

# Structured (Reasoning + Response) one-shot prompt. Filled with str.format:
# {content} is the retrieved passage ("Context") and {question} is the query
# ("Question"), matching the order of the worked example further down.
# Literal braces of the JSON schema are escaped as {{ }}.
qa_instruction = '''
You are an expert in information evaluation and critical thinking. Your task is to find the answer to a given question from a passage of text. You must carefully read every word and think through each step without overlooking any details. Your output should contain two fields: `Reasoning` and `Response`. In `Reasoning`, document your logical thought process in a clear, concise manner. If you find the answer, write it in the `Response` field; if not, try your best to guess one. The `Reasoning` should end with '*' to indicate completion.

Objective: The task is to carefully analyze a passage of text to determine whether it contains the answer to a given question. The evaluation must be detailed, with clear reasoning, and identify the correct answer if present, or confirm its absence.

You are provided with the following inputs:

1. Context: {content}
2. Question: {question}

Based on these inputs, provide a step-by-step explanation to identify the correct answer from the content. If you cannot find the answer in the passage, try to guess the answer. Your response should only contain the answer itself. Do not explain, provide notes, or include any additional text, punctuation, or preposition (e.g., 'on', 'at'), or articles (e.g., 'a', 'an', 'the') unless absolutely necessary.

Output format: 

-----
SCHEMA
-----

{{
    "Reasoning": "Step-by-step reasoning explaining how the answer is inferenced to satisfy the question.",
    "Response": "The answer itself, as simple as possible."
}}

-----

1. Context: ```Pilotwings 64\nPilotwings 64 (Japanese: パイロットウイングス64, Hepburn: Pairottouingusu Rokujūyon) is a video game for the Nintendo 64, originally released in 1996 along with the debut of the console. The game was co-developed by Nintendo and the American visual technology group Paradigm Simulation. It was one of three launch titles for the Nintendo 64 in Japan as well as Europe and one of two launch titles in North America. Pilotwings 64 is a follow-up to Pilotwings for the Super Nintendo Entertainment System (SNES), which was a North American launch game for its respective console in 1991. Also like that game, Pilotwings 64 received production input from Nintendo producer Shigeru Miyamoto.```
2. Question: Who is a Japanese video game designer and producer, currently serving as the co-Representative Director of Nintendo, who gave production input to a video game for the Nintendo 64, originally released in 1996 along with the debut of the console?

-----

output:

{{
    "Reasoning": "The context mentions that 'Pilotwings 64' was a video game released in 1996 for the Nintendo 64. The game received production input from Nintendo producer Shigeru Miyamoto. This aligns with the question, which asks for a Japanese video game designer and producer who gave production input to a Nintendo 64 game released in 1996. Additionally, Shigeru Miyamoto is well known as a prominent figure at Nintendo and is currently serving as the co-Representative Director of the company. Therefore, the content fully supports that Shigeru Miyamoto is the correct answer to the question.*", 
    "Response": "Shigeru Miyamoto" 
}}

-----

'''

In [24]:
# Result accumulator: one independent list per output column.
res = {field: [] for field in ('query', 'answer', 'predict')}

In [26]:
# Retrieval-augmented generation over the whole dataset.
# `res` is reset here so that re-running this cell does not append a second
# copy of every row to the lists created in the previous cell.
res = {'query': [], 'answer': [], 'predict': []}
for d in tqdm(ds):
    q = d['query']
    a = d['answer']
    # Concatenate the retrieved chunks into one context string.
    retrieval = '\n'.join(r.page_content for r in retriever.invoke(q))
    # Alternative structured prompt: qa_instruction.format(question=q, content=retrieval)
    response = llm.invoke(prompt_template.format(question=q, content=retrieval))
    res['query'].append(q)
    res['answer'].append(a)
    res['predict'].append(response)

100%|██████████| 314/314 [03:18<00:00,  1.58it/s]


In [27]:
# Save the predictions. ensure_ascii=False writes raw non-ASCII characters,
# so open the file as UTF-8 explicitly instead of relying on the platform default.
os.makedirs('../data', exist_ok=True)  # do not depend on an earlier cell having created it
with open('../data/sample.json', 'w', encoding='utf-8') as f:
    json.dump(res, f, ensure_ascii=False, indent=4)

In [28]:
# Preview: print the first seven (query, reference answer, prediction) triples.
preview_rows = zip(res['query'], res['answer'], res['predict'])
for i, (q, a, p) in enumerate(preview_rows):
    print(q, a, p, '-------------', sep='\n')
    if i >= 6:
        break

Is the ISR necessary for transgene reactivation?
No, it is not necessary.
No, the ISR (Intracellular Signal Transduction) is neither necessary nor sufficient for transgene reactivation.
-------------
What experimental techniques were used to study the quantum dot structures in this research?
Low temperature scanning tunneling microscopy and spectroscopy (STM/STS).
Low-temperature scanning tunneling microscopy/photoluminescence measurements supported by analytical models and ab-initio simulations were used to study quantum dot structures in this research.
-------------
What is the purpose of an ICD?
Implantable Cardioverter Defibrillator (ICD) is a surgically implanted electronic device to treat life-threatening heartbeat irregularities.
The purpose of an ICD is to describe the patient's histological diagnosis in pathology reports for cancer registries, as per the World Health Organization's International Classification of Diseases for Oncology.
-------------
Why is it important for the

In [30]:
# aggregate
# Collect every per-example JSON result under ../output/LongBench into one CSV.
# The relative path matches the output location used below, so the cell does
# not depend on one user's home directory. Sorting makes the row order
# reproducible (rglob order is filesystem-dependent).
files = sorted(Path('../output/LongBench').rglob('*.json'))
# read_text opens and closes each file, so no handles are leaked.
data = [json.loads(file.read_text(encoding='utf-8')) for file in files]
df = pd.DataFrame(data)
df.to_csv('../output/vanilla_longbench.csv', index=False)

In [33]:
# Fraction of rows whose `consistency` flag is set.
consistent_count = df['consistency'].astype(int).sum()
consistent_count / len(df)

0.2229299363057325