In [None]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from nltk import sent_tokenize, word_tokenize
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def calculate_perplexity(sentence):
    """Return the perplexity of ``sentence`` under the module-level causal LM.

    The sentence is tokenized with the module-level ``tokenizer`` and scored
    with the module-level ``model`` in a single forward pass: the logits at
    position ``t`` are the model's prediction for token ``t + 1``, so one pass
    yields every next-token distribution. The first token has no preceding
    context and is therefore not scored.

    Args:
        sentence: Text to score.

    Returns:
        ``exp(mean negative log-likelihood)`` over the scored tokens, as a
        Python float. ``float('inf')`` is returned when the sentence has fewer
        than two tokens, because there is no next-token prediction to score.
    """
    # Tokenize the input sentence and place it on the same device as the model.
    encodings = tokenizer(sentence, return_tensors="pt")
    input_ids = encodings.input_ids.to(model.device)

    # Nothing to predict with fewer than two tokens; treat as maximally
    # uncertain instead of crashing on an empty reduction.
    if input_ids.size(1) < 2:
        return float('inf')

    with torch.no_grad():
        logits = model(input_ids).logits  # (batch_size, seq_len, vocab_size)

    # Align predictions with their targets: position t predicts token t + 1.
    shift_logits = logits[:, :-1, :]
    shift_targets = input_ids[:, 1:]

    # cross_entropy applies log-softmax internally, which avoids the underflow
    # of softmax followed by log; its default reduction is the mean NLL.
    avg_nll = F.cross_entropy(
        shift_logits.reshape(-1, shift_logits.size(-1)).float(),
        shift_targets.reshape(-1),
    )

    return torch.exp(avg_nll).item()

# Causal LM used to score sentence perplexity; replace with any supported model id.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Move the weights to `device` so they live on the same device as the inputs
# that calculate_perplexity feeds to the model.
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

df = pd.read_csv('../../data/LongBench_original.csv')

In [None]:
# Sentences whose perplexity exceeds this percentile (computed over all
# sentences of all contexts) are dropped.
PPL_PERCENTILE = 75

tmp_contents = []  # one list of stripped sentences per context
ppls = []          # flat list of perplexities, in the same order as the sentences

for con in tqdm(df['context'], desc='scoring sentences'):
    sentences = [s.strip() for s in sent_tokenize(con)]
    tmp_contents.append(sentences)
    ppls.extend(calculate_perplexity(s) for s in sentences)

threshold = np.percentile(ppls, PPL_PERCENTILE)

# Walk the flat perplexity list in step with the per-context sentence lists
# and keep only the sentences that are not above the threshold.
current_idx = 0
filtered_contents = []
for con in tmp_contents:
    con_ppls = ppls[current_idx:current_idx + len(con)]
    current_idx += len(con)
    filtered_sentences = [
        sen for sen, ppl in zip(con, con_ppls) if not ppl > threshold
    ]
    filtered_contents.append(' '.join(filtered_sentences))

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core import documents

# Cut every filtered context into chunks of at most 512 units with a 32-unit
# overlap between neighbours; each chunk becomes one document for indexing.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=32,
    chunk_size=512,
)
chunks = splitter.create_documents(filtered_contents)

In [None]:
os.makedirs('../store', exist_ok=True)

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embed = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')

# Build the FAISS index from `chunks` on the first run and cache it on disk;
# later runs reuse the cached index. Delete the directory to force a rebuild
# after changing the filtering or chunking above.
index_dir = Path('../store/ppl_filter_langchain')
if (index_dir / 'index.faiss').exists():
    # NOTE(security): load_local unpickles the stored docstore. This is only
    # acceptable because the index is produced locally by this notebook; never
    # point it at an index obtained from an untrusted source.
    vector_db = FAISS.load_local(str(index_dir), embed, allow_dangerous_deserialization=True)
else:
    vector_db = FAISS.from_documents(chunks, embed)
    vector_db.save_local(str(index_dir))

# Retrieve the 6 most similar chunks for each query.
retriever = vector_db.as_retriever(search_kwargs={'k':6})

In [None]:
from langchain_ollama.llms import OllamaLLM
# LLM client used to generate the answers; the model tag is resolved by Ollama.
llm = OllamaLLM(
    model="qwen2.5:1.5b",
)

In [None]:
# Minimal QA prompt. Expects two format fields: `question` (the query) and
# `content` (the retrieved text), each rendered inside its own fenced block.
prompt_template = "\n".join([
    "You are a helpful assistant, please answer the following question based on the given content:",
    "",
    "Question:",
    "```",
    "{question}",
    "```",
    "",
    "Content:",
    "```",
    "{content}",
    "```",
    "",
    "Just give a simple answer, do not include any additional information or explaination.",
    "",
])

# Structured QA prompt with a one-shot example; the model is asked to reply
# with a JSON object holding `Reasoning` and `Response`.
# Format fields: `content` (retrieved passage, rendered under "Context") and
# `question` (the query, rendered under "Question"). Literal braces of the
# JSON schema are doubled so that str.format leaves them in place.
qa_instruction = '''
You are an expert in information evaluation and critical thinking. Your task is to find the answer to a given question from a passage of text. You must carefully read every word and think through each step without overlooking any details. Your output should contain two fields: `Reasoning` and `Response`. In `Reasoning`, document your logical thought process in a clear, concise manner. If you find the answer, write it in the `Response` field; if not, try your best to guess one. The `Reasoning` should end with '*' to indicate completion.

Objective: The task is to carefully analyze a passage of text to determine whether it contains the answer to a given question. The evaluation must be detailed, with clear reasoning, and identify the correct answer if present, or confirm its absence.

You are provided with the following inputs:

1. Context: {content}
2. Question: {question}

Based on these inputs, provide a step-by-step explanation to identify the correct answer from the content. If you cannot find the answer in the passage, try to guess the answer. Your response should only contain the answer itself. Do not explain, provide notes, or include any additional text, punctuation, or preposition (e.g., 'on', 'at'), or articles (e.g., 'a', 'an', 'the') unless absolutely necessary.

Output format: 

-----
SCHEMA
-----

{{
    "Reasoning": "Step-by-step reasoning explaining how the answer is inferenced to satisfy the question.",
    "Response": "The answer itself, as simple as possible."
}}

-----

1. Context: ```Pilotwings 64\nPilotwings 64 (Japanese: パイロットウイングス64, Hepburn: Pairottouingusu Rokujūyon) is a video game for the Nintendo 64, originally released in 1996 along with the debut of the console. The game was co-developed by Nintendo and the American visual technology group Paradigm Simulation. It was one of three launch titles for the Nintendo 64 in Japan as well as Europe and one of two launch titles in North America. Pilotwings 64 is a follow-up to Pilotwings for the Super Nintendo Entertainment System (SNES), which was a North American launch game for its respective console in 1991. Also like that game, Pilotwings 64 received production input from Nintendo producer Shigeru Miyamoto.```
2. Question: Who is a Japanese video game designer and producer, currently serving as the co-Representative Director of Nintendo, who gave production input to a video game for the Nintendo 64, originally released in 1996 along with the debut of the console?

-----

output:

{{
    "Reasoning": "The context mentions that 'Pilotwings 64' was a video game released in 1996 for the Nintendo 64. The game received production input from Nintendo producer Shigeru Miyamoto. This aligns with the question, which asks for a Japanese video game designer and producer who gave production input to a Nintendo 64 game released in 1996. Additionally, Shigeru Miyamoto is well known as a prominent figure at Nintendo and is currently serving as the co-Representative Director of the company. Therefore, the content fully supports that Shigeru Miyamoto is the correct answer to the question.*", 
    "Response": "Shigeru Miyamoto" 
}}

-----

'''

In [None]:
# Collected results, one entry per row of `df`.
res = {'query': [], 'answer': [], 'predict': []}

# iterrows() yields (index, row) pairs, so the row must be unpacked before it
# can be indexed by column name. `total` lets tqdm show a real progress bar.
for _, d in tqdm(df.iterrows(), total=len(df)):
    q = d['query']
    a = d['answer']
    # Concatenate the retrieved chunks into a single content string.
    retrieval = ' '.join([r.page_content for r in retriever.invoke(q)])
    # Alternative structured prompt:
    # response = llm.invoke(qa_instruction.format(question=q, content=retrieval))
    response = llm.invoke(prompt_template.format(question=q, content=retrieval))
    res['query'].append(q)
    res['answer'].append(a)
    res['predict'].append(response)

# Make sure the output directory exists so a long run is not lost at the very
# end, and write UTF-8 explicitly because ensure_ascii=False emits raw
# non-ASCII characters.
output_path = Path('../data/ppl_filtered_output.json')
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(res, f, ensure_ascii=False, indent=4)