In [None]:
import gc
import os
import pickle
import jsonlines
import torch
from tqdm import tqdm
import pandas as pd
import csv
from collections import defaultdict
import argparse
from core.models.entailment import EntailmentDeberta
from rank_eval import load_rank_results
import os
import pickle
from core.computation.uncertainty_measure import cluster_assignment_entropy

def load_pickle_file(file_path):
    """Return the object unpickled from the file at ``file_path``.

    The file is opened in binary mode and closed before returning.
    NOTE: unpickling can execute arbitrary code, so only use this on
    files produced by a trusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

def save_pickle_file(file_path, data):
    """Pickle ``data`` into the file at ``file_path``.

    The file is opened in binary write mode, so any existing content is
    replaced. Returns ``None``.
    """
    with open(file_path, 'wb') as handle:
        pickle.dump(data, handle)

def load_samples(dataset_name, qid, doc_id):
    """Return the generated sample texts stored for one (query, document) pair.

    Looks for the pickle written under ``output/rank/gen/...`` for the given
    dataset, query id and document id. When that file does not exist an empty
    list is returned; otherwise the ``'text'`` field of every entry in the
    pickle's ``'sample'`` list is returned, in stored order.
    """
    sample_file = f'output/rank/gen/Qwen/Qwen2.5-7B-Instruct/{dataset_name}/{dataset_name}-{qid}-{doc_id}.pkl'
    if not os.path.exists(sample_file):
        return []
    generation = load_pickle_file(sample_file)
    return [entry['text'] for entry in generation['sample']]


def load_cluster_ids(dataset_name, qid, doc_id):
    """Return the stored cluster ids for one (query, document) pair.

    Looks for the pickle written under ``output/rank/cluster/...`` for the
    given dataset, query id and document id. An empty list is returned when
    the file does not exist or when it has no ``'cluster_ids'`` entry.
    """
    cluster_file = f'output/rank/cluster/Qwen/Qwen2.5-7B-Instruct/{dataset_name}/{dataset_name}-{qid}-{doc_id}.pkl'
    if not os.path.exists(cluster_file):
        return []
    clustering = load_pickle_file(cluster_file)
    return clustering.get('cluster_ids', [])


# Compute the semantic entropy
def compute_entropy(cluster_ids):
    """Return the entropy of a cluster assignment, or ``None`` if it is empty.

    ``cluster_ids`` is any sized collection of cluster ids. A non-empty
    collection is handed to ``cluster_assignment_entropy`` and its result is
    returned unchanged.
    """
    # len() rather than truthiness so array-like inputs are accepted as well.
    has_assignments = len(cluster_ids) != 0
    return cluster_assignment_entropy(cluster_ids) if has_assignments else None

def merge_score(rank_score, entropy_score, *, threshold=0.01, bonus=1.0):
    """Combine a ranker score with an entropy score.

    Args:
        rank_score: Score assigned by the ranker.
        entropy_score: Entropy for the same item, or ``None`` when no
            entropy is available.
        threshold: Entropy values strictly below this receive the bonus.
        bonus: Amount added to ``rank_score`` for low-entropy items.

    Returns:
        ``rank_score + bonus`` when ``entropy_score`` is not ``None`` and is
        strictly below ``threshold``; otherwise ``rank_score`` unchanged.
    """
    if entropy_score is not None and entropy_score < threshold:
        return rank_score + bonus
    return rank_score

def _read_id_text_map(jsonl_path):
    """Return ``{str(record['_id']): record['text']}`` for a JSONL file."""
    with jsonlines.open(jsonl_path) as reader:
        return {str(record['_id']): record['text'] for record in reader}


def load_dataset(dataset_path):
    """Load the queries, corpus and test relevance judgments of a dataset.

    Reads three files below ``dataset_path``:

    * ``queries.jsonl`` - records with ``_id`` and ``text`` fields,
    * ``corpus.jsonl`` - records with ``_id`` and ``text`` fields,
    * ``qrels/test.tsv`` - tab-separated with one header row; the first
      three columns are used as query id, document id and score.

    Returns:
        A tuple ``(queries, docs, scores)`` where ``queries`` and ``docs``
        map string ids to text, and ``scores`` is a
        ``defaultdict(dict)`` mapping query id -> document id -> score.
    """
    queries = _read_id_text_map(os.path.join(dataset_path, 'queries.jsonl'))
    docs = _read_id_text_map(os.path.join(dataset_path, 'corpus.jsonl'))

    rel_path = os.path.join(dataset_path, 'qrels/test.tsv')
    # Read every cell as a literal string. With pandas' default type
    # inference an id such as "007" is parsed to 7 and ids such as "NA" or
    # "null" become NaN, so they would no longer match the JSONL keys.
    df = pd.read_csv(rel_path, sep='\t', header=0, dtype=str, keep_default_na=False)
    # Only the score column is numeric; unparseable cells become NaN.
    relevance = pd.to_numeric(df.iloc[:, 2], errors='coerce')

    scores = defaultdict(dict)
    for qid, docid, score in zip(df.iloc[:, 0], df.iloc[:, 1], relevance):
        scores[qid][docid] = score

    return queries, docs, scores

# For each dataset, join the ranker output with the gold relevance labels,
# the generated samples and their cluster-based entropy, and write one TSV
# per dataset.
size_name = "large"
dataset_names = ["trec-covid", "climate-fever", "dbpedia-entity", "fever", "hotpotqa", "nfcorpus", "nq", "scidocs"]
# Layout of every row appended to merge_results, also used as the TSV header.
merge_columns = ['qid', 'query', 'docid', 'doc', 'gold_score', 'rank_index', 'rank_score', 'entropy_score', 'merge_score', 'samples', 'cluster_ids']
# Create the output directory up front so a missing directory cannot make the
# write fail after a whole dataset has already been processed.
os.makedirs('output/tmp', exist_ok=True)
for dataset_name in tqdm(dataset_names, desc='dataset'):
    dataset_path = f'/home/song/dataset/beir/{dataset_name}'
    queries, docs, scores = load_dataset(dataset_path)
    rank_result_path = f'dataset/rank/{dataset_name}/{dataset_name}-rank10-{size_name}.tsv'
    rank_results = load_rank_results(rank_result_path)
    merge_results = []
    for qid in rank_results:
        # rank_index is the position of the document in the iteration order
        # of rank_results[qid].
        for i, docid in enumerate(rank_results[qid]):
            samples = load_samples(dataset_name, qid, docid)
            # Samples and cluster ids are flattened into '|'-joined strings;
            # a '|' inside a sample is not escaped.
            samples_text = '|'.join(samples)
            cluster_ids = load_cluster_ids(dataset_name, qid, docid)
            cluster_text = '|'.join([str(x) for x in cluster_ids])
            entropy = compute_entropy(cluster_ids)
            # Looked up once and reused for both the raw and merged score.
            rank_score = rank_results.get(qid, {}).get(docid, 0.0)
            merge_results.append([
                str(qid),
                queries.get(str(qid), ''),
                str(docid),
                docs.get(str(docid), ''),
                # Pairs without a relevance judgment get a gold score of 0.0.
                scores.get(str(qid), {}).get(str(docid), 0.0),
                i,
                rank_score,
                entropy,
                merge_score(rank_score, entropy),
                samples_text,
                cluster_text,
            ])
    output_path = f'output/tmp/merge-{size_name}-{dataset_name}.tsv'
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(merge_columns)
        writer.writerows(merge_results)
    print(f"output: {output_path}")

In [2]:


# sample = load_samples('nq','test0','doc0')
# print(sample)

# cluster_ids = load_cluster_ids('nq','test0','doc0')
# print(cluster_ids)

# entropy = compute_entropy(cluster_ids)
# print(entropy)