In [1]:
import os
import pickle
import jsonlines
from tqdm import tqdm
from time import time
import pandas as pd
import csv
from collections import defaultdict
from rank_eval import load_rank_results, eval_rank_results
import os
import pickle
from core.computation.uncertainty_measure import cluster_assignment_entropy

# Dataset names handled by this notebook. Swap in the one-element list below
# for a quick single-dataset run.
# ALL_DATASET_NAMES = ["nq"]
ALL_DATASET_NAMES = [
    "trec-covid",
    "climate-fever",
    "dbpedia-entity",
    "fever",
    "fiqa",
    "hotpotqa",
    "msmarco",
    "nfcorpus",
    "scidocs",
    "scifact",
    "nq",
]

# Suffix of the rank file name read by make_rank_results
# ("<dataset>-rank10-<SIZE_NAME>.tsv"); "large" is the alternative setting.
SIZE_NAME = "all"
# SIZE_NAME = "large"

# Filesystem locations used by the notebook.
BEIR_DATASET_DIR = "/home/song/dataset/beir"
RANK_DIR = "dataset/rank"
SAMPLE_DIR = "output/sample"

def load_pickle_file(file_path):
    """Read the pickle file at ``file_path`` and return the stored object.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files produced by a trusted source.
    """
    with open(file_path, mode='rb') as handle:
        return pickle.Unpickler(handle).load()

def save_pickle_file(file_path, data):
    """Pickle ``data`` into ``file_path``, replacing any existing file."""
    with open(file_path, mode='wb') as sink:
        pickle.Pickler(sink).dump(data)

def load_samples(dataset_name, qid, doc_id):
    """Return the sampled generation texts stored for one (query, document) pair.

    Reads the generation pickle for ``dataset_name``/``qid``/``doc_id`` and
    collects the ``'text'`` field of every entry under its ``'sample'`` key.
    Returns an empty list when the pickle file does not exist.
    """
    pkl_path = f'output/rank/gen/Qwen/Qwen2.5-7B-Instruct/{dataset_name}/{dataset_name}-{qid}-{doc_id}.pkl'
    if not os.path.exists(pkl_path):
        return []

    payload = load_pickle_file(pkl_path)
    texts = []
    for entry in payload['sample']:
        texts.append(entry['text'])
    return texts

def load_greedy(dataset_name, qid, doc_id):
    """Return the ``'greedy'`` entry stored for one (query, document) pair.

    Reads the same generation pickle that ``load_samples`` uses. Returns
    ``None`` when the pickle file does not exist.
    """
    pkl_path = f'output/rank/gen/Qwen/Qwen2.5-7B-Instruct/{dataset_name}/{dataset_name}-{qid}-{doc_id}.pkl'
    if not os.path.exists(pkl_path):
        return None

    payload = load_pickle_file(pkl_path)
    return payload['greedy']


def load_cluster_ids(dataset_name, qid, doc_id):
    """Return the cluster ids stored for one (query, document) pair.

    Reads the cluster pickle for ``dataset_name``/``qid``/``doc_id`` and
    returns its ``'cluster_ids'`` entry. Returns an empty list when the file
    does not exist or the entry is absent.
    """
    pkl_path = f'output/rank/cluster/Qwen/Qwen2.5-7B-Instruct/{dataset_name}/{dataset_name}-{qid}-{doc_id}.pkl'
    if not os.path.exists(pkl_path):
        return []

    payload = load_pickle_file(pkl_path)
    return payload.get('cluster_ids', [])


# Compute semantic entropy
def compute_entropy(cluster_ids):
    """Return the cluster-assignment entropy of ``cluster_ids``.

    Delegates to ``cluster_assignment_entropy``; returns ``None`` when
    ``cluster_ids`` is empty.
    """
    # len() rather than truthiness so array-like inputs are handled the same way.
    has_assignments = len(cluster_ids) > 0
    return cluster_assignment_entropy(cluster_ids) if has_assignments else None


def load_dataset(dataset_path):
    """Load queries, corpus texts and test relevance judgments from a dataset directory.

    Reads three files under ``dataset_path``:

    * ``queries.jsonl`` - one record per query, with ``_id`` and ``text`` fields.
    * ``corpus.jsonl`` - one record per document, with ``_id`` and ``text`` fields.
    * ``qrels/test.tsv`` - tab-separated file with a header row and three
      columns: query id, document id, score.

    Returns:
        A ``(queries, docs, qrels)`` tuple where ``queries`` and ``docs`` map
        string ids to text and ``qrels`` maps each string query id to a dict
        of ``{string document id: score}``.
    """
    query_path = os.path.join(dataset_path, 'queries.jsonl')
    queries = {}
    with jsonlines.open(query_path) as reader:
        for query in reader:
            queries[str(query['_id'])] = query['text']

    doc_path = os.path.join(dataset_path, 'corpus.jsonl')
    docs = {}
    with jsonlines.open(doc_path) as reader:
        for doc in reader:
            docs[str(doc['_id'])] = doc['text']

    rel_path = os.path.join(dataset_path, 'qrels/test.tsv')
    # Read both id columns as raw text. Letting pandas infer a numeric dtype
    # strips leading zeros ("00123" -> 123), and a float score column would
    # upcast integer ids via ``df.values`` ("1" -> "1.0"); either way the ids
    # would stop matching the string keys of ``queries`` and ``docs``.
    df = pd.read_csv(rel_path, sep='\t', header=0, converters={0: str, 1: str})

    qrels = defaultdict(dict)
    # itertuples keeps each column's own dtype instead of coercing the row
    # into one common type.
    for qid, docid, score in df.itertuples(index=False, name=None):
        qrels[qid][docid] = score

    return queries, docs, dict(qrels)

def make_sample_data(rank_results, dataset_name):
    """Collect generation and clustering data for every ranked (query, document) pair.

    For each query id in ``rank_results`` this gathers, for the pseudo
    document id ``'no'`` followed by every ranked document id, the greedy
    output, the sampled texts, the cluster ids and the entropy computed from
    those cluster ids.

    Returns:
        A plain dict ``{qid: {doc_id: {'greedy', 'samples', 'cluster_ids', 'entropy'}}}``.
    """
    collected = {}
    for qid, ranked_docs in tqdm(rank_results.items()):
        per_query = collected.setdefault(qid, {})
        # 'no' is always processed first, ahead of the ranked document ids.
        for doc_id in ['no', *ranked_docs.keys()]:
            sampled_texts = load_samples(dataset_name, qid, doc_id)
            greedy_output = load_greedy(dataset_name, qid, doc_id)
            clusters = load_cluster_ids(dataset_name, qid, doc_id)
            per_query[doc_id] = {
                'greedy': greedy_output,
                'samples': sampled_texts,
                'cluster_ids': clusters,
                'entropy': compute_entropy(clusters),
            }
    return collected

def make_rank_results(dataset_name):
    """Load the rank results of ``dataset_name`` from its rank TSV file.

    The file is ``<RANK_DIR>/<dataset>/<dataset>-rank10-<SIZE_NAME>.tsv`` and
    is parsed by ``load_rank_results``.
    """
    return load_rank_results(
        f'{RANK_DIR}/{dataset_name}/{dataset_name}-rank10-{SIZE_NAME}.tsv'
    )

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the pickled sample file of the trec-covid dataset.
data = load_pickle_file('output/sample/trec-covid/sample.pkl')
# Bare expression: as the last statement of a notebook cell it displays the loaded object.
data

{'29': {'no': {'greedy': 'Potential drug targets include interactions between SARS-CoV-2 spike protein and human ACE2, and between SARS-CoV-2 nucleocapsid protein and host RNA. Drugs like remdesivir (targeting viral RNA-dependent RNA polymer',
   'samples': ['',
    'Potential drug targets include interactions between SARS-CoV-2 spike protein and human ACE2, as well as viral protease and host cellular factors. Remdesivir and lopinavir/ritonavir have been repurposed',
    'Potential drug targets include interactions involving the Spike (S), Envelope (E), Membrane (M), and Nucleocapsid (N) proteins. Repurposed drugs like remdesivir (targeting RNA-dependent RNA polymerase)',
    'Potential drug targets include interactions between SARS-CoV-2 spike protein and human ACE2, and between Nsp1 and human nuclear proteins. Drugs like camostat (for ACE2 interaction) and Lopinavir/ritonav',
    'SARS-CoV-2 spike protein, protease, and RNA-dependent RNA polymerase interact with human proteins. Drugs