In [5]:
import gc
import os
import pickle
import jsonlines
import torch
from tqdm import tqdm
import pandas as pd
from collections import defaultdict
import argparse
from core.models.entailment import EntailmentDeberta
from core.data.data_utils import load_ds_from_json
from rank_eval import eval_beir_rank_result

def load_pickle_file(file_path):
    """Open ``file_path`` in binary mode and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

def save_pickle_file(file_path, data):
    """Pickle ``data`` into ``file_path``, replacing any existing file."""
    with open(file_path, 'wb') as sink:
        pickle.Pickler(sink).dump(data)

In [None]:
# Load the pickled score dictionary from disk and display it as the cell output.
small_scores = load_pickle_file('output/rerank/entropy_scores_small.pkl')
small_scores

In [None]:
# BEIR_DATASET_NAMES = ["trec-covid", "climate-fever", "dbpedia-entity", "fever", "fiqa", "hotpotqa", "msmarco",  "nfcorpus", "nq", "scidocs", "scifact"]

# scores = []
# for dataset_name in tqdm(BEIR_DATASET_NAMES):
#     print(dataset_name)
#     try:
#         scores.append({
#             'dataset': dataset_name,
#             'rank': small_scores[dataset_name]['rank']['ndcg']['NDCG@5'],
#             'entropy': small_scores[dataset_name]['entropy']['ndcg']['NDCG@5'],
#         })
#     except Exception as e:
#         print(e)
#         print(f"Error in {dataset_name}")
# print(f"small_scores: {scores}")

In [None]:
# table_data = ['Reranker', 'Avg.', 'trec-covid', 'climate-fever', 'dbpedia-entity', 'fever', 'fiqa', 'hotpotqa', 'msmarco',  'nfcorpus', 'nq', 'scidocs', 'scifact']

In [None]:
# small_scores.keys()

In [None]:
# Collect every (metric group, metric name) pair found anywhere in small_scores,
# e.g. ('map', 'MAP@1'). The nesting walked here is
# small_scores[dataset][method][metric group][metric name].
all_metrics = sorted({
    (group_name, metric_name)
    for per_method in small_scores.values()
    for per_group in per_method.values()
    for group_name, named_scores in per_group.items()
    for metric_name in named_scores.keys()
})
all_metrics

In [None]:
# Not included in dataset_names below: "fiqa", "msmarco", "scifact"
dataset_names = ["trec-covid", "climate-fever", "dbpedia-entity", "fever", "hotpotqa", "nfcorpus", "nq", "scidocs"]
methods = ["rank", "entropy"]
all_metrics = [('map', 'MAP@1'), ('map', 'MAP@10'), ('map', 'MAP@3'), ('map', 'MAP@5'), ('mrr', 'MRR@1'), ('mrr', 'MRR@10'), ('mrr', 'MRR@3'), ('mrr', 'MRR@5'), ('ndcg', 'NDCG@1'), ('ndcg', 'NDCG@10'), ('ndcg', 'NDCG@3'), ('ndcg', 'NDCG@5'), ('precision', 'P@1'), ('precision', 'P@10'), ('precision', 'P@3'), ('precision', 'P@5'), ('recall', 'Recall@1'), ('recall', 'Recall@10'), ('recall', 'Recall@3'), ('recall', 'Recall@5'), ('recall_cap', 'R_cap@1'), ('recall_cap', 'R_cap@10'), ('recall_cap', 'R_cap@3'), ('recall_cap', 'R_cap@5')]

# Build a 3-D numpy array from small_scores, indexed as [metric][method][dataset].
# Cells start as NaN so that a score which cannot be looked up is left out of the
# average below instead of silently being counted as 0.
import numpy as np
score_array = np.full((len(all_metrics), len(methods), len(dataset_names)), np.nan)
missing_pairs = set()
for i, dataset_name in enumerate(dataset_names):
    for j, method in enumerate(methods):
        for k, (metric1, metric2) in enumerate(all_metrics):
            try:
                score_array[k, j, i] = small_scores[dataset_name][method][metric1][metric2]
            except (KeyError, TypeError):
                # Remember which lookups failed so the gap is visible to the reader.
                missing_pairs.add((dataset_name, method))
print(score_array.shape)
if missing_pairs:
    print(f"Scores missing for (dataset, method) pairs: {sorted(missing_pairs)}")

# Average over the dataset axis using only the scores that are present;
# a (metric, method) cell with no score at all stays NaN.
has_score = ~np.isnan(score_array)
n_scores = has_score.sum(axis=-1)
score_totals = np.where(has_score, score_array, 0.0).sum(axis=-1)
mean_scores = np.divide(score_totals, n_scores, out=np.full(score_totals.shape, np.nan), where=n_scores > 0)

# Turn the averages into a DataFrame: rows are metric names (second element of each
# all_metrics pair), columns are the methods.
import pandas as pd
df = pd.DataFrame(mean_scores, index=[m[1] for m in all_metrics], columns=methods)

# Keep only the metrics whose name ends in '@5'.
df[df.index.str.endswith('@5')]

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sample data
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Score1": [85, 92, 78, 88],
    "Score2": [91, 84, 88, 90],
    "Score3": [87, 89, 93, 86],
}
df = pd.DataFrame(data)

# Find the maximum value of each score column
max_values = df[["Score1", "Score2", "Score3"]].max()

# 创建一个新的 DataFrame，用于设置加粗格式
def format_bold(val, max_val):
    """Return ``val`` as text, wrapped in ``**`` markers when it equals ``max_val``."""
    if val == max_val:
        return f"**{val}**"
    return f"{val}"

# Copy of df in which each score column holds the format_bold() text of its values.
formatted_df = df.assign(**{
    col: df[col].apply(format_bold, max_val=max_values[col])
    for col in ["Score1", "Score2", "Score3"]
})

# Draw the table on an axes with the axis frame hidden.
fig, ax = plt.subplots(figsize=(6, 2))
ax.axis("tight")
ax.axis("off")

# Render the table.
table = ax.table(
    cellText=df.values,
    colLabels=df.columns,
    cellLoc="center",
    loc="center",
)

# Highlight the cell that holds the maximum of each score column.
for col in ["Score1", "Score2", "Score3"]:
    # Table cells are addressed by position. Convert the idxmax() label and the
    # column name to positions rather than assuming a 0..n-1 index and a fixed
    # column offset.
    row_pos = df.index.get_loc(df[col].idxmax())
    col_pos = df.columns.get_loc(col)
    cell = table[(row_pos + 1, col_pos)]  # +1 skips the column-label row
    cell.set_text_props(fontweight="bold", color="red")  # bold, red text

plt.show()

# 查看rerank数据详情

In [None]:
# Read the TSV file (no header row; the columns are named qid, doc_id, score)
import pandas as pd
df = pd.read_csv('output/rerank/hotpotqa/rerank-small.tsv', sep='\t', header=None, names=['qid', 'doc_id', 'score'])
# Show the rows whose score is exactly 0
df[df['score'] == 0]

In [68]:
from make_rank_dataset import load_dataset
# The result of load_dataset is unpacked into three names that later cells index by id.
# NOTE(review): presumably query texts, document texts and relevance judgments —
# confirm against make_rank_dataset.load_dataset.
queries, docs, scores = load_dataset('/home/song/dataset/beir/hotpotqa')

In [None]:
# df['query'] = df['qid'].apply(lambda x: queries[str(x)])
# df['doc'] = df['doc_id'].apply(lambda x: docs[str(x)])
# df

In [60]:
import json


# qid -> record dict; filled by the loop over df further down in this cell
data = {}
# # Iterate over scores
# for qid in scores:
#     qid = str(qid)
#     for doc_id in scores[qid]:
#         doc_id = str(doc_id)
#         score = scores[qid][doc_id]
#         if score > 0:
#             if qid not in data:
#                 data[qid] = {
#                     'qid': str(qid),
#                     'query': queries[qid],
#                     'best': [],
#                     'docs': [],
#                 }
#             data[qid]['best'].append({'doc_id': doc_id, 'doc': docs[doc_id], 'score': score})

def new_data(qid):
    """Create the record for one query, pre-filled with its positively judged documents.

    Reads the notebook-level ``queries``, ``docs`` and ``scores`` objects.

    Args:
        qid: Query id; it is converted to ``str`` once and that string is used
            for every lookup.

    Returns:
        dict with keys ``'qid'``, ``'query'``, ``'best'`` (one entry per
        document whose score in ``scores`` is > 0) and ``'docs'`` (empty list
        for the caller to fill).

    Raises:
        KeyError: if the query id is not in ``queries`` or a judged document
            id is not in ``docs``.
    """
    qid = str(qid)
    record = {
        'qid': qid,
        'query': queries[qid],
        'best': [],
        'docs': [],
    }
    # A query without any entry in scores gets an empty 'best' list instead of
    # aborting the whole export.
    judged = scores[qid] if qid in scores else {}
    for raw_doc_id in judged:
        # Read the score through the original key; only the exported id is stringified.
        score = judged[raw_doc_id]
        if score > 0:
            doc_id = str(raw_doc_id)
            record['best'].append({'doc_id': doc_id, 'doc': docs[doc_id], 'score': score})
    return record

# Walk the rows of df and attach each listed document to its query's record.
# The columns are iterated directly instead of using df.iterrows(): iterrows()
# builds one Series per row and does not preserve dtypes, so when every column is
# numeric an integer id is upcast to float and str() would give e.g. '12.0',
# which no longer matches the lookup keys.
for qid, doc_id, score in zip(df['qid'].astype(str), df['doc_id'].astype(str), df['score'].tolist()):
    if qid not in data:
        data[qid] = new_data(qid)
    data[qid]['docs'].append({'doc_id': doc_id, 'doc': docs[doc_id], 'score': score})

# Save as a JSON file
with open('output/tmp/rerank-small.json', 'w') as writer:
    json.dump(data, writer, indent=2)

In [None]:
# For every query record, find the position of the first document in 'docs' that
# also appears in 'best'; when none does, the length of 'docs' is recorded.
ranks = []
for record in data.values():
    relevant_ids = {hit['doc_id'] for hit in record['best']}
    ranked_ids = [hit['doc_id'] for hit in record['docs']]
    first_hit = next(
        (pos for pos, ranked_id in enumerate(ranked_ids) if ranked_id in relevant_ids),
        len(ranked_ids),
    )
    ranks.append(first_hit)
ranks

In [None]:
# Plot a histogram of ranks
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.hist(ranks)
plt.show()

In [None]:
# Read the TSV file (no header row; the columns are named qid, doc_id, score)
import pandas as pd
df = pd.read_csv('/home/song/dataset/first/beir_rank/hotpotqa/rank.tsv', sep='\t', header=None, names=['qid', 'doc_id', 'score'])
# df[df['score'] == 0]
df

In [None]:
# For each qid, keep only its first 10 rows
df = df.groupby('qid').head(10)
df

In [69]:
import json


# qid -> record dict; filled by the loop over df further down in this cell
data = {}

def new_data(qid):
    """Create the record for one query, pre-filled with its positively judged documents.

    Reads the notebook-level ``queries``, ``docs`` and ``scores`` objects.

    Args:
        qid: Query id; it is converted to ``str`` once and that string is used
            for every lookup.

    Returns:
        dict with keys ``'qid'``, ``'query'``, ``'best'`` (one entry per
        document whose score in ``scores`` is > 0) and ``'docs'`` (empty list
        for the caller to fill).

    Raises:
        KeyError: if the query id is not in ``queries`` or a judged document
            id is not in ``docs``.
    """
    qid = str(qid)
    record = {
        'qid': qid,
        'query': queries[qid],
        'best': [],
        'docs': [],
    }
    # A query without any entry in scores gets an empty 'best' list instead of
    # aborting the whole export.
    judged = scores[qid] if qid in scores else {}
    for raw_doc_id in judged:
        # Read the score through the original key; only the exported id is stringified.
        score = judged[raw_doc_id]
        if score > 0:
            doc_id = str(raw_doc_id)
            record['best'].append({'doc_id': doc_id, 'doc': docs[doc_id], 'score': score})
    return record

# Walk the rows of df and attach each listed document to its query's record.
# The columns are iterated directly instead of using df.iterrows(): iterrows()
# builds one Series per row and does not preserve dtypes, so when every column is
# numeric an integer id is upcast to float and str() would give e.g. '12.0',
# which no longer matches the lookup keys.
for qid, doc_id, score in zip(df['qid'].astype(str), df['doc_id'].astype(str), df['score'].tolist()):
    if qid not in data:
        data[qid] = new_data(qid)
    data[qid]['docs'].append({'doc_id': doc_id, 'doc': docs[doc_id], 'score': score})

# Save as a JSON file
with open('output/tmp/rank-small.json', 'w') as writer:
    json.dump(data, writer, indent=2)

In [None]:
# For every query record, find the position of the first document in 'docs' that
# also appears in 'best'; when none does, the length of 'docs' is recorded.
ranks = []
for record in data.values():
    relevant_ids = {hit['doc_id'] for hit in record['best']}
    ranked_ids = [hit['doc_id'] for hit in record['docs']]
    first_hit = next(
        (pos for pos, ranked_id in enumerate(ranked_ids) if ranked_id in relevant_ids),
        len(ranked_ids),
    )
    ranks.append(first_hit)
ranks

In [None]:
# Plot a histogram of ranks
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.hist(ranks)
plt.show()

In [None]:
# # 读取rank数据，只取前10个，保存为tsv文件
# dataset_names = ["trec-covid", "climate-fever", "dbpedia-entity", "fever", "fiqa", "hotpotqa", "msmarco",  "nfcorpus", "nq", "scidocs", "scifact"]
# for dataset_name in tqdm(dataset_names):
#     rank_result_path = f'/home/song/dataset/first/beir_rank/{dataset_name}/rank.tsv'
#     df = pd.read_csv(rank_result_path, sep='\t', header=None, names=['qid', 'doc_id', 'score'])
#     df = df.groupby('qid').head(10)
#     df.to_csv(f'dataset/rank/{dataset_name}/{dataset_name}-rank10.tsv', sep='\t', index=False, header=False)

In [7]:
import pytrec_eval

# Ground-truth relevance judgments (qrels): query id -> {doc id: relevance}
qrels = {
    'q1': {'d1': 1, 'd2': 0, 'd3': 1},
    'q2': {'d2': 1, 'd4': 1, 'd5': 0}
}

# Output of the retrieval system (results): query id -> {doc id: score}
results = {
    'q1': {'d3': 1.0, 'd1': 1.5, 'd2': 0.5},
    'q2': {'d2': 2.0, 'd4': 1.0, 'd5': 0.5}
}

# Initialize the evaluator with the measures to compute
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})

# Compute the retrieval metrics
metrics = evaluator.evaluate(results)

# Print the evaluation results, one block per query
for query_id, query_metrics in metrics.items():
    print(f"Query: {query_id}")
    for metric, value in query_metrics.items():
        print(f"  {metric}: {value:.4f}")

Query: q1
  map: 1.0000
  ndcg: 1.0000
Query: q2
  map: 1.0000
  ndcg: 1.0000


In [6]:
import gc
import os
import pickle
import jsonlines
import torch
from tqdm import tqdm
from collections import defaultdict
import argparse
from core.models.entailment import EntailmentDeberta
from core.data.data_utils import load_ds_from_json
from rank_eval import eval_beir_rerank_result

def load_pickle_file(file_path):
    """Open ``file_path`` in binary mode and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

def save_pickle_file(file_path, data):
    """Pickle ``data`` into ``file_path``, replacing any existing file."""
    with open(file_path, 'wb') as sink:
        pickle.Pickler(sink).dump(data)

def run_eval(dataset_names=None, size_names=("small",), beir_root='/home/song/dataset/beir'):
    """Evaluate the rank and entropy result files of every dataset and pickle the scores.

    For each size name, ``eval_beir_rerank_result`` is called once per dataset
    with ``k_values=[1,3,5,10]`` and the collected results are written to
    ``output/rerank/entropy_scores_{size}.pkl``.

    Args:
        dataset_names: Datasets to evaluate. Defaults to the eleven BEIR
            dataset names listed in the body.
        size_names: Size tags used in the entropy result file name and in the
            output pickle name. Defaults to ``("small",)``.
        beir_root: Directory that contains one sub-directory per dataset.

    Returns:
        dict mapping dataset name to whatever ``eval_beir_rerank_result``
        returned for it, for the last size evaluated. Datasets whose
        evaluation raised an exception are absent.
    """
    if dataset_names is None:
        dataset_names = ["trec-covid", "climate-fever", "dbpedia-entity", "fever", "fiqa", "hotpotqa", "msmarco",  "nfcorpus", "nq", "scidocs", "scifact"]

    all_scores = {}

    for size_name in size_names:
        # Start from an empty dict for every size so that a dataset that fails
        # for this size cannot inherit a stale entry from a previous size.
        all_scores = {}
        for dataset_name in tqdm(dataset_names):
            try:
                dataset_path = f'{beir_root}/{dataset_name}'
                rank_result_path = f'dataset/rank/{dataset_name}/{dataset_name}-rank10.tsv'
                entropy_result_path = f'output/rerank/{dataset_name}/entropy-{size_name}.tsv'
                all_scores[dataset_name] = eval_beir_rerank_result(rank_result_path, entropy_result_path, dataset_path, dataset_name, k_values=[1,3,5,10])
            except Exception as e:
                # Best effort: report which dataset failed and carry on with the rest.
                print(f"Error in {dataset_name}: {e}")
        # Save the scores collected for this size
        save_pickle_file(f"output/rerank/entropy_scores_{size_name}.pkl", all_scores)
    return all_scores
# all_scores = load_pickle_file('output/rerank/entropy_scores_small.pkl')

# Run the evaluation and keep its result in all_scores; %time is an IPython line
# magic that reports how long the statement took.
%time all_scores = run_eval()

def calc_avg_score(all_scores, dataset_names, methods, all_metrics):
    """Average every metric over the datasets, separately for each method.

    Args:
        all_scores: Nested mapping indexed as
            ``all_scores[dataset][method][metric_group][metric_name]``.
        dataset_names: Names of the datasets to average over.
        methods: Method names; they become the columns of the result.
        all_metrics: Sequence of ``(metric_group, metric_name)`` pairs;
            ``metric_name`` becomes the row label.

    Returns:
        pandas.DataFrame with one row per metric and one column per method.
        Each cell is the mean over the datasets that actually have that
        score; it is NaN when no dataset has it.
    """
    import warnings

    import numpy as np
    import pandas as pd

    # Score cube indexed as [metric][method][dataset]. It starts as NaN so a
    # score that cannot be looked up is excluded from the mean instead of
    # being averaged in as 0.
    score_array = np.full((len(all_metrics), len(methods), len(dataset_names)), np.nan)
    missing = set()
    for i, dataset_name in enumerate(dataset_names):
        for j, method in enumerate(methods):
            for k, (metric1, metric2) in enumerate(all_metrics):
                try:
                    score_array[k, j, i] = all_scores[dataset_name][method][metric1][metric2]
                except (KeyError, TypeError):
                    missing.add((dataset_name, method))
    print(score_array.shape)
    if missing:
        warnings.warn(f"scores missing for (dataset, method) pairs: {sorted(missing)}")

    # Mean over the dataset axis, counting only the scores that are present.
    present = ~np.isnan(score_array)
    counts = present.sum(axis=-1)
    totals = np.where(present, score_array, 0.0).sum(axis=-1)
    means = np.divide(totals, counts, out=np.full(totals.shape, np.nan), where=counts > 0)

    # Rows are metric names (second element of each pair), columns are methods.
    return pd.DataFrame(means, index=[m[1] for m in all_metrics], columns=methods)

df = calc_avg_score(all_scores, dataset_names, methods, all_metrics)
# Drop the metrics that are not needed; keep only those whose name ends in '@5'
df[df.index.str.endswith('@5')]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/171332 [00:00<?, ?it/s]

  9%|▉         | 1/11 [00:02<00:22,  2.21s/it]


KeyboardInterrupt: 

(24, 3, 8)


Unnamed: 0,rank,entropy,rerank
MAP@5,0.267825,0.065489,0.065489
MRR@5,0.576873,0.109799,0.109799
NDCG@5,0.449309,0.217816,0.217816
P@5,0.276875,0.200887,0.200887
Recall@5,0.336066,0.146546,0.146546
R_cap@5,0.087,0.077,0.077


In [5]:
def calc_avg_score(all_scores, dataset_names, methods, all_metrics):
    """Average every metric over the datasets, separately for each method.

    Args:
        all_scores: Nested mapping indexed as
            ``all_scores[dataset][method][metric_group][metric_name]``.
        dataset_names: Names of the datasets to average over.
        methods: Method names; they become the columns of the result.
        all_metrics: Sequence of ``(metric_group, metric_name)`` pairs;
            ``metric_name`` becomes the row label.

    Returns:
        pandas.DataFrame with one row per metric and one column per method.
        Each cell is the mean over the datasets that actually have that
        score; it is NaN when no dataset has it.
    """
    import warnings

    import numpy as np
    import pandas as pd

    # Score cube indexed as [metric][method][dataset]. It starts as NaN so a
    # score that cannot be looked up is excluded from the mean instead of
    # being averaged in as 0.
    score_array = np.full((len(all_metrics), len(methods), len(dataset_names)), np.nan)
    missing = set()
    for i, dataset_name in enumerate(dataset_names):
        for j, method in enumerate(methods):
            for k, (metric1, metric2) in enumerate(all_metrics):
                try:
                    score_array[k, j, i] = all_scores[dataset_name][method][metric1][metric2]
                except (KeyError, TypeError):
                    missing.add((dataset_name, method))
    print(score_array.shape)
    if missing:
        warnings.warn(f"scores missing for (dataset, method) pairs: {sorted(missing)}")

    # Mean over the dataset axis, counting only the scores that are present.
    present = ~np.isnan(score_array)
    counts = present.sum(axis=-1)
    totals = np.where(present, score_array, 0.0).sum(axis=-1)
    means = np.divide(totals, counts, out=np.full(totals.shape, np.nan), where=counts > 0)

    # Rows are metric names (second element of each pair), columns are methods.
    return pd.DataFrame(means, index=[m[1] for m in all_metrics], columns=methods)

# Not included in dataset_names below: "fiqa", "msmarco", "scifact"
dataset_names = ["trec-covid", "climate-fever", "dbpedia-entity", "fever", "hotpotqa", "nfcorpus", "nq", "scidocs"]
methods = ["rank", "entropy", "rerank"]
all_metrics = [('map', 'MAP@1'), ('map', 'MAP@10'), ('map', 'MAP@3'), ('map', 'MAP@5'), ('mrr', 'MRR@1'), ('mrr', 'MRR@10'), ('mrr', 'MRR@3'), ('mrr', 'MRR@5'), ('ndcg', 'NDCG@1'), ('ndcg', 'NDCG@10'), ('ndcg', 'NDCG@3'), ('ndcg', 'NDCG@5'), ('precision', 'P@1'), ('precision', 'P@10'), ('precision', 'P@3'), ('precision', 'P@5'), ('recall', 'Recall@1'), ('recall', 'Recall@10'), ('recall', 'Recall@3'), ('recall', 'Recall@5'), ('recall_cap', 'R_cap@1'), ('recall_cap', 'R_cap@10'), ('recall_cap', 'R_cap@3'), ('recall_cap', 'R_cap@5')]

import numpy as np
import pandas as pd

# Average every metric over the datasets with calc_avg_score (defined at the top of
# this cell) instead of repeating its body inline; it also prints the array shape.
df = calc_avg_score(all_scores, dataset_names, methods, all_metrics)

# Keep only the metrics whose name ends in '@5'.
df[df.index.str.endswith('@5')]

(24, 3, 8)


Unnamed: 0,rank,entropy,rerank
MAP@5,0.267825,0.065489,0.065489
MRR@5,0.576873,0.109799,0.109799
NDCG@5,0.449309,0.217816,0.217816
P@5,0.276875,0.200887,0.200887
Recall@5,0.336066,0.146546,0.146546
R_cap@5,0.087,0.077,0.077
