# 构造rank数据集

## 尺寸说明
- `T`/`toy`: 玩具尺寸，10个query
- `S`/`small`: 小尺寸，50个query
- `M`/`medium`: 中尺寸，待定
- `L`/`large`: 大尺寸，待定
- `A`/`all`: 完整尺寸，所有query
- rank数量默认为10；若使用100个rank，可在尺寸名后追加该数字，例如`small100`表示50个query、100个rank

In [1]:
%load_ext autoreload
%autoreload 2

import os
import json
import jsonlines
import pandas as pd
from collections import defaultdict
import shutil
from tqdm import tqdm
import csv
from make_rank_dataset import load_dataset, load_rank_results, make_rank_dataset

In [None]:
# Root directory of the raw BEIR datasets (one sub-directory per dataset).
DATASET_DIR = '/home/song/dataset/beir'
# Root directory of the retrieval results (one `rank.tsv` per dataset).
RANK_RESULT_DIR = '/home/song/dataset/first/beir_rank'
# The 11 datasets used by FIRST.
BEIR_DATASET_NAMES = [
    "trec-covid",
    "climate-fever",
    "dbpedia-entity",
    "fever",
    "fiqa",
    "hotpotqa",
    "msmarco",
    "nfcorpus",
    "nq",
    "scidocs",
    "scifact",
]
# Each entry is (size suffix, number of queries, number of documents).
SIZES = [
    ("toy", 10, 10),
    ("small", 50, 10),
]

# Build one JSONL file per (dataset, size) combination under dataset/rank/<dataset>/.
for dataset_name in tqdm(BEIR_DATASET_NAMES):
    dataset_path = os.path.join(DATASET_DIR, dataset_name)
    rank_result_path = os.path.join(RANK_RESULT_DIR, dataset_name, "rank.tsv")
    output_dir = os.path.join('dataset/rank', dataset_name)
    for size_name, query_num, rank_num in SIZES:
        output_path = os.path.join(output_dir, f"{dataset_name}-{size_name}.jsonl")
        # Make sure the per-dataset output directory exists before writing into it.
        os.makedirs(output_dir, exist_ok=True)
        make_rank_dataset(
            dataset_name,
            dataset_path,
            rank_result_path,
            output_path,
            query_num,
            rank_num,
        )


######## Dataset trec-covid
Total 50 questions have relevance results
Total 50 questions have rank results
Total 50 questions have both relevance and rank results
> Use 10 questions


100%|██████████| 10/10 [00:37<00:00,  3.76s/it]

len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
Saved to dataset/rank/trec-covid/trec-covid-toy.tsv
Total 10 prompts
######## Dataset trec-covid





Total 50 questions have relevance results
Total 50 questions have rank results
Total 50 questions have both relevance and rank results
> Use 50 questions


100%|██████████| 50/50 [00:00<00:00, 37065.25it/s]

len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
l




Total 1535 questions have relevance results
Total 1535 questions have rank results
Total 1535 questions have both relevance and rank results
> Use 10 questions


100%|██████████| 10/10 [00:00<00:00, 24921.59it/s]

len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
Saved to dataset/rank/climate-fever/climate-fever-toy.tsv
Total 10 prompts
######## Dataset climate-fever





Total 1535 questions have relevance results
Total 1535 questions have rank results
Total 1535 questions have both relevance and rank results
> Use 50 questions


100%|██████████| 50/50 [00:00<00:00, 30700.51it/s]

len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
len(rank_results[qid]): 0
doc_ids: 1
l




######## Dataset dbpedia-entity


KeyboardInterrupt: 

In [None]:
# Sanity check: load a raw dataset and print how many entries each of the
# three objects returned by `load_dataset` contains.
# NOTE(review): `load_dataset` is already imported in the first cell; this
# re-import only matters when the cell is run on its own.
from make_rank_dataset import load_dataset

# Root directory of the raw BEIR datasets.
DATASET_DIR = '/home/song/dataset/beir'
# "cqadustack" is currently left out of this check.
# NOTE: this rebinds the notebook-level BEIR_DATASET_NAMES to a single dataset.
BEIR_DATASET_NAMES = ["msmarco"]
for dataset_name in BEIR_DATASET_NAMES:
    dataset_path = os.path.join(DATASET_DIR, dataset_name)
    # `queries`, `docs` and `scores` stay bound after the loop (last dataset
    # only); a later cell reads `scores`.
    queries, docs, scores = load_dataset(dataset_path)
    print(f"{dataset_name}, query: {len(queries)}, doc: {len(docs)}, score: {len(scores)}")

In [None]:




# Load the rank results of msmarco and report how many entries they contain.
# `results` stays bound at notebook scope for the cells that follow.
dataset_name = "msmarco"
rank_result_path = os.path.join(RANK_RESULT_DIR, dataset_name, "rank.tsv")
results = load_rank_results(rank_result_path)
print(f"{dataset_name}, query: {len(results)}")
    

In [None]:
# Compare the keys of the rank results with the keys of the relevance scores.
# The score keys are converted to `str` before the two sets are intersected.
rank_qids = set(results.keys())
rel_qids = {str(qid) for qid in scores.keys()}
# Prints: number of rank keys, number of score keys, size of their intersection.
print(len(rank_qids), len(rel_qids), len(rank_qids.intersection(rel_qids)))

In [None]:
# Display the set of keys taken from the rank results.
rank_qids

In [None]:
# Display the set of stringified keys taken from the relevance scores.
rel_qids

In [None]:
# Display the keys present in both the rank results and the relevance scores.
rank_qids & rel_qids