In [1]:
# !pip install jsonlines

In [2]:
%load_ext autoreload
%autoreload 2

import os
import json
import jsonlines
import pandas as pd
from collections import defaultdict
import shutil

In [3]:

# Configuration for building the rank dataset.
# The path defaults are the original local layout; each root can be overridden
# through an environment variable so this cell does not need to be edited to
# run the notebook on another machine.

# Dataset root directory (override: NQ_DATA_ROOT_DIR)
DATA_ROOT_DIR = os.environ.get('NQ_DATA_ROOT_DIR', '/Users/song/Downloads/nq')

# Queries file
QUERIES_FILE = os.path.join(DATA_ROOT_DIR, 'queries.jsonl')

# Documents (corpus) file
DOCS_FILE = os.path.join(DATA_ROOT_DIR, 'corpus.jsonl')

# Relevance judgments file
RELEVANCE_FILE = os.path.join(DATA_ROOT_DIR, 'qrels/test.tsv')

# Ranking file (override: NQ_RANK_FILE)
RANK_FILE = os.environ.get('NQ_RANK_FILE', '/Users/song/Downloads/beir/nq/rank.tsv')

# Dataset name
DATASET_NAME = 'nq-rank'

# Maximum number of ranked documents kept per query
RANK_NUM = 1000

# Output directory (override: RANK_DATASET_OUTPUT_DIR) and output JSON file path
OUTPUT_DIR = os.environ.get('RANK_DATASET_OUTPUT_DIR', '/Users/song/Downloads/rank-dataset')
OUTPUT_PATH = os.path.join(OUTPUT_DIR, f'{DATASET_NAME}-{RANK_NUM}.json')

# Create the output directory if it does not exist yet
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

```json
all_data = {
    "metadata": {
        "name": "nq",
        "description": "nq dataset",
        "version": "1.0.0"
    },
    // 遍历顺序
    "query_ids": ["test0", "test1", "test2"],
    // 问题文本
    "queries": {
        "test0": "test0",
        "test1": "test1",
        "test2": "test2"
    },
    // 文档文本
    "docs": {
        "doc0": "doc0",
        "doc1": "doc1",
        "doc2": "doc2"
    },
    // 相关性标注(文档id -> 相关性标签)
    "qrels": {
        "test0": {"doc0": 1, "doc1": 1},
        "test1": {"doc1": 1, "doc2": 1},
        "test2": {"doc0": 1, "doc2": 1}
    },
    // 排序结果
    "rank": {
        "test0": ["doc0", "doc1"],
        "test1": ["doc1", "doc2"],
        "test2": ["doc0", "doc2"]
    }
}
```

In [4]:
# Skeleton of the output structure; the empty containers are filled in by later cells.
all_data = dict(
    metadata=dict(
        name=f"{DATASET_NAME}-{RANK_NUM}",
        version="1.0.0",
    ),
    query_ids=[],
    queries={},
    docs={},
    qrels={},
    rank={},
)

In [5]:
# Load the queries file into a mapping of query id ('_id') -> query text ('text')
with jsonlines.open(QUERIES_FILE) as query_reader:
    queries = {record['_id']: record['text'] for record in query_reader}

In [6]:
# Load the documents file into a mapping of document id ('_id') -> document text ('text')
with jsonlines.open(DOCS_FILE) as doc_reader:
    docs = {record['_id']: record['text'] for record in doc_reader}

In [7]:
# Read the relevance judgments (.tsv) with pandas; the first line of the file is
# consumed as the header row (header=0) and rows are unpacked positionally as
# (query id, document id, relevance label).
df = pd.read_csv(RELEVANCE_FILE, sep='\t', header=0)
qids = set()
docids = set()
qrels = defaultdict(dict)
# Query ids in order of first appearance in the qrels file. Iterating a set of
# strings gives an order that can change between kernel restarts (hash
# randomization), so the order stored in `query_ids` is tracked separately to
# keep the output reproducible.
query_order = []
for qid, docid, rel in df.values:
    if qid not in qids:
        qids.add(qid)
        query_order.append(qid)
    docids.add(docid)
    qrels[qid][docid] = rel
all_data['qrels'] = qrels
all_data['query_ids'] = query_order

In [8]:
# Load the ranking file with pandas; it is read without a header row, so the
# column names are supplied explicitly.
df = pd.read_csv(
    RANK_FILE,
    sep='\t',
    header=None,
    names=['query-id', 'corpus-id', 'score'],
)
df

Unnamed: 0,query-id,corpus-id,score
0,test291,doc515250,1.710614
1,test291,doc515229,1.701058
2,test291,doc10943,1.677855
3,test291,doc521311,1.666365
4,test291,doc515254,1.665312
...,...,...,...
3451995,test1028,doc1954370,1.008903
3451996,test1028,doc1784450,1.008894
3451997,test1028,doc635552,1.008881
3451998,test1028,doc170128,1.008880


In [9]:
# Collect, per query, the first RANK_NUM documents in the order the rows appear.
# NOTE(review): this keeps rows in file order, so it presumably relies on the rank
# file listing each query's documents best-first — confirm.
rank_data = defaultdict(list)
rank_qids = set()
for query_id, doc_id, _score in df.values:
    rank_qids.add(query_id)
    kept_docs = rank_data[query_id]
    if len(kept_docs) >= RANK_NUM:
        continue
    kept_docs.append(doc_id)
    # ranked documents also need their text written out later
    docids.add(doc_id)
# The ranking file must cover exactly the same queries as the qrels file
assert rank_qids == qids
all_data['rank'] = rank_data

In [10]:
# Attach the text of every collected query id and document id to all_data
query_texts = {query_id: queries[query_id] for query_id in qids}
all_data['queries'] = query_texts
doc_texts = {doc_id: docs[doc_id] for doc_id in docids}
all_data['docs'] = doc_texts

# Serialize all_data to the output JSON file
with open(OUTPUT_PATH, mode='w') as output_file:
    json.dump(all_data, output_file)

In [11]:
# Verify the JSON file by loading it back; all_data is rebound to the loaded copy
with open(OUTPUT_PATH) as saved_file:
    all_data = json.load(saved_file)

In [12]:
# Consistency checks on all_data, one query id at a time
loaded_docs = all_data['docs']
for qid in all_data['query_ids']:
    # the query has text
    assert qid in all_data['queries']
    # every judged document has text
    for judged_id in all_data['qrels'][qid]:
        assert judged_id in loaded_docs
    # the query has a ranking holding exactly RANK_NUM documents
    assert qid in all_data['rank']
    ranked_ids = all_data['rank'][qid]
    assert len(ranked_ids) == RANK_NUM
    # every ranked document has text
    for ranked_id in ranked_ids:
        assert ranked_id in loaded_docs

In [13]:
# Flatten all_data into one record per query: the query text, its ranked
# documents (id + text) and its relevance judgments.
new_data = []
docs_by_id = all_data['docs']
for qid in all_data['query_ids']:
    record = {"query_text": all_data['queries'][qid]}

    ranked_entries = []
    for ranked_id in all_data['rank'][qid]:
        assert ranked_id in docs_by_id
        ranked_entries.append({"doc_id": ranked_id, "doc_text": docs_by_id[ranked_id]})
    record["rank_docs"] = ranked_entries

    # NOTE(review): this stores the qrels entry for the query as-is (the qrels cell
    # builds it as a mapping of document id -> relevance label), not a plain list
    # of ids — confirm that consumers of "ref_doc_ids" expect that shape.
    judged = all_data['qrels'][qid]
    for judged_id in judged:
        assert judged_id in docs_by_id
    record["ref_doc_ids"] = judged

    new_data.append(record)

print(len(new_data))
print(new_data[0])
print(len(new_data[0]['rank_docs']))

3452
1000
