# 根据rank.json构造rank.jsonl
- 根据nq-rank-10.json构造nq-rank-10.jsonl

In [1]:
# !pip install jsonlines

In [2]:
%load_ext autoreload
%autoreload 2

import os
import json
import jsonlines
import pandas as pd
from collections import defaultdict
import shutil
from tqdm import tqdm

In [3]:
# Path of the input JSON file (ranking dataset to convert).
JSON_FILE_PATH = '/Users/song/Downloads/rank-dataset/nq-rank-10.json'

# Path of the JSONL file that the conversion writes.
JSONL_FILE_PATH = '/Users/song/Downloads/rank-dataset/nq-rank-10.jsonl'

# Make sure the folder that will hold the output file exists
# (the head of os.path.split is the same value os.path.dirname returns).
os.makedirs(os.path.split(JSONL_FILE_PATH)[0], exist_ok=True)

## JSON文件格式：
```json
all_data = {
    "metadata": {
        "name": "nq",
        "description": "nq dataset",
        "version": "1.0.0"
    },
    // 遍历顺序
    "query_ids": ["test0", "test1", "test2"],
    // 问题文本
    "queries": {
        "test0": "test0",
        "test1": "test1",
        "test2": "test2"
    },
    // 文档文本
    "docs": {
        "doc0": "doc0",
        "doc1": "doc1",
        "doc2": "doc2"
    },
    // 相关性标注
    "qrels": {
        "test0": ["doc0", "doc1"],
        "test1": ["doc1", "doc2"],
        "test2": ["doc0", "doc2"]
    },
    // 排序结果
    "rank": {
        "test0": ["doc0", "doc1"],
        "test1": ["doc1", "doc2"],
        "test2": ["doc0", "doc2"]
    }
}
```

## JSONL文件格式：

- 不带上下文
```json
{
    "metadata": {
        "src_json_file": "nq-rank-10.json",
        "query_id": "test0",
        "doc_id": "no"
    },
    "id": "nq-test0-no",
    "question": "Why was the General Belgrano sunk?",
    "context": null
}
```

- 带上下文
```json
{
    "metadata": {
        "src_json_file": "nq-rank-10.json",
        "query_id": "test0",
        "doc_id": "doc0"
    },
    "id": "nq-test0-doc0",
    "question": "Why was the General Belgrano sunk?",
    "context": "This is a test context"
}
```

In [4]:
# Load the source JSON file. The encoding is pinned to UTF-8 so that reading
# does not depend on the platform's default locale encoding.
with open(JSON_FILE_PATH, 'r', encoding='utf-8') as f:
    all_data = json.load(f)

# Sentinel doc id used for the "question only, no context" record of each query.
NO_CONTEXT_DOC_ID = 'no'

# Loop-invariant values, computed once instead of once per written record.
src_json_file = os.path.basename(JSON_FILE_PATH)
queries = all_data['queries']
docs = all_data['docs']
rank = all_data['rank']

# For every query (in 'query_ids' order) write one record without context,
# followed by one record per document listed for that query in 'rank'.
with jsonlines.open(JSONL_FILE_PATH, 'w') as writer:
    for qid in tqdm(all_data['query_ids']):
        query = queries[qid]
        for doc_id in [NO_CONTEXT_DOC_ID] + rank[qid]:
            if doc_id == NO_CONTEXT_DOC_ID:
                # The sentinel record never takes a context from `docs`, even
                # if a document happens to be stored under the same id.
                context = None
            else:
                # Single lookup, reused for both the check and the record.
                context = docs.get(doc_id)
                assert context is not None, (
                    f'query {qid!r} ranks doc {doc_id!r}, '
                    f'which has no text in "docs"'
                )
            writer.write({
                'metadata': {
                    'src_json_file': src_json_file,
                    'query_id': qid,
                    'doc_id': doc_id,
                },
                'id': f'nq-{qid}-{doc_id}',
                'question': query,
                'context': context,
            })

100%|██████████| 3452/3452 [00:00<00:00, 18274.01it/s]
