In [1]:
import pandas as pd

# Read the raw training JSON file into a pandas DataFrame.
# (The column listing printed further down shows a single 'questions' column.)
dataset_train = pd.read_json('training12b_new.json')

In [4]:
# Inspect the structure of the loaded data, starting with the column names
print("数据集的列名:", dataset_train.columns, sep="\n")

# Show the first row as a plain dict, followed by a separator line
first_record = dataset_train.iloc[0].to_dict()
print("\n第一条数据的内容:", first_record, sep="\n")
print("*" * 100)

数据集的列名:
Index(['questions'], dtype='object')

第一条数据的内容:
{'questions': {'body': 'Is Hirschsprung disease a mendelian or a multifactorial disorder?', 'documents': ['http://www.ncbi.nlm.nih.gov/pubmed/15858239', 'http://www.ncbi.nlm.nih.gov/pubmed/20598273', 'http://www.ncbi.nlm.nih.gov/pubmed/15829955', 'http://www.ncbi.nlm.nih.gov/pubmed/6650562', 'http://www.ncbi.nlm.nih.gov/pubmed/12239580', 'http://www.ncbi.nlm.nih.gov/pubmed/21995290', 'http://www.ncbi.nlm.nih.gov/pubmed/15617541', 'http://www.ncbi.nlm.nih.gov/pubmed/23001136', 'http://www.ncbi.nlm.nih.gov/pubmed/8896569'], 'ideal_answer': ["Coding sequence mutations in RET, GDNF, EDNRB, EDN3, and SOX10 are involved in the development of Hirschsprung disease. The majority of these genes was shown to be related to Mendelian syndromic forms of Hirschsprung's disease, whereas the non-Mendelian inheritance of sporadic non-syndromic Hirschsprung disease proved to be complex; involvement of multiple loci was demonstrated in a multiplicati

In [7]:
import json

# Process every entry of the 'questions' column. Only factoid and yesno
# questions are kept; each one is flattened into a record whose 'contexts'
# are the 'text' fields of its snippets.
processed_data = [
    {
        'question': entry['body'],
        'id': entry['id'],
        'type': entry['type'],
        'contexts': [snip['text'] for snip in entry['snippets']],
        'answers': entry['exact_answer'],
    }
    for entry in dataset_train['questions']
    if entry['type'] in ('factoid', 'yesno')
]

# Save with the json module, indented by 4 spaces; ensure_ascii=False keeps
# non-ASCII characters unescaped in the output file
with open('training12b_process.json', 'w', encoding='utf-8') as out_file:
    json.dump(processed_data, out_file, ensure_ascii=False, indent=4)

print(f"处理完成！共处理了 {len(processed_data)} 条数据")
print("数据已保存到 training12b_process.json")

# Show the first processed record as a sample
sample_text = json.dumps(processed_data[0], ensure_ascii=False, indent=4)
print("\n处理后的数据示例（第一条）:")
print(sample_text)

处理完成！共处理了 2872 条数据
数据已保存到 training12b_process.json

处理后的数据示例（第一条）:
{
    "question": "Is the protein Papilin secreted?",
    "id": "54e25eaaae9738404b000017",
    "type": "yesno",
    "contexts": [
        "Using expression analysis, we identify three genes that are transcriptionally regulated by HLH-2: the protocadherin cdh-3, and two genes encoding secreted extracellular matrix proteins, mig-6/papilin and him-4/hemicentin. ",
        "We found that mig-6 encodes long (MIG-6L) and short (MIG-6S) isoforms of the extracellular matrix protein papilin, each required for distinct aspects of DTC migration. Both MIG-6 isoforms have a predicted N-terminal papilin cassette",
        "apilins are homologous, secreted extracellular matrix proteins which share a common order of protein domains. ",
        "The TSR superfamily is a diverse family of extracellular matrix and transmembrane proteins, many of which have functions related to regulating matrix organization, cell-cell interactions and ce

In [1]:
import pandas as pd

# Read the processed JSON file (written by the earlier processing cell) into a DataFrame
dataset_train = pd.read_json('training12b_process.json')

In [4]:
import json

# Load the processed JSON file
with open('training12b_process.json', 'r', encoding='utf-8') as src:
    data = json.load(src)

# Normalize the 'answers' field: a bare string becomes a one-element list,
# any other value is left untouched
for record in data:
    answer = record['answers']
    if isinstance(answer, str):
        record['answers'] = [answer]

# Save the modified dataset to a new file
with open('training12b_process_modified.json', 'w', encoding='utf-8') as dst:
    json.dump(data, dst, ensure_ascii=False, indent=4)

print(f"处理完成！共处理了 {len(data)} 条数据")
print("数据已保存到 training12b_process_modified.json")

处理完成！共处理了 2872 条数据
数据已保存到 training12b_process_modified.json
