## 筛选含有nct的数据集

In [4]:
import json
import re

def filter_records_by_nct(filename,output_filename):
    nct_pattern = re.compile(r"NCT\d{8}")

    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)

    filtered_records = []
    nct_numbers=[]
    for record in data:
        nct_number = None
        if 'abstract' in record:
            match = nct_pattern.search(record['abstract'])
            if match:
                nct_number = match.group()
                nct_numbers.append(nct_number)
        elif 'content' in record:
            match = nct_pattern.search(record['content'])
            if match:
                nct_number = match.group()
                nct_numbers.append(nct_number)
        if nct_number:
            filtered_records.append(record)
    #print(nct_numbers)
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(filtered_records, f, ensure_ascii=False, indent=4)
    print("含有NCT number的记录数量:", len(filtered_records))

    
filter_records_by_nct('/local/home/sumyao/YSforGIT/dataset/BIO2json/ebm_3.PIOformatchanged_contentLabel.json',
                        '/local/home/sumyao/YSforGIT/dataset/Json2Filtered/ebm_nct_filtered.json')
filter_records_by_nct('/local/home/sumyao/YSforGIT/dataset/BIO2json/picocorpus_1formatransfered_contentlabel.json',
                        '/local/home/sumyao/YSforGIT/dataset/Json2Filtered/picocorpus_nct_filtered.json')
filter_records_by_nct('/local/home/sumyao/YSforGIT/dataset/BIO2json/sectionspecific_5.NCTfilterd_98absPICOpmidAbsTitle.json',
                        '/local/home/sumyao/YSforGIT/dataset/Json2Filtered/sectionspecific_nct_filtered.json')   

含有NCT number的记录数量: 183
含有NCT number的记录数量: 160
含有NCT number的记录数量: 97


## 检索clinical内容并保存

In [5]:
import sys,csv
csv.field_size_limit(sys.maxsize)

def check_clinicaltrials_colnames(csv_file_path):
    with open(csv_file_path, 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        column_names = csv_reader.fieldnames
    return column_names  

def load_clinicaltrials():
    csv_file_path = '/local/home/sumyao/ysmpubmed/DATASET/clinicaltrials.csv'
    nct_data = {}
    nct_numbers = []

    with open(csv_file_path, 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            nct_id = row['NCT Number']  # Assuming "NCT Number" is the column name
            """汇总nct内容
            nct_data[nct_id] = {
                "participants": [row.get("age", ""),row.get("Sex", ""),row.get("Conditions", "")],
                "sentence": row.get("sentence", ""),
                "outcomes": [row.get("Primary Outcome Measures", "") , row.get("Secondary Outcome Measures", "")]
            }"""
            nct_data[nct_id] = {
                "age": row.get("age", ""),
                "gender": row.get("Sex", ""),
                "conditions":row.get("Conditions", ""),
                "interventions": row.get("Interventions", ""),
                "primary outcome measures": row.get("Primary Outcome Measures", "") , 
                "secondary outcome measures":row.get("Secondary Outcome Measures", "")}
            nct_numbers.append(nct_id)
        return nct_data, nct_numbers

## 调用
nct_data,nct_numbers=load_clinicaltrials()
print(len(nct_data))
len(nct_numbers)
nct_numbers[0]
nct_data['NCT00072579']

495377


{'age': '',
 'gender': 'ALL',
 'conditions': 'Leukemia',
 'interventions': 'BIOLOGICAL: sargramostim',
 'primary outcome measures': 'Cytogenetic response (complete and partial)',
 'secondary outcome measures': 'Toxicity as assessed by the Expanded Common Toxicity Criteria v2.0|Time to progression|Survival'}

In [6]:
import os
import json
import re

def add_Retrivel_NCTrecord(json_file_path,output_finename):
    with open(json_file_path, 'r') as json_file:
        json_data = json.load(json_file)
    nct_pattern = re.compile(r'\bNCT\W*\d{8}\b', re.IGNORECASE)


    unmatched_numbers=[]
    matched_records=[]
    for record in json_data:
        nct_number = None
        #record=json_data[1]
        if 'abstract' in record:
            match = nct_pattern.search(record['abstract'])
            if match:
                nct_number = match.group()
                #nct_numbers.append(nct_number)
        elif 'content' in record:
            match = nct_pattern.search(record['content'])
            if match:
                nct_number = match.group()
                #nct_numbers.append(nct_number)
        if nct_number:
            try:
                record['retrieved']=nct_data[nct_number]
                matched_records.append(record)
            except:
                unmatched_numbers.append(nct_number)
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(matched_records, f, ensure_ascii=False, indent=4)
    print("匹配NCTnumber的记录数量:", len(matched_records))
    print(unmatched_numbers)

input_folder = '/local/home/sumyao/YSforGIT/dataset/Json2Filtered/'
output_folder = '/local/home/sumyao/YSforGIT/dataset/Filtered2Added/'
os.makedirs(output_folder, exist_ok=True)
for filename in os.listdir(input_folder):
    if filename.endswith('.json'):  # Process only JSON files
        input_filepath = os.path.join(input_folder, filename)
        output_filename = filename.replace('.json', '_added_withnocluster.json')
        output_filepath = os.path.join(output_folder, output_filename)
        print(output_filepath)
        add_Retrivel_NCTrecord(input_filepath,output_filepath)

/local/home/sumyao/YSforGIT/dataset/Filtered2Added/ebm_nct_filtered_added_withnocluster.json
匹配NCTnumber的记录数量: 181
[]
/local/home/sumyao/YSforGIT/dataset/Filtered2Added/sectionspecific_nct_filtered_added_withnocluster.json
匹配NCTnumber的记录数量: 96
[]
/local/home/sumyao/YSforGIT/dataset/Filtered2Added/picocorpus_nct_filtered_added_withnocluster.json
匹配NCTnumber的记录数量: 160
[]


In [9]:
def check_clinicaltrials_colnames(csv_file_path):
    with open(csv_file_path, 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        column_names = csv_reader.fieldnames
    return column_names 
check_clinicaltrials_colnames( '/local/home/sumyao/ysmpubmed/DATASET/clinicaltrials.csv') 

['NCT Number',
 'Study Title',
 'Study URL',
 'Acronym',
 'Study Status',
 'Brief Summary',
 'Study Results',
 'Conditions',
 'Interventions',
 'Primary Outcome Measures',
 'Secondary Outcome Measures',
 'Sponsor',
 'Collaborators',
 'Sex',
 'Age',
 'Phases',
 'Enrollment',
 'Funder Type',
 'Study Type',
 'Study Design',
 'Start Date',
 'Primary Completion Date',
 'Completion Date',
 'First Posted',
 'Results First Posted',
 'Last Update Posted',
 'Locations',
 'Study Documents']