In [1]:
import json
import os

import pandas as pd

In [68]:
# Base name of the dataset file to normalize: it is read from
# "<filename>.json" and the normalized copy is written to
# "normalized/<filename>.json".
filename = "test"

In [69]:
# Load the dataset entries; later cells index this dict by entry key.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's locale encoding.
with open(f"{filename}.json", encoding="utf-8") as file:
    file_json = json.load(file)

In [4]:
def read_ct_json(filename):
    """Load one CT (clinical trial) record from the ``CT json`` directory.

    Args:
        filename: Base name of the file, without the ``.json`` extension.

    Returns:
        The parsed JSON content of ``CT json/<filename>.json``.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; decoding explicitly keeps the result
    # independent of the platform's locale encoding.
    with open(f"CT json/{filename}.json", encoding="utf-8") as file:
        return json.load(file)

In [21]:
import json

def write_ct_json(filename, json_content):
    """Write one CT record to ``normalized/CT json/<filename>.json``.

    The output directory is created if it does not exist yet, so the function
    also works on a fresh checkout. An existing file is overwritten.

    Args:
        filename: Base name of the file, without the ``.json`` extension.
        json_content: JSON-serializable object to write (indented by 4).

    Raises:
        TypeError: If ``json_content`` is not JSON-serializable.
    """
    # Without this, open(..., "w") raises FileNotFoundError when the
    # "normalized/CT json" directory has not been created by hand.
    os.makedirs("normalized/CT json", exist_ok=True)

    with open(f"normalized/CT json/{filename}.json", "w", encoding="utf-8") as file:
        json.dump(json_content, file, indent=4)

In [54]:
import re

# Ordered (pattern, replacement) pairs applied by normalize_text.
# Order matters: "</=" and ">/=" must be rewritten before "<=" and ">=", and
# those before the bare "<" and ">", otherwise the shorter pattern would
# consume part of the longer one.
# All patterns are raw strings so that regex escapes such as "\+" are not
# treated as (invalid) Python string escapes.
_NORMALIZATION_RULES = (
    # special symbols
    (r"([0-9]+)\+", r" more than \1"),
    (r"</=", r" less than or equal to "),
    (r">/=", r" greater than or equal to "),
    (r"<=", r" less than or equal to "),
    (r">=", r" greater than or equal to "),
    (r"<", r" less than "),
    (r">", r" greater than "),
    (r"%", r" percent (%)"),
    # abbreviations (the leading space keeps them from matching mid-word)
    (r" AEs", r" Adverse Events"),
    (r" Aes", r" Adverse Events"),
    (r" PFS", r" Progression Free Survival"),
    (r" IV ", r" intravenous "),
    (r" PO ", r" orally "),
    (r" qd ", r" every day "),
)


def normalize_text(text):
    """Spell out comparison symbols and a few abbreviations in ``text``.

    The rules in ``_NORMALIZATION_RULES`` are applied one after another, so a
    later rule sees the output of the earlier ones. Replacements are padded
    with spaces and existing whitespace is not collapsed, so the result may
    contain double spaces. Only leading and trailing whitespace is stripped.

    The function is not idempotent: the replacement for "%" itself contains
    "%", so normalizing an already normalized string expands it again.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string with surrounding whitespace removed.
    """
    for pattern, replacement in _NORMALIZATION_RULES:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [55]:
def normalize_adverse_event(text):
    """Spell out ``x/y`` ratios as "x cases out of y participants".

    Example: ``"5/50"`` becomes ``"5 cases out of 50 participants"`` and
    ``"1/50"`` becomes ``"1 case out of 50 participants"``.

    The singular "case" is used only when the count is exactly 1. Counts that
    merely end in the digit 1 (11, 21, 101, ...) get the plural form.

    Args:
        text: A line of text that may contain ``x/y`` ratios of integers.

    Returns:
        ``text`` with every ``x/y`` ratio replaced by its spelled-out form.
    """
    def _spell_out(match):
        cases, participants = match.group(1), match.group(2)
        # Compare numerically so that a zero-padded "01" is still singular.
        noun = "case" if int(cases) == 1 else "cases"
        return f"{cases} {noun} out of {participants} participants"

    # A single pass over whole numbers: matching the full count, rather than
    # a trailing "1", is what keeps "11/50" from being treated as singular.
    return re.sub(r"([0-9]+)/([0-9]+)", _spell_out, text)

In [56]:
def is_subsection_heading(answer_line):
    """Tell whether ``answer_line`` looks like a short heading.

    A line counts as a heading when, after surrounding whitespace is removed,
    it ends with a colon and is at most 30 characters long.
    """
    stripped = answer_line.strip()
    if not stripped.endswith(':'):
        return False
    return len(stripped) <= 30

In [61]:
def _normalize_sections(ct_json):
    """Normalize, in place, every line of the four text sections of a CT record."""
    for section in ('Eligibility', 'Intervention', 'Results', 'Adverse Events'):
        lines = [normalize_text(line) for line in ct_json[section]]

        # Adverse-event lines additionally get their "x/y" ratios spelled out.
        if section == 'Adverse Events':
            lines = [normalize_adverse_event(line) for line in lines]

        ct_json[section] = lines

    return ct_json


def process_data_item(key):
    """Normalize one dataset entry and the CT record(s) it refers to.

    The entry is looked up in the module-level ``file_json`` dict. Its
    ``'Statement'`` is normalized with ``normalize_text``; the record named by
    ``'Primary_id'`` and, when present, the one named by ``'Secondary_id'``
    are loaded with ``read_ct_json`` and have their 'Eligibility',
    'Intervention', 'Results' and 'Adverse Events' sections normalized.
    Nothing is written to disk and ``file_json`` is not modified.

    Args:
        key: Key of the entry in ``file_json``.

    Returns:
        A tuple ``(statement, primary, secondary)``: the normalized statement,
        the normalized primary record, and the normalized secondary record
        (an empty dict when the entry has no ``'Secondary_id'``).

    Raises:
        KeyError: If the entry lacks ``'Primary_id'`` or ``'Statement'``, or
            a loaded record lacks one of the four sections.
    """
    entry = file_json[key]

    ct_json = read_ct_json(entry['Primary_id'])
    processed_qn = normalize_text(entry['Statement'])
    _normalize_sections(ct_json)

    secondary_ct_json = {}
    if 'Secondary_id' in entry:
        secondary_ct_json = _normalize_sections(read_ct_json(entry['Secondary_id']))

    return processed_qn, ct_json, secondary_ct_json

In [66]:
# Spot-check: normalize a single entry and display the returned
# (statement, primary record, secondary record) tuple as the cell output.
process_data_item('83b83400-1439-462d-bba3-42817b5b1fa1')
# Other entry keys that can be swapped in for the same check:
#process_data_item('0a6d1b4c-244e-44e2-a229-62e4cbdfa979')
#process_data_item('9f978634-637c-472f-a588-6f4bb2fb121f')

('Most of the cases of CHF in the primary trial, were in cohort 1.',
 {'Clinical Trial ID': 'NCT00777049',
  'Intervention': ['INTERVENTION 1:',
   'ER+ and/or PgR+ (Arm I)',
   'Panobinostat - LBH589: hard gelatine capsule - 5mg and 20mg',
   'INTERVENTION 2:',
   'ER- and PgR- (Arm II)',
   'Panobinostat - LBH589: hard gelatine capsule - 5mg and 20mg'],
  'Eligibility': ['Inclusion Criteria:',
   'Written informed consent obtained prior to any study-related procedures',
   'Women  18 years old',
   'Patients with an ECOG performance status of  2 assessed within 2 weeks (14 days) prior to registration',
   'Histologically or cytologically confirmed breast cancer with locally recurrent or radiological evidence of metastatic disease. Locally recurrent disease must not be amenable to resection with curative intent.',
   'Measurable disease per RECIST (Response Evaluation Criteria in Solid Tumor) guidelines',
   'HER2-negative patients by local laboratory testing (IHC 0 or  more than 1 st

In [70]:
data_items = []
# Normalize every entry and write the normalized CT record(s) it refers to.
# NOTE: this loop overwrites file_json[key]['Statement'] in place, and
# normalize_text is not idempotent ("%" expands to text that contains "%"),
# so reload file_json before re-running this cell.
for key in file_json.keys():
    new_qn, ct_json, secondary_ct_json = process_data_item(key)
    file_json[key]['Statement'] = new_qn

    write_ct_json(file_json[key]['Primary_id'], ct_json)

    # process_data_item loads a secondary record whenever 'Secondary_id' is
    # present, so use the same test here. Checking 'Type' instead could raise
    # KeyError for an entry without 'Secondary_id', or skip writing a record
    # that was normalized.
    if 'Secondary_id' in file_json[key]:
        write_ct_json(file_json[key]['Secondary_id'], secondary_ct_json)

# write the dev/train/test.json
with open(f"normalized/{filename}.json", "w") as file:
    json.dump(file_json, file, indent=4)