### Dependencies

In [20]:
import pandas as pd
import math
import os
import json

In [21]:
# Directory holding one JSON file per clinical trial record; files are looked up as '<trial id>.json'.
path_to_ctr_directory = '/path/to/clinical_records_directory'
# JSON file with the source statements; loaded with pandas.read_json and then transposed.
path_to_source_df = '/path/to/source.json' 
# Destination JSON file for the final frame with one premise sentence per row.
path_to_output_df = '/path/to/output.json'

### Preprocessing

In [22]:
def is_subsection_heading(answer_line):
    """Tell whether a line looks like a short subsection heading.

    A line counts as a heading when, after surrounding whitespace is
    removed, it is at most 30 characters long and ends with a colon.

    Args:
        answer_line: a single line of text from a trial section.

    Returns:
        True if the line matches the heading pattern, False otherwise.
    """
    trimmed = answer_line.strip()
    # Longer lines are treated as regular content even if they end with ':'.
    if len(trimmed) > 30:
        return False
    return trimmed.endswith(':')

def get_cohort_information(subsection_prefix):
  """Build the cohort-annotated form of a subsection heading.

  Lines that are not subsection headings yield an empty string. For a
  heading, each of the phrases 'adverse events', 'results' and
  'intervention' (matched case-insensitively) gets the word 'cohort'
  appended; whenever a phrase matches, the whole heading is lowercased as
  part of the replacement. A heading that contains none of the phrases is
  returned unchanged.

  Args:
    subsection_prefix: a single line of text from a trial section.

  Returns:
    The augmented heading, or '' when the line is not a heading.
  """
  if not is_subsection_heading(subsection_prefix):
    return ''

  cohort_rewrites = (
    ('adverse events', 'adverse events cohort'),
    ('results', 'results cohort'),
    ('intervention', 'intervention cohort'),
  )
  heading = subsection_prefix
  for phrase, phrase_with_cohort in cohort_rewrites:
    lowered = heading.lower()
    if phrase in lowered:
      heading = lowered.replace(phrase, phrase_with_cohort)
  return heading

In [23]:
def _load_evidence_sentences(trial_id, section_id, evidence_indices, label):
  """Read one trial file and return its labelled evidence sentences.

  Args:
    trial_id: trial identifier; the file '<trial_id>.json' is read from
      path_to_ctr_directory.
    section_id: key of the section to read inside the trial JSON.
    evidence_indices: positions of the evidence lines within that section.
    label: text placed at the start of every sentence (e.g. 'Primary trial: ').

  Returns:
    A list with one string per evidence index: label, then the cohort
    annotation from get_cohort_information (empty for non-heading lines),
    then the original line.
  """
  trial_path = os.path.join(path_to_ctr_directory, trial_id + '.json')
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's default text encoding.
  with open(trial_path, encoding='utf-8') as trial_file:
    section = json.load(trial_file)[section_id]
  return [label + get_cohort_information(section[index]) + section[index]
          for index in evidence_indices]


def get_premise_sentences(row):
  """Return the fully augmented evidence sentences for one source row.

  Args:
    row: a mapping-like record providing 'Section_id', 'Primary_id',
      'Primary_evidence_index' and 'Type'; when 'Type' is 'Comparison' it
      must also provide 'Secondary_id' and 'Secondary_evidence_index'.

  Returns:
    A list of strings: the primary trial's evidence sentences prefixed with
    'Primary trial: ', followed (for comparisons only) by the secondary
    trial's sentences prefixed with 'Secondary trial: '.

  Raises:
    FileNotFoundError: if a referenced trial file does not exist.
    KeyError: if the section is missing from a trial file.
    IndexError: if an evidence index is outside the section.
  """
  section_id = row['Section_id']
  premise_sentences = _load_evidence_sentences(
      row['Primary_id'], section_id, row['Primary_evidence_index'], 'Primary trial: ')
  # Only comparison statements reference a second trial.
  if row['Type'] == 'Comparison':
    premise_sentences.extend(_load_evidence_sentences(
        row['Secondary_id'], section_id, row['Secondary_evidence_index'], 'Secondary trial: '))
  return premise_sentences

In [24]:
# read_json maps the file's top-level keys to columns; transpose so that
# each top-level entry becomes one row instead.
src_dataframe = pd.read_json(path_to_source_df).T

In [25]:
# Compute, row by row, the list of augmented evidence sentences and store
# it in the 'Premise' column.
premise_lists = src_dataframe.apply(get_premise_sentences, axis=1)
src_dataframe['Premise'] = premise_lists

In [26]:
# Give every premise sentence its own row (the other columns are repeated)
# and renumber the rows from 0.
src_dataframe = src_dataframe.explode('Premise').reset_index(drop=True)

# Persist the one-sentence-per-row frame.
src_dataframe.to_json(path_to_output_df)

src_dataframe.head()

Unnamed: 0,Type,Section_id,Primary_id,Statement,Label,Primary_evidence_index,Secondary_id,Secondary_evidence_index,Premise
0,Single,Results,NCT00066573,there is a 13.2 percent (%) difference between...,Contradiction,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",,,Primary trial: Outcome Measurement:Outcome Mea...
1,Single,Results,NCT00066573,there is a 13.2 percent (%) difference between...,Contradiction,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",,,Primary trial: Event-free Survival
2,Single,Results,NCT00066573,there is a 13.2 percent (%) difference between...,Contradiction,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",,,"Primary trial: Event free survival, the primar..."
3,Single,Results,NCT00066573,there is a 13.2 percent (%) difference between...,Contradiction,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",,,Primary trial: Time frame: 5 years
4,Single,Results,NCT00066573,there is a 13.2 percent (%) difference between...,Contradiction,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",,,Primary trial: results cohort 1:Results 1:
