### Dependencies

In [None]:
import pandas as pd
import math
import os
import json

In [None]:
# Directory holding the clinical trial records; each trial is read from '<trial id>.json' inside it.
path_to_ctr_directory = '/path/to/clinical_records_directory'
# JSON file that is loaded as the source dataframe to preprocess.
path_to_source_df = '/path/to/source.json' 
# JSON file the preprocessed dataframe is written to.
path_to_output_df = '/path/to/output.json'

# Only used when preprocessing task 2 results; not required for preprocessing train/val sets.
# This is either dev.json or train.json from the original set.
# Task 2 outputs contain only the example ID + evidence indexes, so each row needs to be
# augmented with the original columns (Primary_id, Secondary_id, Statement, ...).
path_to_original_dataframe = '/path/to/original/test.json' 

# When True, every premise sentence is exploded into its own row before saving.
is_train_dataframe = False
# When True, the source is treated as task 2 output and is augmented from path_to_original_dataframe.
is_task_2_results_dataframe = False 

### Preprocessing

In [None]:
def is_subsection_heading(answer_line):
    """Tell whether a line looks like a short subsection heading.

    A line counts as a heading when, ignoring surrounding whitespace, it is
    at most 30 characters long and ends with a colon.
    """
    stripped_line = answer_line.strip()
    if len(stripped_line) > 30:
        return False
    return stripped_line.endswith(':')

def get_cohort_information(subsection_prefix):
    """Augment a subsection heading with cohort information.

    Returns '' when the line is not a subsection heading. Otherwise, for each
    known section keyword found in the heading (case-insensitively), the
    heading is lowercased and the keyword is suffixed with ' cohort'. A heading
    containing none of the keywords is returned unchanged.
    """
    if not is_subsection_heading(subsection_prefix):
        return ''

    # Order matters only in that it mirrors the sequence the checks are applied in.
    keyword_rewrites = (
        ('adverse events', 'adverse events cohort'),
        ('results', 'results cohort'),
        ('intervention', 'intervention cohort'),
    )
    for keyword, keyword_with_cohort in keyword_rewrites:
        lowered_prefix = subsection_prefix.lower()
        if keyword in lowered_prefix:
            subsection_prefix = lowered_prefix.replace(keyword, keyword_with_cohort)
    return subsection_prefix

In [None]:
def _get_trial_evidence_sentences(trial_id, section_id, evidence_indices, trial_label):
    """Load one trial record and return its labelled evidence lines.

    Args:
        trial_id: Trial identifier; the record is read from
            '<trial_id>.json' inside ``path_to_ctr_directory``.
        section_id: Key of the section to read from the trial JSON.
        evidence_indices: Positions of the evidence lines within that section.
        trial_label: Prefix placed in front of every returned sentence.

    Returns:
        One string per evidence index: the label, then the output of
        ``get_cohort_information`` for the line, then the line itself.
    """
    trial_path = os.path.join(path_to_ctr_directory, trial_id + '.json')
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(trial_path, encoding='utf-8') as trial_file:
        section_lines = json.load(trial_file)[section_id]
    return [
        trial_label + get_cohort_information(section_lines[index]) + section_lines[index]
        for index in evidence_indices
    ]


def get_premise_sentences(row):
    """Return the fully augmented evidence sentences for a given premise.

    Args:
        row: A dataframe row providing 'Section_id', 'Primary_id',
            'Primary_evidence_index' and 'Type'; rows whose 'Type' is
            'Comparison' must also provide 'Secondary_id' and
            'Secondary_evidence_index'.

    Returns:
        A list with the primary trial sentences (prefixed 'Primary trial: ')
        followed, for comparison rows, by the secondary trial sentences
        (prefixed 'Secondary trial: ').

    Raises:
        FileNotFoundError: If a referenced trial JSON file does not exist.
        KeyError: If the section is missing from a trial record.
    """
    section_id = row['Section_id']
    premise_sentences = _get_trial_evidence_sentences(
        row['Primary_id'], section_id, row['Primary_evidence_index'], 'Primary trial: ')
    # Only comparison statements reference a second trial.
    if row['Type'] == 'Comparison':
        premise_sentences.extend(_get_trial_evidence_sentences(
            row['Secondary_id'], section_id, row['Secondary_evidence_index'], 'Secondary trial: '))
    return premise_sentences

In [None]:
# Load the source JSON, then swap rows and columns of the loaded frame.
src_dataframe = pd.read_json(path_to_source_df)
src_dataframe = src_dataframe.T

In [None]:
# Task 2 results need to be augmented with at least the clinical trial IDs and the statement, so we can evaluate.
if is_task_2_results_dataframe:
  # Undoes the transpose applied right after loading the source file.
  # NOTE(review): presumably task 2 results are stored in the opposite orientation - confirm.
  src_dataframe = src_dataframe.transpose()
  original_df = pd.read_json(path_to_original_dataframe).transpose()

  # A set keeps the membership test below O(1); it runs once per row for every copied column.
  original_df_instance_ids = set(original_df.index)

  def get_column_value(instance_id, column):
    """Return the original dataframe's value for (instance_id, column), or None for unknown IDs."""
    return original_df.loc[instance_id][column] if instance_id in original_df_instance_ids else None

  original_df_column_names = list(original_df.columns.values)

  # Copy every column of the original dataframe over, matching rows by instance ID (the row label).
  for column_name in original_df_column_names:
    src_dataframe[column_name] = src_dataframe.apply(lambda row: get_column_value(row.name, column_name), axis=1)

In [None]:
# Build the 'Premise' column: one list of augmented evidence sentences per row.
src_dataframe['Premise'] = src_dataframe.apply(func=get_premise_sentences, axis='columns')

In [None]:
# For the train set only: give every premise sentence its own row.
if is_train_dataframe:
  # explode repeats the row label for each sentence, so the rows are renumbered afterwards.
  src_dataframe = src_dataframe.explode('Premise').reset_index(drop=True)

# Persist the preprocessed dataframe.
src_dataframe.to_json(path_or_buf=path_to_output_df)

# Preview the first rows of the result.
src_dataframe.head(5)