## Importing packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

import xml.etree.ElementTree as ET
import sys
import os
import pandas as pd
import json
import xmltodict

## Loading RAW DATASET

In [2]:
# Directory that holds the raw CancerGov QA XML files.
DATA_DIR = "/data/user/tr27p/Courses/CS762-NLP/FinalProject/nlp-group-project-fall-2020-deepbiocomp/dataset/raw/1_CancerGov_QA"

# Keep only regular *.xml files: os.listdir() also returns sub-directories and
# stray entries (e.g. ".ipynb_checkpoints") that ET.parse() cannot read.
# NOTE(review): assumes the raw files carry an ".xml" extension -- confirm.
# Sorting makes the processing order deterministic.
DATA_FILES = sorted(
    name
    for name in os.listdir(DATA_DIR)
    if name.lower().endswith('.xml') and os.path.isfile(os.path.join(DATA_DIR, name))
)

## Get the list of XML tags in DATASET

In [3]:
# Collect every distinct XML tag that occurs anywhere in the raw files.
# A set is used so duplicates are never stored in the first place.
tag_names = set()
for each_file in DATA_FILES:
    tree = ET.parse(os.path.join(DATA_DIR, each_file))
    tag_names.update(elem.tag for elem in tree.iter())

# Sorted so the printed list is identical from run to run
# (iteration order of a set of strings is not stable across interpreter runs).
elemList = sorted(tag_names)

print(elemList)

['UMLS', 'SemanticGroup', 'Question', 'Answer', 'SemanticType', 'QAPairs', 'CUI', 'QAPair', 'SemanticTypes', 'FocusAnnotations', 'CUIs', 'Focus', 'Document']


## Parsing XML to Data Frame with necessary tags

In [4]:
data_column_names = ["Focus", "SemanticType", "SemanticGroup", "QType", "Question", "Answer"]

# Rows are collected as plain lists and converted to a DataFrame once at the
# end. DataFrame.append() was removed in pandas 2.0, and growing a frame one
# row at a time copies the whole frame on every iteration.
qa_rows = []
QA_SIZE = 0
for each_file in DATA_FILES:
    tree = ET.parse(DATA_DIR + '/' + each_file)
    root = tree.getroot()

    # Per-file accumulators; values from all <Document> elements of the file
    # are gathered into the same lists.
    SemanticTypes = []
    SemanticGroups = []
    QTypes = []
    Questions = []
    Answers = []
    doc_focus = []

    for each_passage in root.iter('Document'):
        doc_focus.append(each_passage.find('Focus').text)

        for each_SemanticType in each_passage.iter('SemanticType'):
            SemanticTypes.append(each_SemanticType.text)

        for each_SemanticGroup in each_passage.iter('SemanticGroup'):
            SemanticGroups.append(each_SemanticGroup.text)

        for each_Question in each_passage.iter('Question'):
            QTypes.append(each_Question.attrib['qtype'])
            Questions.append(each_Question.text)

        for each_Answer in each_passage.iter('Answer'):
            # An empty <Answer/> element has text None; treat it as an empty string.
            answer_text = each_Answer.text or ''
            Answers.append(answer_text.replace('\n', ' ').replace('\t', ''))

    # Questions and answers are paired by position, so a count mismatch means
    # the pairing cannot be trusted. The file is skipped, but not silently.
    if len(Questions) != len(Answers):
        print("Skipping", each_file, "-", len(Questions), "questions vs", len(Answers), "answers")
        continue
    if not Questions:
        continue

    # Join however many semantic types the file declares. The previous
    # one-or-two special-casing left SemanticType undefined (or stale from the
    # preceding file) for any other count.
    SemanticType = ','.join(text for text in SemanticTypes if text)
    SemanticGroup = SemanticGroups[0] if SemanticGroups else ''

    for index in range(len(Questions)):
        qa_rows.append([doc_focus[0], SemanticType, SemanticGroup,
                        QTypes[index], Questions[index], Answers[index]])
    QA_SIZE += len(Questions)

# One row per question/answer pair, with a fresh 0..n-1 index.
QA_DATAFRAME = pd.DataFrame(qa_rows, columns=data_column_names)


## Splitting Data Frames to TRAIN(80%), DEV(10%), TEST(10%)

In [5]:
# Draw the 80% training sample first; the original row labels are still needed
# to work out which rows are left over.
train_part = QA_DATAFRAME.sample(frac=0.8, random_state=200)

# Whatever was not drawn for training is halved into dev and test.
temp_df = QA_DATAFRAME.drop(train_part.index)
dev_part = temp_df.sample(frac=0.5, random_state=200)
test_part = temp_df.drop(dev_part.index)

# Renumber each split from zero only after all label-based drops are done.
TRAIN_QA_DATAFRAME, DEV_QA_DATAFRAME, TEST_QA_DATAFRAME = (
    part.reset_index(drop=True) for part in (train_part, dev_part, test_part)
)

# Pieces of the size report, printed space-separated in a single call.
summary_parts = (
    "  Train datasize (80%)     :", len(TRAIN_QA_DATAFRAME),
    "\n+ Validation datasize (10%): ", len(DEV_QA_DATAFRAME),
    "\n+ Test datasize (10%)      : ", len(TEST_QA_DATAFRAME),
    "\n", "-" * 32, "\n",
    " Total QA size            :", QA_SIZE,
)
print(*summary_parts)

  Train datasize (80%)     : 583 
+ Validation datasize (10%):  73 
+ Test datasize (10%)      :  73 
 -------------------------------- 
  Total QA size            : 729


## Saving Data frames to Tab Separated File

In [6]:
PREPARED_DATA_PATH ='/data/user/tr27p/Courses/CS762-NLP/FinalProject/nlp-group-project-fall-2020-deepbiocomp/dataset/prepared-data/qa/'

# to_csv() does not create missing directories, so make sure the target exists.
os.makedirs(PREPARED_DATA_PATH, exist_ok=True)

# os.path.join keeps the paths valid whether or not PREPARED_DATA_PATH ends
# with a path separator.
TRAIN_QA_TSV = os.path.join(PREPARED_DATA_PATH, 'train_qa.tsv')
DEV_QA_TSV = os.path.join(PREPARED_DATA_PATH, 'dev_qa.tsv')
TEST_QA_TSV = os.path.join(PREPARED_DATA_PATH, 'test_qa.tsv')

# One tab-separated file per split, without the row index column.
TRAIN_QA_DATAFRAME.to_csv(TRAIN_QA_TSV, sep='\t', index=False)
DEV_QA_DATAFRAME.to_csv(DEV_QA_TSV, sep='\t', index=False)
TEST_QA_DATAFRAME.to_csv(TEST_QA_TSV, sep='\t', index=False)

## Convert Data Frames to JSON format

In [7]:
#Ref: https://stackabuse.com/reading-and-writing-json-to-a-file-in-python/
def dataFrame_to_json(DataFrame_name, OutPut_json_file_name):
    """Write the QA rows of a DataFrame to a JSON file.

    The file contains a single object ``{"qas": [...]}`` with one entry per
    row, in row order. Each entry holds ``index`` (the 0-based position of
    the row) followed by the ``Focus``, ``SemanticType``, ``SemanticGroup``,
    ``QType``, ``Question`` and ``Answer`` values of that row.

    Parameters
    ----------
    DataFrame_name : pandas.DataFrame
        Frame providing the six columns listed above.
    OutPut_json_file_name : str
        Path of the JSON file to create or overwrite.

    Raises
    ------
    KeyError
        If one of the six required columns is missing.
    """
    qa_columns = ["Focus", "SemanticType", "SemanticGroup", "QType", "Question", "Answer"]

    # Rows are taken by position, so the frame does not need a 0..n-1 index
    # (label lookups such as Series[i] fail or pick the wrong row otherwise).
    records = DataFrame_name[qa_columns].to_dict(orient="records")

    QA_JSON = {
        'qas': [{"index": position, **record} for position, record in enumerate(records)]
    }

    with open(OutPut_json_file_name, 'w') as outfile:
        json.dump(QA_JSON, outfile)

## Saving JSON data to JSON file

In [8]:
# Output locations for the three splits, all under the prepared-data folder.
TRAIN_QA_JSON, DEV_QA_JSON, TEST_QA_JSON = (
    PREPARED_DATA_PATH + file_name
    for file_name in ('train_qa.json', 'dev_qa.json', 'test_qa.json')
)

# Export each split to its JSON file: train first, then dev, then test.
for split_frame, json_path in (
    (TRAIN_QA_DATAFRAME, TRAIN_QA_JSON),
    (DEV_QA_DATAFRAME, DEV_QA_JSON),
    (TEST_QA_DATAFRAME, TEST_QA_JSON),
):
    dataFrame_to_json(split_frame, json_path)

## Convert RAW XML to JSON format

In [9]:
#Ref: https://www.geeksforgeeks.org/python-xml-to-json/
def XML_to_json(XML_file_name, OutPut_json_file_name):
    """Convert one XML file to a JSON file.

    The XML document is parsed with ``xmltodict.parse`` and the resulting
    mapping is serialised with ``json.dumps`` into the output file.

    Parameters
    ----------
    XML_file_name : str
        Path of the XML file to read.
    OutPut_json_file_name : str
        Path of the JSON file to create or overwrite.
    """
    # Read raw bytes so the XML parser honours the encoding declared in the
    # document itself rather than the platform's default text encoding.
    with open(XML_file_name, 'rb') as xml_file:
        data_dict = xmltodict.parse(xml_file.read())

    json_data = json.dumps(data_dict)

    # The input is already closed here; each `with` block closes its own file,
    # so no explicit close() calls are needed.
    with open(OutPut_json_file_name, "w") as json_file:
        json_file.write(json_data)

In [10]:
RAW_JSON_DATA_PATH = "/data/user/tr27p/Courses/CS762-NLP/FinalProject/nlp-group-project-fall-2020-deepbiocomp/dataset/raw/1_CancerGov_QA_JSON"

# open(..., "w") does not create missing directories, so make sure the target exists.
os.makedirs(RAW_JSON_DATA_PATH, exist_ok=True)

# Write one JSON file per raw XML file, keeping the same base name.
for each_file in DATA_FILES:
    source = os.path.join(DATA_DIR, each_file)
    # splitext drops only the final extension, so a name containing several
    # dots keeps its full stem and cannot collide with another file's output.
    base_name = os.path.splitext(each_file)[0]
    destination = os.path.join(RAW_JSON_DATA_PATH, base_name + '.json')
    XML_to_json(source, destination)