## Importing packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

import xml.etree.ElementTree as ET
import sys
import os
import pandas as pd

## Loading RAW DATASET

In [2]:
# Directory holding the raw CancerGov QA XML files.
DATA_DIR = "/data/user/tr27p/Courses/CS762-NLP/FinalProject/nlp-group-project-fall-2020-deepbiocomp/dataset/raw/1_CancerGov_QA"

# Directory listing in sorted order, so files are always processed in the
# same sequence regardless of the order the filesystem returns them in.
DATA_FILES = os.listdir(DATA_DIR)
DATA_FILES.sort()

## Parsing XML into a Data Frame with the necessary tags

In [3]:
# The resulting frame has a single column holding the answer text.
data_column_names = ["Answer"]

# Gather the text of every <Answer> element found under each <Document>
# element, visiting the files in the order given by DATA_FILES.
Answers = []
for xml_name in DATA_FILES:
    xml_root = ET.parse('/'.join((DATA_DIR, xml_name))).getroot()
    Answers.extend(
        answer_node.text
        for document_node in xml_root.iter('Document')
        for answer_node in document_node.iter('Answer')
    )

# Exactly one entry is collected per <Answer> element, so the total number of
# answers is simply the length of the list.
TXT_GEN_SIZE = len(Answers)

TXT_GEN_DATAFRAME = pd.DataFrame(Answers, columns=data_column_names)

## Splitting the Data Frame into TRAIN (80%), DEV (10%), TEST (10%)

In [4]:
# Draw 80% of the rows for training (fixed seed keeps the split reproducible).
train_rows = TXT_GEN_DATAFRAME.sample(frac=0.8, random_state=200)

# Everything not selected for training is held out, then halved into dev/test.
temp_df = TXT_GEN_DATAFRAME.drop(train_rows.index)
dev_rows = temp_df.sample(frac=0.5, random_state=200)
test_rows = temp_df.drop(dev_rows.index)

# The original row labels were only needed for the drop() calls above;
# renumber each split from 0 so it can be indexed positionally afterwards.
TRAIN_TXT_GEN_DATAFRAME, DEV_TXT_GEN_DATAFRAME, TEST_TXT_GEN_DATAFRAME = (
    split_rows.reset_index(drop=True)
    for split_rows in (train_rows, dev_rows, test_rows)
)

# Summary of the split sizes next to the total number of parsed answers.
print(
    "  Train datasize (80%)     :", len(TRAIN_TXT_GEN_DATAFRAME),
    "\n+ Validation datasize (10%): ", len(DEV_TXT_GEN_DATAFRAME),
    "\n+ Test datasize (10%)      : ", len(TEST_TXT_GEN_DATAFRAME),
    "\n", "-"*32, "\n",
    " Total QA size            :", TXT_GEN_SIZE,
)

  Train datasize (80%)     : 583 
+ Validation datasize (10%):  73 
+ Test datasize (10%)      :  73 
 -------------------------------- 
  Total QA size            : 729


## Adding $\lt$BOS$\gt$ and $\lt$EOS$\gt$ tags

In [5]:
def dataFrame_to_text_generate(DataFrame_name, OutPut_json_file_name):
    """Write every non-empty answer line to a text file, wrapped in <BOS>/<EOS>.

    Each value of the ``Answer`` column is split on newlines. Every resulting
    line is stripped of surrounding whitespace and, if anything remains,
    written to the output file as ``<BOS>line<EOS>`` followed by a newline.

    Parameters
    ----------
    DataFrame_name : pandas.DataFrame
        Frame with an ``Answer`` column. Rows are visited in positional
        order, so the frame does not need a 0..n-1 index.
    OutPut_json_file_name : str
        Path of the file to create or overwrite. Despite the parameter name,
        the file contains plain text lines, not JSON.

    Notes
    -----
    Non-string answers (``None`` from an XML element without text, or NaN)
    are skipped instead of raising ``AttributeError`` on ``.split``.
    """
    # Explicit UTF-8 so the output does not depend on the machine's locale.
    with open(OutPut_json_file_name, 'w', encoding='utf-8') as outfile:
        # Iterate over the values directly rather than looking up labels
        # 0..n-1, which only works on a freshly reset index.
        for answer in DataFrame_name["Answer"]:
            if not isinstance(answer, str):
                continue
            for line in answer.split('\n'):
                line = line.strip()
                if line:
                    outfile.write("<BOS>" + line + "<EOS>\n")

## Saving data to file

In [6]:
# Directory that receives the prepared text-generation splits.
PREPARED_DATA_PATH ='/data/user/tr27p/Courses/CS762-NLP/FinalProject/nlp-group-project-fall-2020-deepbiocomp/dataset/prepared-data/text-generation/'

# Opening a file for writing does not create missing parent directories, so
# make sure the target directory exists (no-op when it is already there).
os.makedirs(PREPARED_DATA_PATH, exist_ok=True)

# One output file per split.
TRAIN_TXT_GEN_TXT = os.path.join(PREPARED_DATA_PATH, 'train_text-generation.txt')
DEV_TXT_GEN_TXT = os.path.join(PREPARED_DATA_PATH, 'dev_text-generation.txt')
TEST_TXT_GEN_TXT = os.path.join(PREPARED_DATA_PATH, 'test_text-generation.txt')

# Write each split with the same routine instead of repeating the call.
for split_frame, split_path in (
    (TRAIN_TXT_GEN_DATAFRAME, TRAIN_TXT_GEN_TXT),
    (DEV_TXT_GEN_DATAFRAME, DEV_TXT_GEN_TXT),
    (TEST_TXT_GEN_DATAFRAME, TEST_TXT_GEN_TXT),
):
    dataFrame_to_text_generate(split_frame, split_path)