## Importing packages

In [17]:
import warnings
warnings.filterwarnings('ignore')

import json
import sys
import os
import pandas as pd

## Loading RAW DATASET

In [14]:
# Relative path to the QA dataset JSON file that the parsing cell opens and reads.

DATA_FILES = "../../dataset/new-prepared-data/qa/full_qa.json"

## Parsing JSON into a DataFrame with the necessary fields

In [27]:
data_column_names = ["Answer"]

# Read the dataset explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding, and let json parse the stream directly.
with open(DATA_FILES, encoding="utf-8") as f:
    tree = json.load(f)

# Collect the `context` string of every paragraph of every entry under `data`.
Answers = [
    each_para['context']
    for each_data in tree['data']
    for each_para in each_data['paragraphs']
]

# Number of collected contexts (one per paragraph).
TXT_GEN_SIZE = len(Answers)

# Single-column frame holding one context per row.
TXT_GEN_DATAFRAME = pd.DataFrame(Answers, columns=data_column_names)

## Splitting the DataFrame into TRAIN (80%), DEV (10%), TEST (10%)

In [28]:
# Draw 80% of the rows for training; the fixed seed keeps the split repeatable.
train_rows = TXT_GEN_DATAFRAME.sample(frac=0.8, random_state=200)

# The remaining rows are halved: one half for dev, the other for test.
temp_df = TXT_GEN_DATAFRAME.drop(train_rows.index)
dev_rows = temp_df.sample(frac=0.5, random_state=200)
test_rows = temp_df.drop(dev_rows.index)

# Renumber each split from 0 only after all index-based drops are done.
TRAIN_TXT_GEN_DATAFRAME, DEV_TXT_GEN_DATAFRAME, TEST_TXT_GEN_DATAFRAME = (
    part.reset_index(drop=True) for part in (train_rows, dev_rows, test_rows)
)

# Size report: one entry per print argument, joined by print's default space.
split_report = (
    "  Train datasize (80%)     :", len(TRAIN_TXT_GEN_DATAFRAME),
    "\n+ Validation datasize (10%): ", len(DEV_TXT_GEN_DATAFRAME),
    "\n+ Test datasize (10%)      : ", len(TEST_TXT_GEN_DATAFRAME),
    "\n", "-" * 32, "\n",
    " Total QA size            :", TXT_GEN_SIZE,
)
print(*split_report)

  Train datasize (80%)     : 583 
+ Validation datasize (10%):  73 
+ Test datasize (10%)      :  73 
 -------------------------------- 
  Total QA size            : 729


## Adding $\lt$BOS$\gt$ and $\lt$EOS$\gt$ tags

In [35]:
# Show the first training answer as stored in the frame (row label 0, column 'Answer').
TRAIN_TXT_GEN_DATAFRAME.loc[0, 'Answer']

'Signs and symptoms of osteosarcoma and MFH include swelling over a bone or a bony part of the body and joint pain. These and other signs and symptoms may be caused by osteosarcoma or MFH or by other conditions. Check with a doctor if your child has any of the following: Swelling over a bone or bony part of the body. Pain in a bone or joint. A bone that breaks for no known reason.'

In [36]:
def dataFrame_to_text_generate(DataFrame_name, OutPut_json_file_name):
    """Write the `Answer` column to a text file, one tagged sentence per line.

    Each answer is split on '.'; every non-empty piece is stripped of
    surrounding whitespace and written as ``<BOS>{piece}.<EOS>`` followed by
    a newline.

    Parameters
    ----------
    DataFrame_name : pandas.DataFrame
        Frame with an `Answer` column of strings. Rows are processed in
        positional order, whatever the frame's index labels are.
    OutPut_json_file_name : str
        Path of the output file. It is opened in 'w' mode, so an existing
        file is overwritten. The content written is plain text.

    Notes
    -----
    Splitting on '.' is purely textual: a period inside a number or an
    abbreviation also ends a piece.
    """
    # Explicit UTF-8 keeps the output independent of the platform default encoding.
    with open(OutPut_json_file_name, 'w', encoding='utf-8') as outfile:
        # Iterate over the column values themselves instead of looking rows up
        # by the labels 0..n-1, which fails when the index has not been reset.
        for answer in DataFrame_name['Answer']:
            for line in answer.split('.'):
                line = line.strip()
                if line:
                    outfile.write("<BOS>" + line + ".<EOS>\n")

## Saving data to file

In [37]:
PREPARED_DATA_PATH ='../../dataset/new-prepared-data/text-generation/'

# One output text file per split, all under PREPARED_DATA_PATH.
TRAIN_TXT_GEN_TXT = PREPARED_DATA_PATH + 'train_text-generation.txt'
DEV_TXT_GEN_TXT = PREPARED_DATA_PATH + 'dev_text-generation.txt'
TEST_TXT_GEN_TXT = PREPARED_DATA_PATH + 'test_text-generation.txt'

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing (no error if it is already there).
os.makedirs(PREPARED_DATA_PATH, exist_ok=True)

# Write every split with the same routine instead of repeating the call.
for split_frame, split_path in (
    (TRAIN_TXT_GEN_DATAFRAME, TRAIN_TXT_GEN_TXT),
    (DEV_TXT_GEN_DATAFRAME, DEV_TXT_GEN_TXT),
    (TEST_TXT_GEN_DATAFRAME, TEST_TXT_GEN_TXT),
):
    dataFrame_to_text_generate(split_frame, split_path)