In [1]:
import json
import stanza
from time import sleep

# Data

In [2]:
# Load the train and dev JSON files. JSON text is UTF-8, so the encoding is
# passed explicitly instead of relying on the platform default, which is not
# UTF-8 on every system.
with open('drop_dataset_train.json', encoding='utf-8') as json_file:
    train_data = json.load(json_file)
with open('drop_dataset_dev.json', encoding='utf-8') as json_file:
    dev_data = json.load(json_file)
# `dataset` is what the rest of the notebook processes; point it at the split
# that should be annotated.
dataset = train_data

In [3]:
# Report how many top-level entries the selected split contains.
print('Datasize: {s}'.format(s=len(dataset)))

Datasize: 5565


In [4]:
# Result containers, each keyed by passage key and filled by the extraction
# loop: the *_text dicts hold entity strings, the *_type dicts hold
# (text, type) pairs.
passage_text, passage_type = {}, {}
question_answer_text, question_answer_type = {}, {}

# CoreNLP (Stanza pipeline)

In [5]:
# Build the stanza pipeline with its default configuration; the only override
# is the logging level, raised to FATAL to keep the notebook output quiet.
nlp_model = stanza.Pipeline(
    logging_level='FATAL',
)

# Passage

In [6]:
def _extract_entities(text):
    """Run ``nlp_model`` on ``text`` and return its entities in both output formats.

    Returns:
        A pair ``(texts, typed)``: ``texts`` is the list of entity strings and
        ``typed`` the list of ``(text, type)`` tuples, both in ``doc.ents`` order.
    """
    entities = nlp_model(text).ents
    return [ent.text for ent in entities], [(ent.text, ent.type) for ent in entities]


def _annotate_answer(answer):
    """Return the ``(text, typed)`` representation of one answer dictionary.

    A non-empty ``number`` takes precedence, then non-empty ``spans`` (one
    entity list per span); otherwise the ``date`` field is passed through.
    """
    if len(answer["number"]) != 0:
        return answer["number"], [(answer["number"], "number")]
    if len(answer["spans"]) != 0:
        per_span = [_extract_entities(span) for span in answer["spans"]]
        return [texts for texts, _ in per_span], [typed for _, typed in per_span]
    return answer["date"], [(answer["date"], "date")]


def _save_results():
    """Write the four result dictionaries to their JSON files."""
    outputs = (
        ('passage_text.json', passage_text),
        ('passage_type.json', passage_type),
        ('question_answer_text.json', question_answer_text),
        ('question_answer_type.json', question_answer_type),
    )
    for path, payload in outputs:
        with open(path, 'w') as fp:
            json.dump(payload, fp)


temp = 0  # number of passages handled so far (successful or not)
total = len(dataset)
# Passages that could not be processed, as (passage key, repr(exception)).
# Inspect this list after the run to see what was skipped and why.
ex_list = []
for key, item in dataset.items():
    try:
        # Build everything for this passage in locals first, so a failure
        # half-way through cannot leave a truncated question/answer list behind.
        p_text, p_type = _extract_entities(item['passage'])
        qa_text, qa_type = [], []
        for q in item["qa_pairs"]:
            question_text, question_type = _extract_entities(q["question"])
            answer_text, answer_type = _annotate_answer(q["answer"])
            qa_text.append({"question": question_text, "answer": answer_text})
            qa_type.append({"question": question_type, "answer": answer_type})
    except Exception as exc:
        # Best effort: skip the passage but keep a record of it. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop the run.
        ex_list.append((key, repr(exc)))
    else:
        passage_text[key] = p_text
        passage_type[key] = p_type
        question_answer_text[key] = qa_text
        question_answer_type[key] = qa_type

    # Count every passage exactly once, whether it succeeded or failed, so the
    # progress bar always reaches 100%.
    temp += 1
    filled = int(temp * 20 / total)
    print('\r' + '[Progress]:[%s%s]%.2f%%;' % ('█' * filled, ' ' * (20 - filled), float(temp / total * 100)), end='')
    # Checkpoint after the 1st, 101st, 201st, ... passage. Write errors are
    # not per-passage failures, so they propagate instead of being swallowed.
    if temp % 100 == 1:
        _save_results()
    

[Progress]:[████████████████████]100.00%;

In [7]:
# Final save: write each result dictionary to its JSON file, overwriting any
# checkpoint written during the extraction loop.
for out_path, payload in (
    ('passage_text.json', passage_text),
    ('passage_type.json', passage_type),
    ('question_answer_text.json', question_answer_text),
    ('question_answer_type.json', question_answer_type),
):
    with open(out_path, 'w') as fp:
        json.dump(payload, fp)

# Output:

<details>
  <summary>Passage</summary>
  
  * Passage_text(dict)  
      * KEY: passage key(list):
          * passage word (from CoreNLP)(str):
  * Passage_type(dict)  
      * KEY: passage key(list):
          * (passage word,type)(tuple):
</details>
<details>
  <summary>Question_Answer</summary>
  
  * question_answer_text(dict)  
      * KEY: passage key(list):
          * question and answer(dict):
              * KEY: question(list):
                  * word (from CoreNLP)(str): 
              * KEY: answer(list):
                  * answer; the format depends on the answer type:
                      * number, date: same as in the original dictionary
                      * spans(list):
                          * single span(list):
                              * word (from CoreNLP)(str)
  * question_answer_type(dict)  
      * KEY: passage key(list):
          * question and answer(dict):
              * KEY: question(list):
                  * (word, type) (from CoreNLP)(tuple):
              * KEY: answer(list):
                  * answer; the format depends on the answer type:
                      * number, date: a one-element list holding (answer as in the original dictionary, type)(tuple)
                      * spans(list):
                          * single span(list):
                              * (word, type)(tuple)
</details>

# Test


# Smoke test: run the extraction on the first dataset entry only (the loop
# breaks after one iteration) and without any exception handling, so errors
# surface immediately.
count = 0  # not read anywhere in this cell
for key, item in dataset.items():
    doc = nlp_model(item['passage'])
    passage_text[key] = [ent.text for ent in doc.ents]
    passage_type[key] = [(ent.text, ent.type) for ent in doc.ents]

    text_records = question_answer_text[key] = []
    type_records = question_answer_type[key] = []
    for q in item["qa_pairs"]:
        doc = nlp_model(q["question"])
        q_text = {"question": [ent.text for ent in doc.ents]}
        q_type = {"question": [(ent.text, ent.type) for ent in doc.ents]}

        answer = q["answer"]
        if len(answer["number"]) != 0:
            # Numeric answers are passed through unchanged and tagged "number".
            q_text["answer"] = answer["number"]
            q_type["answer"] = [(answer["number"], "number")]
        elif len(answer["spans"]) != 0:
            # Span answers: one entity list per span.
            span_docs = [nlp_model(anw) for anw in answer["spans"]]
            q_text["answer"] = [[ent.text for ent in d.ents] for d in span_docs]
            q_type["answer"] = [[(ent.text, ent.type) for ent in d.ents] for d in span_docs]
        else:
            # Neither number nor spans: fall back to the date field.
            q_text["answer"] = answer["date"]
            q_type["answer"] = [(answer["date"], "date")]

        text_records.append(q_text)
        type_records.append(q_type)
    break