## Notebook for exploring the WebQuestionsSP (WebQSP) dataset and saving it as JSON train/validation/test files

To use, download the WebQSP zip file from https://www.microsoft.com/en-us/download/details.aspx?id=52763 and unzip it in the CL4Code folder.

In [6]:
import json

%cd ~/CL4Code/WebQSP/data


# Load the WebQSP training split; the context manager closes the file once it is parsed.
with open("WebQSP.train.json", "r") as fp:
    dataset = json.load(fp)

print(dataset.keys())

/data/lily/nos6/CL4Code/WebQSP/data
dict_keys(['Version', 'FreebaseVersion', 'Questions'])


In [3]:
# Look at the first question record: its field names, then the full record.
first_question = dataset['Questions'][0]
print(first_question.keys())
print(first_question)

dict_keys(['QuestionId', 'RawQuestion', 'ProcessedQuestion', 'Parses'])
{'QuestionId': 'WebQTrn-0', 'RawQuestion': 'what is the name of justin bieber brother?', 'ProcessedQuestion': 'what is the name of justin bieber brother', 'Parses': [{'ParseId': 'WebQTrn-0.P0', 'AnnotatorId': 1, 'AnnotatorComment': {'ParseQuality': 'Complete', 'QuestionQuality': 'Good', 'Confidence': 'Normal', 'FreeFormComment': 'First-round parse verification'}, 'Sparql': "PREFIX ns: <http://rdf.freebase.com/ns/>\nSELECT DISTINCT ?x\nWHERE {\nFILTER (?x != ns:m.06w2sn5)\nFILTER (!isLiteral(?x) OR lang(?x) = '' OR langMatches(lang(?x), 'en'))\nns:m.06w2sn5 ns:people.person.sibling_s ?y .\n?y ns:people.sibling_relationship.sibling ?x .\n?x ns:people.person.gender ns:m.05zppz .\n}\n", 'PotentialTopicEntityMention': 'justin bieber', 'TopicEntityName': 'Justin Bieber', 'TopicEntityMid': 'm.06w2sn5', 'InferentialChain': ['people.person.sibling_s', 'people.sibling_relationship.sibling'], 'Constraints': [{'Operator': 'Equ

In [19]:
# Show the parse lists of two sample questions.
print(dataset['Questions'][0]['Parses'])
print(dataset['Questions'][2]['Parses'])

# Show the second parse of the first question that has more than one parse.
# Requiring "> 1" (rather than "!= 1") keeps a question with zero parses from
# triggering an IndexError on the [1] lookup.
for question in dataset['Questions']:
    if len(question['Parses']) > 1:
        print(question['Parses'][1])
        break

# Field names available on a single parse.
print(dataset['Questions'][0]['Parses'][0].keys())

[{'ParseId': 'WebQTrn-0.P0', 'AnnotatorId': 1, 'AnnotatorComment': {'ParseQuality': 'Complete', 'QuestionQuality': 'Good', 'Confidence': 'Normal', 'FreeFormComment': 'First-round parse verification'}, 'Sparql': "PREFIX ns: <http://rdf.freebase.com/ns/>\nSELECT DISTINCT ?x\nWHERE {\nFILTER (?x != ns:m.06w2sn5)\nFILTER (!isLiteral(?x) OR lang(?x) = '' OR langMatches(lang(?x), 'en'))\nns:m.06w2sn5 ns:people.person.sibling_s ?y .\n?y ns:people.sibling_relationship.sibling ?x .\n?x ns:people.person.gender ns:m.05zppz .\n}\n", 'PotentialTopicEntityMention': 'justin bieber', 'TopicEntityName': 'Justin Bieber', 'TopicEntityMid': 'm.06w2sn5', 'InferentialChain': ['people.person.sibling_s', 'people.sibling_relationship.sibling'], 'Constraints': [{'Operator': 'Equal', 'ArgumentType': 'Entity', 'Argument': 'm.05zppz', 'EntityName': 'Male', 'SourceNodeIndex': 1, 'NodePredicate': 'people.person.gender', 'ValueType': 'String'}], 'Time': None, 'Order': None, 'Answers': [{'AnswerType': 'Entity', 'Answe

In [27]:
def _replace_mid(query, mid, name):
    """Replace each whole ``ns:<mid>`` token in ``query`` with ``name``.

    A plain ``str.replace`` would also rewrite the front of a longer
    identifier that merely starts with ``mid`` (``ns:m.01`` inside
    ``ns:m.01abc``); the negative lookahead rules that out.
    """
    import re

    pattern = re.escape('ns:' + mid) + r'(?![A-Za-z0-9_])'
    # A callable replacement keeps any backslashes in ``name`` literal.
    return re.sub(pattern, lambda _match: name, query)


def _named_entity_query(parse):
    """Return the parse's SPARQL with ``ns:<id>`` tokens swapped for entity names."""
    query = parse['Sparql']
    # Substitution needs both the id and the name; skip it when either is missing.
    if parse['TopicEntityMid'] is not None and parse['TopicEntityName'] is not None:
        query = _replace_mid(query, parse['TopicEntityMid'], parse['TopicEntityName'])
    for constraint in parse['Constraints']:
        # A None or empty EntityName means there is nothing to substitute.
        if constraint['EntityName']:
            query = _replace_mid(query, constraint['Argument'], constraint['EntityName'])
    return query


def load_all_items(dataset, train, NE=False, out_dir='../../cl/data', train_frac=0.8):
    """Convert a loaded WebQSP split into instances and write them to JSON.

    Each question with at least one parse becomes a dict with the keys
    ``question`` (RawQuestion), ``task_id`` (QuestionId), ``dsl_code`` (one
    SPARQL string per accepted parse) and ``answer`` (de-duplicated answer
    EntityName values in first-seen order). Only parses annotated with
    QuestionQuality 'Good' and ParseQuality 'Complete' are accepted; a
    question whose parses are all rejected is still emitted, with empty
    lists. Questions with no parses are dropped.

    Args:
        dataset: Parsed WebQSP JSON; must contain a 'Questions' list.
        train: If true, split the instances in their original order (no
            shuffling) into train/validation parts and write
            ``webqsp_train`` and ``webqsp_val``; otherwise write everything
            to ``webqsp_test``.
        NE: If true, replace ``ns:<id>`` tokens of the topic entity and of
            named constraints with their entity names, and append ``_NE`` to
            the output file names.
        out_dir: Directory the JSON files are written to. It must already
            exist.
        train_frac: Fraction of instances placed in the training file when
            ``train`` is true; the remainder goes to the validation file.

    Returns:
        The full instance list when ``train`` is false, otherwise ``None``.
    """
    import os

    instance_list = []
    for item in dataset['Questions']:
        # Questions without any parse are left out entirely.
        if len(item['Parses']) == 0:
            continue

        instance = {
            'question': item['RawQuestion'],
            'task_id': item['QuestionId'],
            'dsl_code': [],
            'answer': [],
        }
        for parse in item['Parses']:
            comment = parse['AnnotatorComment']
            # Per the original author's note, this quality filter mirrors the
            # evaluation script provided by Microsoft.
            if comment['QuestionQuality'] != 'Good' or comment['ParseQuality'] != 'Complete':
                continue

            instance['dsl_code'].append(_named_entity_query(parse) if NE else parse['Sparql'])

            for answer in parse['Answers']:
                if answer['EntityName'] not in instance['answer']:
                    instance['answer'].append(answer['EntityName'])

        instance_list.append(instance)

    print(len(instance_list))

    suffix = '_NE' if NE else ''

    def _dump(stem, items):
        # Write one output file, e.g. <out_dir>/webqsp_train_NE.json.
        with open(os.path.join(out_dir, stem + suffix + '.json'), 'w') as f:
            json.dump(items, f)

    if train:
        cut = int(len(instance_list) * train_frac)
        train_set, val_set = instance_list[:cut], instance_list[cut:]
        print(len(train_set), len(val_set))
        _dump('webqsp_train', train_set)
        _dump('webqsp_val', val_set)
        # Train mode intentionally returns nothing (keeps the notebook cell quiet).
        return None

    _dump('webqsp_test', instance_list)
    return instance_list

    

# Write the train/validation files twice: raw SPARQL first, then the entity-name variant.
for use_entity_names in (False, True):
    load_all_items(dataset, train=True, NE=use_entity_names)

# with open('../../cl/data/webqsp_train.json', 'w+') as f: 
#     json.dump(instance_list, f)
# with open('../../cl/data/webqsp_val.json', 'w+') as f: 
#     json.dump(val_set, f)

3098
2478 620
3098
2478 620


In [28]:
# Load the WebQSP test split; the context manager closes the file once it is parsed.
with open("WebQSP.test.json", "r") as fp:
    test_dataset = json.load(fp)

# Write the test file twice: raw SPARQL first, then the entity-name variant.
load_all_items(test_dataset, train=False)
load_all_items(test_dataset, train=False, NE=True)
# with open('../../cl/data/webqsp_test.json', 'w+') as f: 
#     json.dump(test_list, f)

1639
1639


[{'question': 'what does jamaican people speak?',
  'task_id': 'WebQTest-0',
  'dsl_code': ["PREFIX ns: <http://rdf.freebase.com/ns/>\nSELECT DISTINCT ?x\nWHERE {\nFILTER (?x != Jamaica)\nFILTER (!isLiteral(?x) OR lang(?x) = '' OR langMatches(lang(?x), 'en'))\nJamaica ns:location.country.languages_spoken ?x .\n}\n",
   "PREFIX ns: <http://rdf.freebase.com/ns/>\nSELECT DISTINCT ?x\nWHERE {\nFILTER (?x != Jamaica)\nFILTER (!isLiteral(?x) OR lang(?x) = '' OR langMatches(lang(?x), 'en'))\nJamaica ns:location.country.official_language ?x .\n}\n"],
  'answer': ['Jamaican English', 'Jamaican Creole English Language']},
 {'question': 'what did james k polk do before he was president?',
  'task_id': 'WebQTest-1',
  'dsl_code': ['#MANUAL SPARQL\nPREFIX ns: <http://rdf.freebase.com/ns/>\nSELECT DISTINCT ?x\nWHERE {\n  {\n    SELECT ?pFrom \n    WHERE {\n      James K. Polk ns:government.politician.government_positions_held ?y . # James K. Polk\n      ?y ns:government.government_position_held.offi