In [1]:
import pandas as pd
import numpy as np
import re
import random
import warnings; warnings.simplefilter('ignore')
import operator
import json
from SPARQLWrapper import SPARQLWrapper, JSON
from sklearn.model_selection import train_test_split
import time

In [3]:
wikidata_endpoint = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(wikidata_endpoint, agent='sparqlwrapper 1.8.6.dev0 (rdflib.github.io/sparqlwrapper)')

In [4]:
l2_train_df = pd.read_json('../data_files/lcq_with_entities/lcquad2.1_with_mentions_train_18April.json')
l2_test_df = pd.read_json('../data_files/lcq_with_entities/lcquad2.1_with_mentions_test_18April.json')

In [5]:
l2_test_df.head(2)

Unnamed: 0,uid,NNQT_question,question,sparql_wikidata,sparql_dbpedia18,template,template_id,paraphrased_question,type,answer,entity_mentions_question,entity_mentions_question_paraphrased,entites
0,20258,What is the {country} for {head of state} of {...,Who is the country for head of state of Mahmo...,select distinct ?sbj where { ?sbj wdt:P35 wd:...,select distinct ?subj where { ?statement <http...,<?S P O ; ?S InstanceOf Type>,10,What country is Mahmoud Abbas the head of stat...,test,,"[{'label': 'country', 'span': '{country}', 'sc...","[{'label': 'country', 'span': 'country', 'scor...",[mahmoud abbas]
1,7141,What is {population} of {Somalia} that is {poi...,What was the population of Somalia in 2009-0-0?,SELECT ?obj WHERE { wd:Q1045 p:P1082 ?s . ?s p...,select distinct ?obj where {\n?statement <htt...,(E pred F) prop ?value,7,"As of 2009, how many people lived in Somalia?",test,,"[{'label': 'population', 'span': 'population',...","[{'label': 'population', 'span': 'people lived...",[somalia]


In [7]:
l2_train_sel_df = l2_train_df[['uid','question','sparql_wikidata','entites']]
l2_test_sel_df = l2_test_df[['uid','question','sparql_wikidata','entites']]

In [8]:
l2_test_sel_df.head(2)

Unnamed: 0,uid,question,sparql_wikidata,entites
0,20258,Who is the country for head of state of Mahmo...,select distinct ?sbj where { ?sbj wdt:P35 wd:...,[mahmoud abbas]
1,7141,What was the population of Somalia in 2009-0-0?,SELECT ?obj WHERE { wd:Q1045 p:P1082 ?s . ?s p...,[somalia]


In [9]:
def get_pred(qry):
    qry_arr = qry.split(' ')
    tmp_lst = list(qry_arr)
    lst_words = [x.replace('.','').replace('}','').replace('{}','') for x in tmp_lst]
    lst_pred = [x.split(':')[-1] for x in lst_words if 'wdt:' in x or 'p:' in x or 'ps:' in x or 'pq' in x]
    new_lst = []
    for item in lst_pred:
        if not item in new_lst:
           new_lst.append(item)
    return new_lst

In [10]:
#adding a column which has list of predicates being used in the dataset
l2_train_sel_df['pred'] = l2_train_sel_df['sparql_wikidata'].apply(lambda x: get_pred(x))
l2_test_sel_df['pred'] = l2_test_sel_df['sparql_wikidata'].apply(lambda x: get_pred(x))

In [11]:
l2_train_sel_df.head(2)

Unnamed: 0,uid,question,sparql_wikidata,entites,pred
0,19719,What periodical literature does Delta Air Line...,select distinct ?obj where { wd:Q188920 wdt:P...,"[periodical literature, delta air lines]","[P2813, P31]"
1,15554,Who is the child of Ranavalona I's husband?,SELECT ?answer WHERE { wd:Q169794 wdt:P26 ?X ....,[ranavalona i],"[P26, P22]"


In [10]:
qry1 ="""SELECT ?predLabel 
WHERE
{
  ?pred wikibase:directClaim wdt:"""

qry2 ="""  .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 
}
LIMIT 1"""

In [11]:
#function to get relation labels 
def fetch_rel(qry1, qry2, lst_pred_ids, st_predLabels):
    lst_pred_labels_new = []
    if len(lst_pred_ids) > 0:
       for item in lst_pred_ids:
           qry = qry1 + item + qry2
           try:
             sparql.setQuery(qry)
             sparql.setReturnFormat(JSON)
             results = sparql.query().convert() 
             for item in results['results']['bindings']: 
                 if len(item['predLabel']['value'].lower().strip()) > 0:
                    st_predLabels = st_predLabels + str(item['predLabel']['value'].lower().strip()) + '; '
                 else:
                    st_predLabels = st_predLabels + ' '
           except:
             st_predLabels = ''
             pass
    else:
       st_predLabels = ''
    if st_predLabels != '':
       st_predLabels = st_predLabels[:-2]
       st_arr = st_predLabels.split(';')
       for split_lbl in st_arr:
           if len(split_lbl)>0: 
              lst_pred_labels_new.append(split_lbl.strip())
       return lst_pred_labels_new
    else:
       return lst_pred_labels_new

In [12]:
l2_test_sel_df['predLabels'] = l2_test_sel_df['pred'].apply(lambda x: fetch_rel(qry1, qry2, x, ''))

In [13]:
l2_test_sel_df.to_json('../data_files/lcq2_test_with_ent_rel.json', orient='records', indent=len(l2_test_sel_df.columns))

In [14]:
l2_train_sel_df['predLabels'] = l2_train_sel_df['pred'].apply(lambda x: fetch_rel(qry1, qry2, x, ''))

In [15]:
l2_train_sel_df.to_json('../data_files/lcq2_train_with_ent_rel.json', orient='records', indent=len(l2_train_sel_df.columns))

### Annotations for Rony 25th Nov 2021

In [12]:
l2_rony_train_df = pd.read_json('../data_files/lcq2_MT_annot_train.json')
l2_rony_test_df = pd.read_json('../data_files/lcq2_MT_annot_test.json')

In [14]:
l2_rony_test_df.head(2)

Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template,entities,predLabels,annotated_question
0,20258,Who is the country for head of state of Mahmo...,select distinct ?sbj where { ?sbj wdt:P35 wd:...,10,WP VBZ DT NN IN NN IN NN IN NNP NNP,who is the country for head of state of [Enti...,Who/WP is/VBZ the/DT country/NN for/IN head/NN...,subject < head of state > [ mahmoud abbas ] . ...,[mahmoud abbas],"[head of state, instance of]",[mahmoud abbas] <head of state> <instance of> ...
1,7141,What was the population of Somalia in 2009-0-0?,SELECT ?obj WHERE { wd:Q1045 p:P1082 ?s . ?s p...,7,WP VBD DT NN IN NNP IN CD,what was the population of [Entity1] in 2009-0-0?,What/WP was/VBD the/DT population/NN of/IN Som...,[ somalia ] < population > ?s . ?s < populatio...,[somalia],"[population, point in time]",[somalia] <population> <point in time> What wa...


In [15]:
l2_train_sel_df.head(2)

Unnamed: 0,uid,question,sparql_wikidata,entites,pred
0,19719,What periodical literature does Delta Air Line...,select distinct ?obj where { wd:Q188920 wdt:P...,"[periodical literature, delta air lines]","[P2813, P31]"
1,15554,Who is the child of Ranavalona I's husband?,SELECT ?answer WHERE { wd:Q169794 wdt:P26 ?X ....,[ranavalona i],"[P26, P22]"


In [26]:
def concat_predLabels_mit_pred(l2_rony_df,l2_sel_df):
    lst_new_predLabels = []
    for idx,row in l2_rony_df.iterrows():
        uid = row['uid']
        temp_l2_sel_df = l2_sel_df.loc[l2_sel_df['uid']==uid]
        lst = []
        for l,c in zip(row['predLabels'],temp_l2_sel_df['pred'].values[0]):
            st = str(l) + ': ' + c
            lst.append(st)
        lst_new_predLabels.append(lst)
    l2_rony_df['newPredLabels'] = lst_new_predLabels   
    return l2_rony_df

In [31]:
new_l2_rony_test_df = concat_predLabels_mit_pred(l2_rony_test_df,l2_test_sel_df) 
new_l2_rony_test_df.tail(1)

Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template,entities,predLabels,annotated_question,newPredLabels
5968,29372,What Theoi Project ID does Manticore has?,select distinct ?answer where { wd:Q223795 wdt...,22,WP NNP NNP NNP VBZ NNP VBZ,what theoi project id does [Entity1] has?,What/WP Theoi/NNP Project/NNP ID/NNP does/VBZ ...,[ manticore ] < theoi project id > answer,[],[theoi project id],<theoi project id> What Theoi Project ID does ...,[theoi project id: P3545]


In [32]:
new_l2_rony_train_df = concat_predLabels_mit_pred(l2_rony_train_df,l2_train_sel_df) 
new_l2_rony_train_df.tail(1)

Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template,entities,predLabels,annotated_question,newPredLabels
23885,22794,For which film did Anil Kapoor win a Screen Ac...,select distinct ?sbj where { ?sbj wdt:P1346 w...,10,IN WDT NN VBD NNP NNP VBP DT JJ NNP NNP NNP,for which film did [Entity1] win a screen acto...,For/IN which/WDT film/NN did/VBD Anil/NNP Kapo...,subject < winner > [ anil kapoor ] . subject <...,[anil kapoor],"[winner, instance of]",[anil kapoor] <winner> <instance of> For which...,"[winner: P1346, instance of: P31]"


In [33]:
new_l2_rony_train_df.to_json('../data_files/lcq2_train_annot_rony_25thNov.json', orient='records', indent=len(new_l2_rony_train_df.columns))
new_l2_rony_test_df.to_json('../data_files/lcq2_test_annot_rony_25thNov.json', orient='records', indent=len(new_l2_rony_test_df.columns))

#### Now fetch entities and their respective labels

In [34]:
def get_ent(qry):
    qry_arr = qry.split(' ')
    tmp_lst = list(qry_arr)
    lst_words = [x.replace('.','').replace('}','').replace('{}','') for x in tmp_lst]
    lst_ent = [x.split(':')[-1] for x in lst_words if 'wd:' in x]
    return lst_ent

In [38]:
new_l2_rony_test_df['ent_id'] = new_l2_rony_test_df['sparql_wikidata'].apply(lambda x: get_ent(x))
new_l2_rony_train_df['ent_id'] = new_l2_rony_train_df['sparql_wikidata'].apply(lambda x: get_ent(x))

In [39]:
new_l2_rony_train_df.head(2)

Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template,entities,predLabels,annotated_question,newPredLabels,ent_id
0,19719,What periodical literature does Delta Air Line...,select distinct ?obj where { wd:Q188920 wdt:P...,5,WP JJ NN VBZ NNP NNP NNPS NN IN DT NN,what [Entity1] does [Entity2] use as a moutpiece?,What/WP periodical/JJ literature/NN does/VBZ D...,[ delta air lines ] < house publication > obje...,"[periodical literature, delta air lines]","[house publication, instance of]",[periodical literature] [delta air lines] <hou...,"[house publication: P2813, instance of: P31]","[Q188920, Q1002697]"
1,15554,Who is the child of Ranavalona I's husband?,SELECT ?answer WHERE { wd:Q169794 wdt:P26 ?X ....,18,WP VBZ DT NN IN NNP PRP POS NN,who is the child of [Entity1]'s husband?,Who/WP is/VBZ the/DT child/NN of/IN Ranavalona...,[ ranavalona i of madagascar ] < spouse > ?x ....,[ranavalona i],"[spouse, father]",[ranavalona i] <spouse> <father> Who is the ch...,"[spouse: P26, father: P22]",[Q169794]


In [44]:
qry_pt1 = 'select distinct ?label where { wd:'
qry_pt2 = ' rdfs:label ?label . FILTER (langMatches( lang(?label), "EN" ) ) } LIMIT 1'

In [42]:
#function to fetch answer from wikidata endpoint
def fetch_label(qry):
    res = ''
    sparql.setQuery(qry)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert() 
    for item in results['results']['bindings']: 
        res = item['label']['value'].lower().strip()
    return res

In [63]:
def fetch_lst_entites_for_df(df,qry_pt1,qry_pt2):
    excp_ctr = 0
    try:
       ent_list_for_df = []
       for idx,row in df.iterrows():
           #list of answer labels for this particular question block
           temp_lst_labels = []
           if len(row['ent_id']) > 0:
              for item in row['ent_id']:
                  qry =  qry_pt1 + item + qry_pt2
                  curr_label = fetch_label(qry)    
                  temp_lst_labels.append(curr_label+': '+item)        
           ent_list_for_df.append(temp_lst_labels)
    except:
       print('exception for ent id ',item) 
       ent_list_for_df.append(temp_lst_labels)    
    print('No of exceptions: ',excp_ctr)
    return ent_list_for_df

In [64]:
new_l2_rony_test_df['new_LabelsEnt'] = fetch_lst_entites_for_df(new_l2_rony_test_df,qry_pt1,qry_pt2)
print(len(new_l2_rony_test_df))
new_l2_rony_test_df.tail(3)

No of exceptions:  0
5969


Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template,entities,predLabels,annotated_question,newPredLabels,ent_id,new_LabelsEnt
5966,16961,What is antonym of of spore print color of Ple...,SELECT ?answer WHERE { wd:Q186451 wdt:P787 ?X ...,23,WP VBZ DT IN IN NN NN NN IN NNP NN,what is antonym of of spore print color of [En...,What/WP is/VBZ antonym/DT of/IN of/IN spore/NN...,[ pleurotus ostreatus ] < spore print color > ...,[pleurotus ostreatus],"[spore print color, opposite of]",[pleurotus ostreatus] <spore print color> <opp...,"[spore print color: P787, opposite of: P461]",[Q186451],[pleurotus ostreatus: Q186451]
5967,23859,Tell me mixture whose name has the word spirom...,SELECT DISTINCT ?sbj ?sbj_label WHERE { ?sbj w...,1,VB PRP NN WP NN VBZ DT NN NN IN PRP,tell me mixture whose name has the word spirom...,Tell/VB me/PRP mixture/NN whose/WP$ name/NN ha...,subject < instance of > [ mixture ] . subject ...,[],[instance of],<instance of> Tell me mixture whose name has t...,[instance of: P31],[Q169336],[mixture: Q169336]
5968,29372,What Theoi Project ID does Manticore has?,select distinct ?answer where { wd:Q223795 wdt...,22,WP NNP NNP NNP VBZ NNP VBZ,what theoi project id does [Entity1] has?,What/WP Theoi/NNP Project/NNP ID/NNP does/VBZ ...,[ manticore ] < theoi project id > answer,[],[theoi project id],<theoi project id> What Theoi Project ID does ...,[theoi project id: P3545],[Q223795],[manticore: Q223795]


In [65]:
new_l2_rony_test_df.to_json('../data_files/lcq2_test_annot_rony_entLbl_25thNov.json', orient='records', indent=len(new_l2_rony_test_df.columns))

In [66]:
new_l2_rony_train_df['new_LabelsEnt'] = fetch_lst_entites_for_df(new_l2_rony_train_df,qry_pt1,qry_pt2)
print(len(new_l2_rony_train_df))
new_l2_rony_train_df.tail(3)

No of exceptions:  0
23886


Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template,entities,predLabels,annotated_question,newPredLabels,ent_id,new_LabelsEnt
23883,29450,Which is the College de France professor ID (1...,select distinct ?answer where { wd:Q295393 wdt...,22,WDT VBZ DT NNP FW NNP NN NNP JJ IN NNP NNP,which is the college de france professor id (1...,Which/WDT is/VBZ the/DT College/NNP de/FW Fran...,[ marcel mauss ] < college de france professor...,[],[college de france professor id (1909-1939)],<college de france professor id (1909-1939)> W...,[college de france professor id (1909-1939): P...,[Q295393],[marcel mauss: Q295393]
23884,12241,"What type of people live in Fresno, California...",SELECT ?answer WHERE { wd:Q42807 wdt:P190 ?ans...,21,WP NN IN NNS VBP IN NNP NNP POS NN NN,"what type of people live in fresno, california...",What/WP type/NN of/IN people/NNS live/VBP in/I...,[ nîmes ] < twinned administrative body > answ...,"[nîmes, category:people from fresno, california]","[twinned administrative body, category of asso...","[nîmes] [category:people from fresno, californ...","[twinned administrative body: P190, category o...","[Q42807, Q7118067]","[nîmes: Q42807, category:people from fresno, c..."
23885,22794,For which film did Anil Kapoor win a Screen Ac...,select distinct ?sbj where { ?sbj wdt:P1346 w...,10,IN WDT NN VBD NNP NNP VBP DT JJ NNP NNP NNP,for which film did [Entity1] win a screen acto...,For/IN which/WDT film/NN did/VBD Anil/NNP Kapo...,subject < winner > [ anil kapoor ] . subject <...,[anil kapoor],"[winner, instance of]",[anil kapoor] <winner> <instance of> For which...,"[winner: P1346, instance of: P31]","[Q313956, Q268200]","[anil kapoor: Q313956, screen actors guild awa..."


In [67]:
new_l2_rony_train_df.to_json('../data_files/lcq2_train_annot_rony_entLbl_25thNov.json', orient='records', indent=len(new_l2_rony_train_df.columns))

### End of changes for Rony's file 25th Nov 2021

#### To the files of MT train and test, append these additional columns of entities and relations and then add annot

In [1]:
l2_train_sel_df = pd.read_json('../data_files/lcq2_train_with_ent_rel.json')
l2_test_sel_df = pd.read_json('../data_files/lcq2_test_with_ent_rel.json')
l2_test_sel_df.head(5)

NameError: name 'pd' is not defined

In [17]:
def get_cnt_empty_lsts(lst):
    ctr = 0
    for item in lst: 
        if len(item) == 0:
           ctr += 1
    return ctr

In [20]:
print('total length of test dataframe is:',len(l2_test_sel_df))
print('total no of questions with empty entities in the test dataframe is: ',get_cnt_empty_lsts(list(l2_test_sel_df['entites'])))
print('total no of questions with empty predicate labels in the test dataframe is: ',get_cnt_empty_lsts(list(l2_test_sel_df['predLabels'])))

total length of test dataframe is: 5969
total no of questions with empty entities in the test dataframe is:  1351
total no of questions with empty predicate labels in the test dataframe is:  9


In [21]:
print('total length of train dataframe is:',len(l2_train_sel_df))
print('total no of questions with empty entities in the train dataframe is: ',get_cnt_empty_lsts(list(l2_train_sel_df['entites'])))
print('total no of questions with empty predicate labels in the train dataframe is: ',get_cnt_empty_lsts(list(l2_train_sel_df['predLabels'])))

total length of train dataframe is: 23886
total no of questions with empty entities in the train dataframe is:  5592
total no of questions with empty predicate labels in the train dataframe is:  26


### Bringing in Machine Translation LCQuAD dataset

In [59]:
l2_mt_train_df = pd.read_json('../data_files/lcq2_MT_train.json')
l2_mt_test_df = pd.read_json('../data_files/lcq2_MT_test.json')

In [23]:
l2_mt_test_df.head(2)

Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template
0,20258,Who is the country for head of state of Mahmo...,select distinct ?sbj where { ?sbj wdt:P35 wd:...,10,WP VBZ DT NN IN NN IN NN IN NNP NNP,who is the country for head of state of [Enti...,Who/WP is/VBZ the/DT country/NN for/IN head/NN...,subject < head of state > [ mahmoud abbas ] . ...
1,7141,What was the population of Somalia in 2009-0-0?,SELECT ?obj WHERE { wd:Q1045 p:P1082 ?s . ?s p...,7,WP VBD DT NN IN NNP IN CD,what was the population of [Entity1] in 2009-0-0?,What/WP was/VBD the/DT population/NN of/IN Som...,[ somalia ] < population > ?s . ?s < populatio...


In [60]:
def add_ent_rel_annot(df,df_mt):
    lst_entities = []
    lst_predLabels = []
    lst_annot_ques = []
    for idx,row in df_mt.iterrows():
        st_ent_annot = ''
        st_rel_annot = ''
        uid = row['uid']
        red_df = df.loc[df['uid']==uid]
        if not red_df.empty:
           for ent_item in red_df['entites'].values[0]:
               st_ent_annot += "[" + ent_item +"] "
               st_ent_annot = st_ent_annot.replace('\n',' ')
           for rel_item in red_df['predLabels'].values[0]:
               st_rel_annot += "<" + rel_item +"> "
               st_rel_annot = st_rel_annot.replace('\n',' ')
           tmp_st = str(red_df['question'].values[0]).replace('\n','')
           lst_annot_ques.append(st_ent_annot + st_rel_annot + tmp_st)
           lst_entities.append(red_df['entites'].values[0])
           lst_predLabels.append(red_df['predLabels'].values[0])
        else:
           lst_annot_ques.append(str(row['question']).replace('\n',''))
           lst_entities.append([])
           lst_predLabels.append([])
    df_mt['entities'] = lst_entities
    df_mt['predLabels'] = lst_predLabels
    df_mt['annotated_question'] = lst_annot_ques
    return df_mt

In [61]:
df_mt_test = add_ent_rel_annot(l2_test_sel_df,l2_mt_test_df)
df_mt_test.head(2)

Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template,entities,predLabels,annotated_question
0,20258,Who is the country for head of state of Mahmo...,select distinct ?sbj where { ?sbj wdt:P35 wd:...,10,WP VBZ DT NN IN NN IN NN IN NNP NNP,who is the country for head of state of [Enti...,Who/WP is/VBZ the/DT country/NN for/IN head/NN...,subject < head of state > [ mahmoud abbas ] . ...,[mahmoud abbas],"[head of state, instance of]",[mahmoud abbas] <head of state> <instance of> ...
1,7141,What was the population of Somalia in 2009-0-0?,SELECT ?obj WHERE { wd:Q1045 p:P1082 ?s . ?s p...,7,WP VBD DT NN IN NNP IN CD,what was the population of [Entity1] in 2009-0-0?,What/WP was/VBD the/DT population/NN of/IN Som...,[ somalia ] < population > ?s . ?s < populatio...,[somalia],"[population, point in time]",[somalia] <population> <point in time> What wa...


In [62]:
df_mt_test.to_json('../data_files/lcq2_MT_annot_test.json', orient='records', indent=len(df_mt_test.columns))

In [63]:
df_mt_train = add_ent_rel_annot(l2_train_sel_df,l2_mt_train_df)
df_mt_train.head(2)

Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template,entities,predLabels,annotated_question
0,19719,What periodical literature does Delta Air Line...,select distinct ?obj where { wd:Q188920 wdt:P...,5,WP JJ NN VBZ NNP NNP NNPS NN IN DT NN,what [Entity1] does [Entity2] use as a moutpiece?,What/WP periodical/JJ literature/NN does/VBZ D...,[ delta air lines ] < house publication > obje...,"[periodical literature, delta air lines]","[house publication, instance of]",[periodical literature] [delta air lines] <hou...
1,15554,Who is the child of Ranavalona I's husband?,SELECT ?answer WHERE { wd:Q169794 wdt:P26 ?X ....,18,WP VBZ DT NN IN NNP PRP POS NN,who is the child of [Entity1]'s husband?,Who/WP is/VBZ the/DT child/NN of/IN Ranavalona...,[ ranavalona i of madagascar ] < spouse > ?x ....,[ranavalona i],"[spouse, father]",[ranavalona i] <spouse> <father> Who is the ch...


In [64]:
df_mt_train.to_json('../data_files/lcq2_MT_annot_train.json', orient='records', indent=len(df_mt_train.columns))

#### Filtering out those questions where entities and relation labels are blank

In [2]:
df_mt_test = pd.read_json('../data_files/lcq2_MT_annot_test.json')
df_mt_train = pd.read_json('../data_files/lcq2_MT_annot_train.json')

In [3]:
print(len(df_mt_train))
print(len(df_mt_test))

23886
5969


In [36]:
df_mt_test.loc[df_mt_test['template_id'].isin([19])].head(5)    #5, 3, 10, 11, 12, 15, 16, 19

Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template,entities,predLabels,annotated_question
58,1261,Charles the Bald has how many noble titles?,SELECT (COUNT(?obj) AS ?value ) { wd:Q71231 wd...,19,NNP DT NNP VBZ WRB JJ JJ NNS,[Entity1] has how many noble titles?,Charles/NNP the/DT Bald/NNP has/VBZ how/WRB ma...,select (count(object) as value ) [ charles th...,[charles the bald],[noble title],[charles the bald] <noble title> Charles the B...
71,1436,What is the time-weighted average exposure lim...,SELECT (COUNT(?obj) AS ?value ) { wd:Q871 wdt:...,19,WP VBZ DT JJ JJ NN NN IN NN,what is the time-weighted average exposure lim...,What/WP is/VBZ the/DT time-weighted/JJ average...,select (count(object) as value ) [ arsenic ] ...,[arsenic],[time-weighted average exposure limit],[arsenic] <time-weighted average exposure limi...
72,1175,how many superpowers does wonder woman have?,SELECT (COUNT(?obj) AS ?value ) { wd:Q338430 w...,19,WRB JJ NNS VBZ JJR NN VB,how many superpowers does [Entity1] have?,how/WRB many/JJ superpowers/NNS does/VBZ wonde...,select (count(object) as value ) [ wonder wom...,[wonder woman],[superhuman feature or ability],[wonder woman] <superhuman feature or ability>...
80,1397,In French have a person in grammatical?,SELECT (COUNT(?obj) AS ?value ) { wd:Q150 wdt:...,19,IN NNP VBP DT NN IN JJ,in [Entity1] have a person in grammatical?,In/IN French/NNP have/VBP a/DT person/NN in/IN...,select (count(object) as value ) [ french ] <...,[french],[has grammatical person],[french] <has grammatical person> In French ha...
101,1584,How much does Primavera depict?,SELECT (COUNT(?obj) AS ?value ) { wd:Q549847 w...,19,WRB JJ VBZ VB VB,how much does [Entity1] depict?,How/WRB much/JJ does/VBZ Primavera/VB depict/V...,select (count(object) as value ) [ primavera ...,[primavera],[depicts],[primavera] <depicts> How much does Primavera ...


In [37]:
#filtering teh dataset to avoid complex questions  ... only have 3,5,11,15,19
df_mt_train = df_mt_train.loc[df_mt_train['template_id'].isin([3,5,11,15,19])]

In [38]:
len(df_mt_train)

4618

In [39]:
df_mt_train['annotated_question'] = df_mt_train['annotated_question'].apply(lambda x: str(x[:]).replace('\n', ' '))
#df_mt_test['annotated_question'] = df_mt_test['annotated_question'].apply(lambda x: str(x[:]).replace('\n', ' '))

In [4]:
df_mt_test.head(2)

Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template,entities,predLabels,annotated_question
0,20258,Who is the country for head of state of Mahmo...,select distinct ?sbj where { ?sbj wdt:P35 wd:...,10,WP VBZ DT NN IN NN IN NN IN NNP NNP,who is the country for head of state of [Enti...,Who/WP is/VBZ the/DT country/NN for/IN head/NN...,subject < head of state > [ mahmoud abbas ] . ...,[mahmoud abbas],"[head of state, instance of]",[mahmoud abbas] <head of state> <instance of> ...
1,7141,What was the population of Somalia in 2009-0-0?,SELECT ?obj WHERE { wd:Q1045 p:P1082 ?s . ?s p...,7,WP VBD DT NN IN NNP IN CD,what was the population of [Entity1] in 2009-0-0?,What/WP was/VBD the/DT population/NN of/IN Som...,[ somalia ] < population > ?s . ?s < populatio...,[somalia],"[population, point in time]",[somalia] <population> <point in time> What wa...


In [40]:
#put an indicator where entity or predLabels list is blank
def get_df_non_empty_ent_rel(df):
    lst_ent_pred_present = []
    for idx,row in df.iterrows(): 
        if len(row['entities']) == 0 or len(row['predLabels']) == 0:
           lst_ent_pred_present.append(0)
        else:
           lst_ent_pred_present.append(1)
    df['ent_pred_present'] = lst_ent_pred_present
    return df

In [6]:
df_mt_test_rev = get_df_non_empty_ent_rel(df_mt_test)
df_mt_test_rev = df_mt_test_rev.loc[df_mt_test_rev['ent_pred_present']==1]
print(len(df_mt_test_rev))

4615


In [41]:
df_mt_train_rev = get_df_non_empty_ent_rel(df_mt_train)
df_mt_train_rev = df_mt_train_rev.loc[df_mt_train_rev['ent_pred_present']==1]
print(len(df_mt_train_rev))

3338


In [8]:
test_df = df_mt_test_rev[['annotated_question','triples_template']]
test_df['data'] = test_df.apply(lambda row: (str(row['annotated_question']) + ' ' + '\t' + ' ' + str(row['triples_template'])), axis=1) 
test_df[['data']].to_csv('../data_files/lcq2_ann_MT_test.txt', header=None, index=None, mode='a')

In [47]:
#train_df = df_mt_train_rev[['annotated_question','triples_template','question']]
#train_df['data'] = train_df.apply(lambda row: (str(row['annotated_question']) + ' ' + '\t' + ' ' + str(row['triples_template'])), axis=1) 
train_df['data'] = train_df.apply(lambda row: (str(row['question']) + ' ' + '\t' + ' ' + str(row['triples_template'])), axis=1) 
train_df['data'].to_csv('../data_files/lcq2_ann_MT_train_23nov_q.txt', header=None, index=None, mode='a')

In [46]:
train_df[['question']].to_csv('../data_files/lcq2_ann_Q_MT_train_23nov_q.en.txt', header=None, index=None, sep='\t', mode='a')

In [43]:
train_df[['annotated_question']].to_csv('../data_files/lcq2_ann_Q_MT_train_23nov.en.txt', header=None, index=None, sep='\t', mode='a')
train_df[['triples_template']].to_csv('../data_files/lcq2_ann_T_MT_train_23nov.sparql.txt', header=None, index=None, sep='\t', mode='a')

In [14]:
print(len(train_df['data']))

18287


In [12]:
print(len(train_df[['triples_template']]))

18287


In [29]:
print(df_mt_train_rev.iloc[3534])

uid                                                               12992
question              "What is  Hiroshima Prefecture'S borders, that...
sparql_wikidata       SELECT ?answer WHERE { wd:Q617375 wdt:P47 ?ans...
template_id                                                          21
question_pos          WP VBZ NNP NNP POS NNS WDT VBZ NNP NNP IN NN I...
ent_masked_ques       "what is  [Entity1]'s borders, that has [Entit...
question_with_pos     / What/WP is/VBZ Hiroshima/NNP Prefecture/NNP'...
triples_template      [ hiroshima prefecture ] < shares border with ...
annotated_question    [hiroshima prefecture] [tsugumasa muraoka] <sh...
entities                      [hiroshima prefecture, tsugumasa muraoka]
predLabels                     [shares border with, head of government]
con_ques_tpl          [hiroshima prefecture] [tsugumasa muraoka] <sh...
ent_pred_present                                                      1
Name: 4561, dtype: object


### 31st Dec'21: Question length of different types of questions

In [33]:
df_mt_test = pd.read_json('../annotatedLabels_25Nov/lcq2_test_annot_25thNov21.json')
df_mt_train = pd.read_json('../annotatedLabels_25Nov/lcq2_train_annot_25thNov21.json')

In [34]:
df_mt_test.head(2)

Unnamed: 0,uid,question,sparql_wikidata,template_id,question_pos,ent_masked_ques,question_with_pos,triples_template,entities,predLabels,annotated_question,newPredLabels,ent_id,new_LabelsEnt
0,20258,Who is the country for head of state of Mahmo...,select distinct ?sbj where { ?sbj wdt:P35 wd:...,10,WP VBZ DT NN IN NN IN NN IN NNP NNP,who is the country for head of state of [Enti...,Who/WP is/VBZ the/DT country/NN for/IN head/NN...,subject < head of state > [ mahmoud abbas ] . ...,[mahmoud abbas],"[head of state, instance of]",[mahmoud abbas] <head of state> <instance of> ...,"[head of state: P35, instance of: P31]","[Q127998, Q6256]","[mahmoud abbas: Q127998, country: Q6256]"
1,7141,What was the population of Somalia in 2009-0-0?,SELECT ?obj WHERE { wd:Q1045 p:P1082 ?s . ?s p...,7,WP VBD DT NN IN NNP IN CD,what was the population of [Entity1] in 2009-0-0?,What/WP was/VBD the/DT population/NN of/IN Som...,[ somalia ] < population > ?s . ?s < populatio...,[somalia],"[population, point in time]",[somalia] <population> <point in time> What wa...,"[population: P1082, point in time: P585]",[Q1045],[somalia: Q1045]


In [35]:
df_mt_test['cnt_ques_chr_fr_tmpl'] = df_mt_test.question.apply(lambda x: len(x))
print(min(df_mt_test['cnt_ques_chr_fr_tmpl']))
df_mt_train['cnt_ques_chr_fr_tmpl'] = df_mt_train.question.apply(lambda x: len(x))
print(min(df_mt_train['cnt_ques_chr_fr_tmpl']))

11
10


In [47]:
#string_ques_df, temp_id 1,2,3,4
string_ques_test_df = df_mt_test.loc[df_mt_test['template_id'].isin([1,2,3,4])]
string_ques_train_df = df_mt_train.loc[df_mt_train['template_id'].isin([1,2,3,4])]
string_ques_test_df.sort_values(by=['cnt_ques_chr_fr_tmpl'], inplace=True)
string_ques_train_df.sort_values(by=['cnt_ques_chr_fr_tmpl'], inplace=True)
print(string_ques_test_df.question.head(5))
#print(string_ques_train_df.question.head(5))

4906         WHAT IS SCHISM STARTS WITH S
434      which annual event starts with t
3452     which tagma starts with letter a
3950    What FFH habitat starts with "v"?
5729    which type of value starts with b
Name: question, dtype: object


In [46]:
#two_hop_ques_df, temp_id 18,21,23
two_hop_ques_test_df = df_mt_test.loc[df_mt_test['template_id'].isin([18,21,23])]
two_hop_ques_train_df = df_mt_train.loc[df_mt_train['template_id'].isin([18,21,23])]
two_hop_ques_test_df.sort_values(by=['cnt_ques_chr_fr_tmpl'], inplace=True)
two_hop_ques_train_df.sort_values(by=['cnt_ques_chr_fr_tmpl'], inplace=True)
print(two_hop_ques_test_df.question.head(5))
#print(two_hop_ques_train_df.question.head(15))

2340            What is it?
2747            What is it?
4767            What is it?
4464           Governmental
138     Who wrote Trishira?
Name: question, dtype: object


In [48]:
#boolean_df, temp_id 15,16,17
bool_test_df = df_mt_test.loc[df_mt_test['template_id'].isin([15,16,17])]
bool_train_df = df_mt_train.loc[df_mt_train['template_id'].isin([15,16,17])]
bool_test_df.sort_values(by=['cnt_ques_chr_fr_tmpl'], inplace=True)
bool_train_df.sort_values(by=['cnt_ques_chr_fr_tmpl'], inplace=True)
print(bool_test_df.question.head(5))
#print(bool_train_df.question.head(15))

3048        Is Ringo Starr a drummer?
2528        Did the Lion make a roar?
5403     Is 0.0 the minimum age of G?
4053     Is Deepika Padukone a model?
1781    Does 0 mean the same as zero?
Name: question, dtype: object


In [49]:
#qual_df, temp_id 6,7
qual_test_df = df_mt_test.loc[df_mt_test['template_id'].isin([6,7])]
qual_train_df = df_mt_train.loc[df_mt_train['template_id'].isin([6,7])]
qual_test_df.sort_values(by=['cnt_ques_chr_fr_tmpl'], inplace=True)
qual_train_df.sort_values(by=['cnt_ques_chr_fr_tmpl'], inplace=True)
print(qual_test_df.question.head(5))
#print(qual_train_df.question.head(15))

282          When did Pausanias die?
3820        What is Ruby in English?
5539       When did Philopoemen die?
1924     When beer started in Egypt?
1273    What is Madonna's real name?
Name: question, dtype: object


In [50]:
#two_int_ques_df tmp_id 8,9,24
two_int_test_df = df_mt_test.loc[df_mt_test['template_id'].isin([8,9,24])]
two_int_train_df = df_mt_train.loc[df_mt_train['template_id'].isin([8,9,24])]
two_int_test_df.sort_values(by=['cnt_ques_chr_fr_tmpl'], inplace=True)
two_int_train_df.sort_values(by=['cnt_ques_chr_fr_tmpl'], inplace=True)
print(two_int_test_df.question.head(5))
#print(two_int_train_df.question.head(15))

4154        What is another name for sex?
5004      When was Sandra Bernhardt born?
1174     Which was the student of Cicero?
1601     Where did Joseph Goebbels study?
5042    When did John Ruskin get married?
Name: question, dtype: object


In [None]:
#i.e. shortest 2 int ques - "Who was Poseidon mother and child?"