In [85]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
from tqdm import tqdm

In [86]:
# Load the pre-processed QALD-9 test and train splits (2021-04-19 snapshot).
df_test, df_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../../data/qald-9-preprocess/2021-04-19/test.csv',
        '../../../data/qald-9-preprocess/2021-04-19/train.csv',
    )
)

In [87]:
# Drop the columns that are not used further down in this notebook.
# `drop(..., errors='ignore')` builds a new frame and tolerates columns that
# are already gone, so this cell can be re-run without raising KeyError
# (the previous `del` statements failed on a second execution).
unused_cols = ['id', 'orig_query', 'answers']
df_test = df_test.drop(columns=unused_cols, errors='ignore')
df_train = df_train.drop(columns=unused_cols, errors='ignore')

In [88]:
# Preview the first two rows of the test split.
df_test.head(n=2)

Unnamed: 0,question,sparql
0,What is the time zone of Salt Lake City?,SELECT DISTINCT ?uri WHERE { res:Salt_Lake_City onto:timeZone ?uri }
1,Who killed Caesar?,SELECT DISTINCT ?uri WHERE { ?uri dct:subject dbc:Assassins_of_Julius_Caesar }


In [89]:
# Load the namespace table, indexed by its second column (position 1), and
# collapse the single remaining data column into a Series.
# `read_csv(squeeze=True)` is deprecated since pandas 1.4 and removed in 2.0;
# `.squeeze('columns')` is the documented replacement and also works on
# older pandas versions.
df_prefix = pd.read_csv(
    '../../../data/qald-9-preprocess/2021-04-19/prefix.csv', index_col=1
).squeeze('columns')

In [90]:
# Plain dict keyed by namespace URI with the short prefix as value
# (see the displayed mapping in the next cell).
prefix_dict = df_prefix.to_dict()

In [91]:
# Display the namespace URI -> prefix mapping for inspection.
prefix_dict

{'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf:',
 'http://dbpedia.org/ontology/': 'onto:',
 'http://dbpedia.org/property/': 'dbpedia2:',
 'http://dbpedia.org/resource/': 'res:',
 'http://www.w3.org/2004/02/skos/core#': 'skos:',
 'http://purl.org/dc/terms/': 'dct:',
 'http://xmlns.com/foaf/0.1/': 'foaf:',
 'http://dbpedia.org/resource/Category:': 'dbc:',
 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs:',
 'http://dbpedia.org/class/yago/': 'yago:',
 'http://www.w3.org/2002/07/owl#': 'owl:',
 'http://www.w3.org/2001/XMLSchema#': 'xsd:',
 'http://dbpedia.org/': 'db:'}

In [92]:
# Ordered textual rewrites for SPARQL queries: selected keywords are
# lower-cased, curly braces become square brackets, and the short variable
# names ?sbj / ?obj are spelled out.
rep_dict = dict(
    [
        ("ASK", "ask"),
        ("WHERE", "where"),
        ("SELECT", "select"),
        ("{", "["),
        ("}", "]"),
        ("DISTINCT", "distinct"),
        ("ORDER", "order"),
        ("LIMIT", "limit"),
        ("FILTER", "filter"),
        ("?sbj", "?subject"),
        ("?obj", "?object"),
    ]
)

# Rewrites that simply delete curly braces.
rep_dict2 = dict.fromkeys(("{", "}"), "")


def replace_all(text, dict):
    """Apply every ``old -> new`` substitution from ``dict`` to ``text``.

    Substitutions run sequentially in the mapping's iteration order, so a
    later entry sees the result of all earlier ones.

    Parameters
    ----------
    text : str
        The string to rewrite.
    dict : mapping of str to str
        Substrings to search for, mapped to their replacements.

    Returns
    -------
    str
        ``text`` after all substitutions have been applied.
    """
    result = text
    for old, new in dict.items():
        result = result.replace(old, new)
    return result


## Goal: trim + replace/lowercase + remove weirdly long questions + replace Q/P

In [93]:
def prepare(ds):
    """Convert question/SPARQL rows into a one-column ``translation`` frame.

    For each row, the SPARQL query has its namespace URIs abbreviated using
    ``prefix_dict`` and is then rewritten with ``rep_dict``. The question is
    trimmed, stripped of surrounding double quotes, lower-cased and rewritten
    with ``rep_dict2``.

    Parameters
    ----------
    ds : iterable of mappings
        Rows exposing ``'question'`` and ``'sparql'`` keys
        (e.g. a ``datasets.Dataset``).

    Returns
    -------
    pandas.DataFrame
        A frame with a single ``'translation'`` column whose cells are
        ``{'en': <question>, 'sparql': <query>}`` dicts and a default
        0..n-1 index. Rows that cannot be converted are reported on stdout
        and skipped.
    """
    col = 'translation'
    # Abbreviate the longest namespace first: otherwise a shorter namespace
    # (e.g. '.../resource/') is replaced before a longer one that starts with
    # it (e.g. '.../resource/Category:'), and the longer one never matches.
    # The sort is stable, so equally long URIs keep their original order.
    ordered_prefixes = dict(
        sorted(prefix_dict.items(), key=lambda item: len(item[0]), reverse=True)
    )
    # Collect rows in a list and build the frame once; DataFrame.append was
    # quadratic in a loop and no longer exists in pandas >= 2.0.
    translations = []
    for d in tqdm(ds):
        try:
            qry = replace_all(d['sparql'], ordered_prefixes)
            qry = replace_all(qry, rep_dict)
            q = d['question'].strip().strip("\"").lower()
            q = replace_all(q, rep_dict2)
        except Exception as exc:
            # Best effort: report the offending row and keep going. The old
            # handler concatenated a str with the row itself, which raised a
            # TypeError of its own instead of printing anything.
            print(f"err: {d!r} ({exc!r})")
            continue
        translations.append({'en': q, 'sparql': qry})
    return pd.DataFrame({col: translations})

In [94]:
# Wrap the trimmed frames as `Dataset` objects so `prepare` can iterate
# over their rows.
ds_test = Dataset.from_pandas(df_test)
ds_train = Dataset.from_pandas(df_train)

In [95]:
# Convert the test split into the single-column `translation` format.
df_test2 = prepare(ds_test)

100%|██████████| 150/150 [00:00<00:00, 707.68it/s]


In [96]:
# Convert the train split into the single-column `translation` format.
df_train2 = prepare(ds_train)

100%|██████████| 408/408 [00:00<00:00, 744.80it/s]


In [97]:
# Sanity check: row/column counts of the converted test and train frames,
# one per line.
print(df_test2.shape, df_train2.shape, sep='\n')

(150, 1)
(408, 1)


In [98]:
# Allow up to 100 characters per cell so more of each translation dict is
# visible, then preview the converted test split.
pd.set_option('display.max_colwidth', 100)
df_test2.head()

Unnamed: 0,translation
0,"{'en': 'what is the time zone of salt lake city?', 'sparql': 'select distinct ?uri where [ res:S..."
1,"{'en': 'who killed caesar?', 'sparql': 'select distinct ?uri where [ ?uri dct:subject dbc:Assass..."
2,"{'en': 'what is the highest mountain in germany?', 'sparql': 'select ?uri where [ ?uri rdf:type ..."
3,"{'en': 'which american presidents were in office during the vietnam war?', 'sparql': 'select ?ur..."
4,"{'en': 'butch otter is the governor of which u.s. state?', 'sparql': 'select distinct ?uri where..."


In [99]:
# Rebind ds_test / ds_train to datasets built from the converted frames;
# from here on they no longer refer to the raw question/sparql datasets.
ds_test = Dataset.from_pandas(df_test2)
ds_train = Dataset.from_pandas(df_train2)

In [100]:
# Bundle both splits into one DatasetDict under the 'train' / 'test' keys.
mother_ds = DatasetDict({'train': ds_train, 'test':ds_test})

In [101]:
# Persist the DatasetDict under ds_path.
ds_path='../../../data/dataset/qald-text-to-sparql'
mother_ds.save_to_disk(ds_path)

In [102]:
# Additionally write each converted split as CSV into the same directory.
df_train2.to_csv(f'{ds_path}/train.csv')
df_test2.to_csv(f'{ds_path}/test.csv')

In [103]:
# Spot-check the converted query of test row 10.
# NOTE(review): the recorded output still shows prefixed names wrapped in
# angle brackets (`<rdf:type>`) and upper-case `GROUP BY` / `DESC` / `COUNT` /
# `OFFSET` - confirm whether that is intended.
df_test2.iloc[10]['translation']['sparql']

'select ?uri where [ ?airline <rdf:type> <onto:Airline> . ?airline dbp:frequentFlyer ?uri. ] GROUP BY ?uri order BY DESC(COUNT(distinct ?airline)) OFFSET 0 limit 1'

In [104]:
# Original query of the same row, for comparison with the converted one.
df_test.iloc[10]['sparql']

'SELECT ?uri WHERE { ?airline <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Airline> . ?airline dbp:frequentFlyer ?uri. } GROUP BY ?uri ORDER BY DESC(COUNT(DISTINCT ?airline)) OFFSET 0 LIMIT 1'