In [1]:
import json
import sys
import argparse
import os

In [2]:
def get_nl_sql_pairs(filepath, splits, with_dbs=False):
    """Gets pairs of natural language and corresponding gold SQL for Michigan.

    TODO: This is Google code. Add LICENSE.

    From the XSP codebase.

    Args:
        filepath: Path to a JSON file containing a list of query entries. Each
            entry is read for 'sql' (a list of SQL variants, only the first is
            used) and 'sentences' (a list of dicts with 'text',
            'question-split', 'variables' and, when `with_dbs` is set,
            'table-id').
        splits: Container of 'question-split' values to keep; sentences whose
            split is not in it are skipped.
        with_dbs: When True, each returned tuple also carries the sentence's
            'table-id' as a third element.

    Returns:
        A list of (nl, sql) tuples, or (nl, sql, table_id) tuples when
        `with_dbs` is True. `nl` is lower-cased; `sql` keeps its original case.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which would garble non-ASCII values on some systems.
    with open(filepath, encoding='utf-8') as infile:
        data = json.load(infile)

    pairs = list()

    # The tag is the file name up to its first '.', e.g. 'geoquery.json' ->
    # '[geoquery]'. basename handles every separator the platform accepts.
    tag = '[' + os.path.basename(filepath).split('.')[0] + ']'
    print('Getting examples with tag ' + tag)

    # The UMichigan data is split by anonymized queries, where values are
    # anonymized but table/column names are not. However, our experiments are
    # performed on the original splits of the data.
    for query in data:
        # Take the first SQL query only. From their Github documentation:
        # "Note - we only use the first query, but retain the variants for
        #  completeness"
        anonymized_sql = query['sql'][0]

        # It's also associated with a number of natural language examples, which
        # also contain anonymous tokens. Save the de-anonymized utterance and query.
        for example in query['sentences']:
            if example['question-split'] not in splits:
                continue

            nl = example['text']
            sql = anonymized_sql

            # Go through the anonymized values and replace them in both the natural
            # language and the SQL.
            #
            # It's very important to sort these in descending order. If one is a
            # substring of the other, it shouldn't be replaced first lest it ruin the
            # replacement of the superstring.
            for variable_name, value in sorted(
                    example['variables'].items(), key=lambda x: len(x[0]), reverse=True):
                if not value:
                    # TODO(alanesuhr) While the Michigan repo says to use a - here, the
                    # thing that works is using a % and replacing = with LIKE.
                    #
                    # It's possible that I should remove such clauses from the SQL, as
                    # long as they lead to the same table result. They don't align well
                    # to the natural language at least.
                    #
                    # See: https://github.com/jkkummerfeld/text2sql-data/tree/master/data
                    value = '%'

                nl = nl.replace(variable_name, value)
                sql = sql.replace(variable_name, value)

            # In the case that we replaced an empty anonymized value with %, make it
            # compilable by allowing equality with any string.
            sql = sql.replace('= "%"', 'LIKE "%"')
            nl = nl.lower()
            if with_dbs:
                pairs.append((nl, sql, example['table-id']))
            else:
                pairs.append((nl, sql))

    return pairs

In [3]:
# Directory the per-dataset '<name>/<name>.json' question files are read from.
data_dir = '../../../language/language/xsp/data'
# Directory the converted '<db>_origin.json' files are written to.
output_dir = '../../../featurestorage/data/spider-20200607'

In [39]:
# Load the first geography dev record to serve as a template example.
with open('../../../featurestorage/data/spider-20200607/geography_dev.json','r') as f:
    example = json.load(f)[0]

# Blank out the parsed WHERE clause of the template, then show the result.
example['sql']['where'] = []
display(example)

{'db_id': 'geography',
 'query': "SELECT CITYalias0.CITY_NAME FROM CITY AS CITYalias0 WHERE CITYalias0.POPULATION = ( SELECT MAX ( CITYalias1.POPULATION ) FROM CITY AS CITYalias1 WHERE CITYalias1.STATE_NAME = 'arizona' ) AND CITYalias0.STATE_NAME = 'arizona' ;",
 'query_toks': ['SELECT',
  'CITYalias0.CITY_NAME',
  'FROM',
  'CITY',
  'AS',
  'CITYalias0',
  'WHERE',
  'CITYalias0.POPULATION',
  '=',
  '(',
  'SELECT',
  'MAX',
  '(',
  'CITYalias1.POPULATION',
  ')',
  'FROM',
  'CITY',
  'AS',
  'CITYalias1',
  'WHERE',
  'CITYalias1.STATE_NAME',
  '=',
  "'arizona'",
  ')',
  'AND',
  'CITYalias0.STATE_NAME',
  '=',
  "'arizona'",
  ';'],
 'question': 'what is the biggest city in arizona',
 'question_toks': ['what', 'is', 'the', 'biggest', 'city', 'in', 'arizona'],
 'sql': {'from': {'table_units': [['table_unit', 1]], 'conds': []},
  'select': [False, [[0, [0, [0, 3, False], None]]]],
  'where': [],
  'groupBy': [],
  'having': [],
  'orderBy': [],
  'limit': None,
  'intersect': No

In [41]:
# Export the GeoQuery train+dev questions as records tagged with db_id 'geography'.
# Every record carries the shared template `example['sql']`, not a parse of its own query.
pairs = get_nl_sql_pairs(
    os.path.join(data_dir, 'geoquery/geoquery.json'),
    ['train', 'dev'],
)
items = [
    {
        'db_id': 'geography',
        'query': query,
        'query_toks': query.split(),
        'sql': example['sql'],
        'question': question,
        'question_toks': question.split(),
    }
    for question, query in pairs
]
with open(os.path.join(output_dir, 'geography_origin.json'), 'w') as f:
    json.dump(items, f)

Getting examples with tag [geoquery]


In [45]:
# Export the ATIS dev questions as records tagged with db_id 'atis'.
# Every record carries the shared template `example['sql']`, not a parse of its own query.
pairs = get_nl_sql_pairs(
    os.path.join(data_dir, 'atis/atis.json'),
    ['dev'],
)
items = [
    {
        'db_id': 'atis',
        'query': query,
        'query_toks': query.split(),
        'sql': example['sql'],
        'question': question,
        'question_toks': question.split(),
    }
    for question, query in pairs
]
with open(os.path.join(output_dir, 'atis_origin.json'), 'w') as f:
    json.dump(items, f)

Getting examples with tag [atis]


In [43]:
# These four datasets are selected by the question splits named '0' through '9'.
# One '<db_name>_origin.json' file is written per dataset.
for db_name in ('restaurants', 'academic', 'yelp', 'imdb'):
    pairs = get_nl_sql_pairs(
        os.path.join(data_dir, f'{db_name}/{db_name}.json'),
        list(map(str, range(10))),
    )
    items = [
        {
            'db_id': db_name,
            'query': query,
            'query_toks': query.split(),
            'sql': example['sql'],
            'question': question,
            'question_toks': question.split(),
        }
        for question, query in pairs
    ]
    with open(os.path.join(output_dir, f'{db_name}_origin.json'), 'w') as f:
        json.dump(items, f)

Getting examples with tag [restaurants]
Getting examples with tag [academic]
Getting examples with tag [yelp]
Getting examples with tag [imdb]


In [44]:
# Scholar and Advising are selected by the 'train' and 'dev' question splits.
# One '<db_name>_origin.json' file is written per dataset.
for db_name in ('scholar', 'advising'):
    pairs = get_nl_sql_pairs(
        os.path.join(data_dir, f'{db_name}/{db_name}.json'),
        ['train', 'dev'],
    )
    items = [
        {
            'db_id': db_name,
            'query': query,
            'query_toks': query.split(),
            'sql': example['sql'],
            'question': question,
            'question_toks': question.split(),
        }
        for question, query in pairs
    ]
    with open(os.path.join(output_dir, f'{db_name}_origin.json'), 'w') as f:
        json.dump(items, f)

Getting examples with tag [scholar]
Getting examples with tag [advising]


In [16]:
# Names of the run directories whose inference output is converted further down.
# Each name is used as a sub-directory of both the NL2CodeOverData log directory
# (input) and the XSP output directory (output). Un-comment entries to include
# more runs; only the first one is currently active.
inferred_results = [
    "final_nocvlink_bert_large_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr7.44e-4_seed1_warmup5k-step40000",
#     "final_nocvlink_bert_large_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr7.44e-4_seed3_warmup5k-step40000",
#     "final_nocvlink_bert_large_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr7.44e-4_seed5_warmup5k-step40000",
#     "final_nocvlink_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr3e-4_seed1_warmup10k-step40000",
#     "final_nocvlink_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr3e-4_seed3_warmup10k-step40000",
#     "final_nocvlink_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr3e-4_seed5_warmup10k-step40000",
#     "final_nocvlink_unsupervised_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr3e-4_seed1_warmup10k-step40000",
#     "final_nocvlink_unsupervised_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr3e-4_seed3_warmup10k-step40000",
#     "final_nocvlink_unsupervised_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr3e-4_seed5_warmup10k-step40000",
#     "final_bert_large_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr7.44e-4_seed1_warmup5k-step40000",
#     "final_bert_large_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr7.44e-4_seed3_warmup5k-step40000",
#     "final_bert_large_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr7.44e-4_seed5_warmup5k-step40000",
#     "final_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr3e-4_seed1_warmup5k-step40000",
#     "final_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr3e-4_seed3_warmup5k-step40000",
#     "final_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr3e-4_seed5_warmup5k-step40000",
#     "final_unsupervised_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr7.44e-4_seed1_warmup10k-step40000",
#     "final_unsupervised_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr7.44e-4_seed3_warmup10k-step40000",
#     "final_unsupervised_neg1_1-1-1_anony5_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr7.44e-4_seed5_warmup10k-step40000"
]
# One-off creation of the per-run output directories, left disabled. os.mkdir raises
# if the directory already exists, so this cannot simply be re-run. The conversion
# further down opens files inside these directories for writing, so they must exist.
# for inferred_dir in inferred_results:
#     os.mkdir(os.path.join('../../../language/language/xsp/output',inferred_dir))

In [5]:
def convert_predictions(origs, inferreds, db_path):
    """Join inference output with its source examples, one record per inference.

    Args:
        origs: Sequence of source examples; each is read for 'db_id', 'query'
            and 'question'.
        inferreds: Iterable of inference results; each is read for 'index'
            (position of its source example in `origs`) and 'beams' (a list of
            dicts with 'inferred_code' and 'score').
        db_path: Directory under which '<db_id>/<db_id>.sqlite' is located.

    Returns:
        A list of dicts, in the order of `inferreds`, with the keys
        'predictions', 'scores', 'database_path', 'gold' and 'utterance'.
    """
    def _to_record(inferred):
        # Look the source example up by the index stored with the inference.
        source = origs[inferred['index']]
        beams = inferred['beams']
        return {
            'predictions': [beam['inferred_code'] for beam in beams],
            'scores': [beam['score'] for beam in beams],
            'database_path': os.path.join(
                db_path, f"{source['db_id']}/{source['db_id']}.sqlite"),
            'gold': source['query'],
            'utterance': source['question'],
        }

    return [_to_record(inferred) for inferred in inferreds]

In [6]:
# Directory that convert_predictions joins with '<db_id>/<db_id>.sqlite' to build each
# record's 'database_path'.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
db_path = '/datadrive/xiaden/workspace/featurestorage/data/spider-20200607/database/'

In [17]:
# Convert the raw inference logs of every run in `inferred_results` into per-dataset
# prediction files under the XSP output directory.
#
# NOTE: `predictions[inferred_dir]` is overwritten for each dataset, so after this cell
# it holds the predictions of the last dataset that converted successfully for that run.
predictions = {}

# (label, source) pairs. `label` is printed as progress and names the output file
# '<label>.infer'. `source` names the inputs 'val_origin_<source>.infer' and
# '<source>_origin.json'. Only GeoQuery uses two different names.
EVAL_DATASETS = [
    ('geo', 'geography'),
    ('imdb', 'imdb'),
    ('atis', 'atis'),
    ('restaurants', 'restaurants'),
    ('yelp', 'yelp'),
    ('advising', 'advising'),
    ('academic', 'academic'),
    ('scholar', 'scholar'),
]

for inferred_dir in inferred_results:
    run_output_dir = os.path.join('../../../language/language/xsp/output', inferred_dir)
    for label, source in EVAL_DATASETS:
        print(label)
        try:
            with open(os.path.join('../../../NL2CodeOverData/logdirs', inferred_dir,
                                   f'val_origin_{source}.infer'), 'r') as f1, \
                    open(f'../../../featurestorage/data/spider-20200607/{source}_origin.json', 'r') as f2:
                origs = json.load(f2)
                # The .infer file holds one JSON object per line; skip blank lines so a
                # trailing newline does not discard the whole dataset.
                inferreds = [json.loads(line.strip()) for line in f1 if line.strip()]

            predictions[inferred_dir] = convert_predictions(origs, inferreds, db_path)
            print(len(predictions[inferred_dir]))

            # open(..., 'w') does not create missing directories; make sure the run's
            # output directory exists. exist_ok keeps this safe to re-run.
            os.makedirs(run_output_dir, exist_ok=True)
            with open(os.path.join(run_output_dir, f'{label}.infer'), 'w') as f:
                json.dump(predictions[inferred_dir], f)
        except Exception as err:
            # Best effort: a missing or malformed input for one dataset must not stop
            # the others. Report which run and dataset failed and why, instead of
            # swallowing the error. KeyboardInterrupt/SystemExit still propagate.
            print(f'{inferred_dir}: {label} failed: {err!r}')

geo
598
imdb
131
atis
486
restaurants
378
yelp
128
advising
2858
academic
196
scholar
599


In [15]:
# Convert the Scholar inference log of a single run, without any error handling.
inferred_dir = "final_nocvlink_bert_large_bs24_decmin10_spideronly_nocolvaluev1_nomaskcol_noencaction_0_0_lr7.44e-4_seed1_warmup5k-step40000"

# Read both inputs first: the source examples and the one-JSON-object-per-line log.
with open(os.path.join('../../../NL2CodeOverData/logdirs', inferred_dir, 'val_origin_scholar.infer'), 'r') as f1, \
        open('../../../featurestorage/data/spider-20200607/scholar_origin.json', 'r') as f2:
    origs = json.load(f2)
    inferreds = [json.loads(line.strip()) for line in f1]

predictions[inferred_dir] = convert_predictions(origs, inferreds, db_path)
print(len(predictions[inferred_dir]))

with open(os.path.join('../../../language/language/xsp/output', inferred_dir, 'scholar.infer'), 'w') as f:
    json.dump(predictions[inferred_dir], f)

599
