In [1]:
import pandas as pd
import os
from nltk.tokenize import word_tokenize

In [2]:
# Maps the string form of a pandas dtype to the short type name that is
# written next to each column in the schema description.
dtypes = {
    'int64': 'int',
    'object': 'str',
    'float64': 'float',
}

In [3]:
# Build a lookup from table-file stem to a flat schema string of the form
# "<column> <type> <column> <type> ...", one entry per file in the database
# directory.
database_dir = './dataset/database/'
columns = {}

for file in os.listdir(database_dir):
    path = os.path.join(database_dir, file)

    # Skip hidden entries and directories (e.g. `.ipynb_checkpoints`, `.DS_Store`);
    # `pd.read_csv` would fail on them.
    if file.startswith('.') or not os.path.isfile(path):
        continue

    table = pd.read_csv(path)

    # A dtype missing from `dtypes` raises KeyError here.
    cols = [c.lower() + ' ' + dtypes[str(t)] for c, t in zip(table.columns, table.dtypes)]

    # Key is everything before the first '.' in the file name.
    # NOTE(review): the key keeps the file's original case, while the later
    # lookup lowercases its key -- confirm file names are all lowercase.
    columns[file.split('.', 1)[0]] = ' '.join(cols)

In [4]:
# Vocabularies are accumulated across all three splits.
vocab_source = set()
vocab_label = set()

# `DataFrame.to_csv` does not create missing directories, so make sure the
# output directory exists before writing any split.
os.makedirs('./dataset/final/', exist_ok=True)

for each in ['train.csv', 'dev.csv', 'test.csv']:
    df = pd.read_csv('./dataset/' + each)
    records = {'file': [], 'source': [], 'label': []}

    for _, row in df.iterrows():
        # Drop rows whose question is not a string (e.g. a missing value)
        # before doing any other work on them.
        if not isinstance(row['question'], str):
            continue

        vega_zero = row['vega_zero']
        tokens = vega_zero.lower().split(' ')

        # The token right after 'data' names the table; ValueError is raised
        # if 'data' is absent, KeyError if the table has no schema entry.
        table_key = (row['db_id'] + '_' + tokens[tokens.index('data') + 1]).lower()

        # Model input: table schema, a separator, then the question.
        source = columns[table_key] + ' | ' + row['question']

        records['file'].append(table_key)
        records['source'].append(source)
        records['label'].append(vega_zero)

        # Vocabulary entries are lowercased; the stored source/label keep their case.
        vocab_source.update(
            token.lower() for token in word_tokenize(source, language='english')
        )
        vocab_label.update(
            token.lower() for token in word_tokenize(vega_zero, language='english')
        )

    final = pd.DataFrame(records)
    final.to_csv('./dataset/final/' + each, index=False)

In [5]:
# Write each vocabulary as one token per line, sorted. The encoding is pinned
# to UTF-8 so that tokens with non-ASCII characters are written identically
# on every platform instead of depending on the locale's default encoding.
with open('./dataset/vocab_source.txt', 'w', encoding='utf-8') as vocab_file:
    vocab_file.write('\n'.join(sorted(vocab_source)))

with open('./dataset/vocab_label.txt', 'w', encoding='utf-8') as vocab_file:
    vocab_file.write('\n'.join(sorted(vocab_label)))