In [42]:
import argparse
import random
import os
import pickle
import glob
import json
import time
import logging
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import torch
from torch import nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
# from tqdm.notebook import tqdm_notebook as tqdm

from transformers import AdamW, get_linear_schedule_with_warmup
from table_bert import TableBertModel
from table_bert import Table, Column

[nltk_data] Downloading package punkt to /Users/yunyawang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Show full cell contents when displaying frames. `None` is the documented
# "no limit" value; the former -1 sentinel is deprecated and rejected by
# current pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the per-split question files; the three reads are independent of each other.
train_f = pd.read_json("./data/train_files.json")
dev_f = pd.read_json("./data/dev_files.json")
test_f = pd.read_json("./data/test_files.json")

# Quick sanity check of the schema and the first few test records.
print(test_f.columns)
test_f.head()

Index(['phase', 'table_id', 'question', 'sql', 'answer', 'sql_query'], dtype='object')


Unnamed: 0,phase,table_id,question,sql,answer,sql_query
0,1,1-10015132-16,What is terrence ross' nationality,"{'sel': 2, 'conds': [[0, 0, 'Terrence Ross']],...",[united states],"{'agg_index': 0, 'cond_ops': ['=', '>', '<', '..."
1,1,1-10015132-16,What clu was in toronto 1995-96,"{'sel': 5, 'conds': [[4, 0, '1995-96']], 'agg'...",[arkansas],"{'agg_index': 0, 'cond_ops': ['=', '>', '<', '..."
2,1,1-10015132-16,which club was in toronto 2003-06,"{'sel': 5, 'conds': [[4, 0, '2003-06']], 'agg'...",[michigan],"{'agg_index': 0, 'cond_ops': ['=', '>', '<', '..."
3,1,1-10015132-16,how many schools or teams had jalen rose,"{'sel': 5, 'conds': [[0, 0, 'Jalen Rose']], 'a...",[1],"{'agg_index': 3, 'cond_ops': ['=', '>', '<', '..."
4,1,1-10083598-1,Where was Assen held?,"{'sel': 2, 'conds': [[3, 0, 'Assen']], 'agg': 0}",[netherlands],"{'agg_index': 0, 'cond_ops': ['=', '>', '<', '..."


In [3]:
# NOTE(review): `spark` is not created in any cell visible in this notebook --
# confirm a SparkSession is bound to that name before this cell runs.
def _read_tables(jsonl_path):
    """Read a JSON-lines table dump through Spark and return it as a pandas
    frame whose `id` column is renamed to `table_id`."""
    tables = spark.read.json(jsonl_path).toPandas()
    return tables.rename(columns={'id':'table_id'})

test_tb = _read_tables("./data/test_tables.jsonl")
train_tb = _read_tables("./data/train_tables.jsonl")
dev_tb = _read_tables("./data/dev_tables.jsonl")
test_tb.loc[0]

caption                                                          R
header           [Player, No., Nationality, Position, Years in ...
table_id                                             1-10015132-16
name                                             table_10015132_16
page_id                                                        NaN
page_title                         Toronto Raptors all-time roster
rows             [[Aleksandar Radojević, 25, Serbia, Center, 19...
section_title                                                    R
types                         [text, text, text, text, text, text]
Name: 0, dtype: object

In [4]:
def header_trans(df):
    """Replace the `header` column with `header_new`, a per-table list of
    (column name, column type, sample value) triples.

    The sample value of each column is taken from the table's first row.
    A table with no rows gets `None` as the sample value for every column
    (the previous implementation raised IndexError in that case).

    Args:
        df: frame with list-like `header`, `types` and `rows` columns.

    Returns:
        A new frame without `header` and with `header_new` appended as the
        last column. The input frame is left untouched.
    """
    header_list = []
    # Iterate the three columns directly; this avoids the per-row Series
    # construction that `iterrows` performs.
    for header, types, rows in zip(df['header'], df['types'], df['rows']):
        # len() rather than truthiness: `rows` may be a numpy array.
        first_row = rows[0] if len(rows) > 0 else [None] * len(header)
        header_list.append(list(zip(header, types, first_row)))
    # drop() returns a new frame, so the caller's frame is never mutated.
    return df.drop(columns=['header']).assign(header_new=header_list)

In [5]:
# Swap the raw `header` column for (name, type, sample value) triples in every split.
# Not idempotent: `header_trans` drops `header`, so re-running this cell on the
# already-transformed frames raises KeyError.
test_tb, train_tb, dev_tb = [
    header_trans(tables) for tables in (test_tb, train_tb, dev_tb)
]

In [6]:
# Inspect the first table record after the header transformation.
test_tb.loc[0]

caption                                                          R
table_id                                             1-10015132-16
name                                             table_10015132_16
page_id                                                        NaN
page_title                         Toronto Raptors all-time roster
rows             [[Aleksandar Radojević, 25, Serbia, Center, 19...
section_title                                                    R
types                         [text, text, text, text, text, text]
header_new       [(Player, text, Aleksandar Radojević), (No., t...
Name: 0, dtype: object

In [7]:
# Attach every question to its table (default inner join on `table_id`) and
# rename the columns to the names the dataset class reads later on:
# `context`, `title` and `header`.
test = test_f.merge(test_tb, on=['table_id']).rename(
    columns={'question':'context','page_title':'title','header_new':'header'})
train = train_f.merge(train_tb, on=['table_id']).rename(
    columns={'question':'context','page_title':'title','header_new':'header'})
dev = dev_f.merge(dev_tb, on=['table_id']).rename(
    columns={'question':'context','page_title':'title','header_new':'header'})

In [10]:
# Inspect one merged question/table record.
test.iloc[1]

phase            1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
table_id         1-10015132-16                                                                                                                                                                                                                                                                                                                                                                                                                

In [None]:
# Persist the merged splits so later cells can reload them without redoing the merge.
for split_frame, out_path in (
    (test, './data/test_tabert.json'),
    (train, './data/train_tabert.json'),
    (dev, './data/dev_tabert.json'),
):
    split_frame.to_json(out_path)

### Subset of the training sample: first 100 rows

In [81]:
# Reload the merged train/dev splits written by the cell above.
dev = pd.read_json('./data/dev_tabert.json')
train = pd.read_json('./data/train_tabert.json')

In [82]:
# Keep only the first 100 rows of each split for quick experiments.
train_subset = train.iloc[:100]
dev_subset = dev.iloc[:100]

In [84]:
# Confirm both subsets contain the expected number of rows.
print(len(train_subset),len(dev_subset))

100 100


In [85]:
# Write the 100-row subsets next to the full splits.
for subset_frame, subset_path in (
    (train_subset, './data/train_tabert_100.json'),
    (dev_subset, './data/dev_tabert_100.json'),
):
    subset_frame.to_json(subset_path)

### Creating Dictionary

In [43]:
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    When CUDA is available, every CUDA device is seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

In [44]:
class Lang:
    """Answer-side vocabulary: token <-> index maps plus token counts.

    The four special tokens (pad / unk / sos / eos) occupy the first four
    slots. The class reads the module-level constants ``PAD_IDX``,
    ``UNK_IDX``, ``SOS_IDX``, ``EOS_IDX`` and the matching ``*_TOKEN``
    strings, so they must be defined before an instance is created.
    ``txt2vec`` additionally reads a module-level ``device``.
    NOTE(review): ``device`` is not defined in the notebook cells visible
    here -- confirm it is set before ``txt2vec`` is called.

    Args:
        minimum_count: a word receives an index once it has been seen at
            least this many times.
    """

    def __init__(self, minimum_count=1):
        self.word2index = {}
        self.word2count = {}
        self.index2word = [None] * 4

        for idx, token in (
            (SOS_IDX, SOS_TOKEN),
            (EOS_IDX, EOS_TOKEN),
            (UNK_IDX, UNK_TOKEN),
            (PAD_IDX, PAD_TOKEN),
        ):
            self.index2word[idx] = token
            # Special tokens start with a count of 100.
            self.word2count[token] = 100
            self.word2index[token] = idx

        self.n_words = 4  # the four special tokens registered above
        self.minimum_count = minimum_count

    def add_ans(self, ans):
        """Register every token of one answer (an iterable of strings), lower-cased."""
        for word in ans:
            self.addWord(word.lower())

    def addWord(self, word):
        """Count one occurrence of `word` and index it once it is frequent enough.

        The word is converted to `str` up front so that counts and indices
        share the same key. Membership is tested against the `word2index`
        dict (constant time) rather than by scanning the `index2word` list,
        which made vocabulary construction quadratic and appended a
        duplicate entry on every repeat of a non-string token.
        """
        word = str(word)
        self.word2count[word] = self.word2count.get(word, 0) + 1
        if self.word2count[word] >= self.minimum_count and word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word.append(word)
            self.n_words += 1

    def vec2txt(self, list_idx):
        """Turn a sequence of indices into a space-joined string.

        Accepts plain ints or elements exposing `.item()` (e.g. the elements
        of a 1-D tensor). EOS, SOS and PAD indices are skipped.
        """
        skipped = (EOS_IDX, SOS_IDX, PAD_IDX)
        word_list = []
        for i in list_idx:
            idx = i.item() if hasattr(i, 'item') else i
            if idx not in skipped:
                word_list.append(self.index2word[idx])
        return (' ').join(word_list)

    def txt2vec(self, ans):
        """Map tokens to indices (unknown tokens become UNK_IDX) and return
        them as an integer tensor moved to the module-level `device`.

        The dtype is pinned to `torch.long` so that an empty token list does
        not silently produce a float tensor.
        """
        index_list = [self.word2index.get(token, UNK_IDX) for token in ans]
        return torch.tensor(index_list, dtype=torch.long).to(device)



In [70]:
class voc():
    """Builds (or loads) the answer vocabulary and encodes answers as
    fixed-length index sequences.

    Args:
        df: the answers, as a list of token lists.
        voc_location: directory holding the pickled vocabulary.
        minimum_count: forwarded to `Lang` when a new vocabulary is built.
        max_num: maximum number of answer tokens kept per sequence; every
            encoded sequence has length `max_num + 1` (tokens, EOS, then
            padding). `None` disables truncation and padding.

    Attributes:
        main_df: frame with `target_tokenized`, `target_indized` and
            `target_len`; rows whose answer is empty are dropped, the
            surviving rows keep their original positional index labels.
        target_voc: the vocabulary object (a `Lang` when newly created).

    Reads the module-level constants `PAD_IDX`, `UNK_IDX` and `EOS_IDX`.
    """

    def __init__(self, df, voc_location, minimum_count = 1, max_num = 35):
        self.df = df
        self.minimum_count = minimum_count
        self.max_num = max_num
        self.voc_location = voc_location
        self.main_df, self.target_voc = self.load_or_create_voc()

    def __len__(self):
        # Number of encoded answers. `max_num` is a sequence length, not a
        # sample count, so it must not be reported as the dataset size.
        return len(self.main_df)

    def __getitem__(self, idx):
        """Return `[index sequence, target length]` for the idx-th kept answer."""
        row = self.main_df.iloc[idx]
        return [row['target_indized'], row['target_len']]

    def load_or_create_voc(self):
        """Load the cached vocabulary (or build and cache it), then encode `self.df`.

        The cache file name depends only on `minimum_count` and `max_num`,
        so an existing pickle is reused regardless of the contents of `df`.

        Returns:
            (main_df, target_voc)
        """
        os.makedirs(self.voc_location, exist_ok=True)
        full_file_path = os.path.join(self.voc_location, 'mincnt_maxnum' +
                                      str(self.minimum_count) + '_' +
                                      str(self.max_num)+'.p')
        if os.path.isfile(full_file_path):
            print('Load Pre-existing Voc Dictionary')
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # vocabulary files produced by this notebook.
            with open(full_file_path, 'rb') as voc_file:
                target_voc = pickle.load(voc_file)
        else:
            print('Create New Voc Dictionary')
            target_voc = Lang(minimum_count = self.minimum_count)
            for ans in self.df:  # load ans into voc
                target_voc.add_ans(ans)
            with open(full_file_path, 'wb') as voc_file:
                pickle.dump(target_voc, voc_file)

        indices_data = []
        target_lens = []
        for ans in self.df:  # ans tokens to idx
            index_list = [target_voc.word2index.get(token, UNK_IDX) for token in ans]
            if self.max_num is not None:
                index_list = index_list[:self.max_num]
            # EOS directly follows the (possibly truncated) tokens so that
            # `target_len` covers exactly tokens + EOS; padding comes after.
            index_list.append(EOS_IDX)
            target_lens.append(len(index_list))
            if self.max_num is not None:
                index_list += [PAD_IDX] * (self.max_num + 1 - len(index_list))
            indices_data.append(index_list)

        main_df = pd.DataFrame()
        main_df['target_tokenized'] = self.df
        main_df['target_indized'] = indices_data
        main_df['target_len'] = target_lens  # token count (after truncation) + 1 for EOS
        main_df = main_df[main_df['target_len'] >= 2]  # filter out ans that are empty
        return main_df, target_voc

In [71]:
class WikiDataset():
    """Question/table/answer samples prepared for TaBERT.

    Args:
        path: JSON file readable by `pd.read_json` with the columns
            `context`, `answer`, `header`, `title` and `rows`.
        voc_location: directory for the pickled answer vocabulary.
        model: object exposing `.tokenizer` with a `tokenize` method.
        minimum_count, max_num: forwarded to `voc`.

    After construction the instance holds four aligned parts: `tabs`
    (tokenized tables), `context` (tokenized questions), `answers`
    (index sequences) and `target_voc`.
    """

    def __init__(self, path, voc_location, model, minimum_count=1, max_num=35):
        self.path = path
        self.voc_location = voc_location
        self.model = model
        self.minimum_count = minimum_count
        self.max_num = max_num

        self.data = pd.read_json(self.path)
        self.data['title'] = self.data['title'].fillna('unknown')
        self.data = self.data.reset_index(drop=True)

        self.tabs = []
        self.context = []
        self.answers = []

        self._build()
        self.voc_obj = voc(self.answers, self.voc_location,
                           minimum_count=self.minimum_count, max_num=self.max_num)

        # `voc` drops samples whose answer tokenizes to nothing. Its frame
        # keeps the original positional labels, so apply the same selection
        # to tables and questions; otherwise every sample after a dropped
        # one would be paired with the wrong answer.
        kept = list(self.voc_obj.main_df.index)
        self.tabs = [self.tabs[i] for i in kept]
        self.context = [self.context[i] for i in kept]
        self.answers = self.voc_obj.main_df.target_indized.tolist()
        self.target_voc = self.voc_obj.target_voc

    def _build(self):
        """Tokenize every record into a table, a question and an answer."""
        tokenizer = self.model.tokenizer
        for idx in tqdm(range(len(self.data))):
            qs = self.data.loc[idx, 'context']
            ans = self.data.loc[idx, 'answer']
            heads = self.data.loc[idx, 'header']
            tit = self.data.loc[idx, 'title']
            rs = self.data.loc[idx, 'rows']

            # Each header entry is a (name, type, sample value) triple.
            col = [Column(z[0], z[1], sample_value=z[2]) for z in heads]
            table = Table(
                id=tit,
                header=col,
                data=rs
            ).tokenize(tokenizer)
            self.tabs.append(table)

            self.context.append(tokenizer.tokenize(qs))
            # Only the first answer is used. An empty answer list becomes an
            # empty string here instead of raising IndexError.
            first_answer = str(ans[0]) if len(ans) > 0 else ''
            self.answers.append(tokenizer.tokenize(first_answer))

    def __len__(self):
        return len(self.context)

    def __getitem__(self, index):
        """Return one sample as a dict with keys `table`, `context`, `answer`."""
        tabi = self.tabs[index]
        conti = self.context[index]
        ansi = self.answers[index]
        return {"table": tabi, "context": conti, "answer": ansi}


def get_dataset(path, voc_location, model):
    """Build a `WikiDataset` for `path`, leaving its other options at their defaults."""
    dataset = WikiDataset(path, voc_location, model)
    return dataset

def collate_fn(batch):
    """Regroup a list of sample dicts into per-field collections.

    Returns a 3-tuple: the list of `table` values, the list of `context`
    values, and the `answer` values stacked into one tensor.
    """
    tables = [sample['table'] for sample in batch]
    contexts = [sample['context'] for sample in batch]
    answers = torch.tensor([sample['answer'] for sample in batch])
    return tables, contexts, answers


In [57]:
# Load the TaBERT model; its tokenizer is what the dataset class uses to
# tokenize tables, questions and answers.
# NOTE(review): 'bert-base-uncased' is a plain BERT model name -- confirm this
# call yields the intended pretrained TaBERT weights.
model = TableBertModel.from_pretrained('bert-base-uncased')

In [58]:
set_seed(42)

# Reserved vocabulary indices, numbered consecutively from 0.
# NOTE(review): the vocabulary class reserves only the first four slots, so
# index 4 (SEP_IDX) is also handed to the first regular word -- confirm SEP is unused.
PAD_IDX, UNK_IDX, SOS_IDX, EOS_IDX, SEP_IDX = range(5)

# Surface forms of the special tokens.
PAD_TOKEN, UNK_TOKEN = '<pad>', '<unk>'
SOS_TOKEN, EOS_TOKEN = '<sos>', '<eos>'
SEP_TOKEN = '<sep>'

In [72]:
# Build the training dataset from the merged JSON; the answer vocabulary is
# loaded from (or created under) ./voc. This rebinds `train`, which earlier
# cells used for the merged DataFrame.
train = WikiDataset("./data/train_tabert.json",'./voc',model)

100%|██████████| 56355/56355 [05:21<00:00, 175.08it/s] 


Load Pre-existing Voc Dictionary


In [75]:
# Report the vocabulary size (`n_words` includes the four special tokens).
print('The len of the voc is :',train.target_voc.n_words)

The len of the voc is : 11575
