In [71]:
from typing import Iterator, List, Dict, Optional
import torch
import torch.optim as optim
from torch.nn import MSELoss
from torch.nn import functional as F
from torch.nn import ModuleList

import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, ArrayField, MetadataField, ListField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding, TokenEmbedder
from allennlp.modules.token_embedders.pretrained_transformer_embedder import PretrainedTransformerEmbedder
from allennlp.modules.token_embedders.pretrained_transformer_mismatched_embedder import PretrainedTransformerMismatchedEmbedder
# from allennlp.modules.seq2seq_encoders.multi_head_self_attention import MultiHeadSelfAttention
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.seq2vec_encoders.cnn_encoder import CnnEncoder
from allennlp.modules.attention import Attention
from allennlp.modules.matrix_attention.matrix_attention import MatrixAttention
from allennlp.modules.matrix_attention.linear_matrix_attention import LinearMatrixAttention
from allennlp.modules.matrix_attention.cosine_matrix_attention import CosineMatrixAttention

from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits, \
    get_device_of, masked_softmax, weighted_sum, \
    get_mask_from_sequence_lengths, get_lengths_from_binary_sequence_mask, tensors_equal

from allennlp.training.metrics import CategoricalAccuracy, MeanAbsoluteError
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.dataloader import DataLoader
from allennlp.training.trainer import GradientDescentTrainer
# from allennlp.predictors import Predictor, Seq2SeqPredictor, SimpleSeq2SeqPredictor, SentenceTaggerPredictor
from allennlp.predictors import Predictor, SentenceTaggerPredictor
from allennlp.nn.activations import Activation
from allennlp.common.tqdm import Tqdm
from allennlp.common.params import Params
from allennlp.common.util import JsonDict, sanitize

from allennlp_models.generation.predictors import Seq2SeqPredictor
from allennlp_models.generation.models.simple_seq2seq import SimpleSeq2Seq

# from spacy.tokenizer import Tokenizer as SpacyTokenizer
# from spacy.lang.en import English
# nlp = English()
# Create a blank Tokenizer with just the English vocab
# tokenizer = Tokenizer(nlp.vocab)

from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

from tqdm import tqdm

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import ShortTermFeatures

import os
import itertools
import json
from collections import defaultdict
from inspect import signature
import warnings
import pickle
import importlib
from copy import deepcopy

from transformers import BertPreTrainedModel, BertModel, BertConfig, BertTokenizer

from utils.spider import process_sql, evaluation
from utils.schema_gnn.spider_utils import Table, TableColumn, read_dataset_schema
from utils.misc_utils import EvaluateSQL

from dataset_readers.reranker_reader import extractAudioFeatures, extractAudioFeatures_NoPooling, dbToTokens, \
    SpiderASRRerankerReaderV2_Siamese
from dataset_readers.reranker_reader_legacy import SpiderASRRerankerReaderV1, SpiderASRRerankerReaderV2
from modules.encoder import SpeakQLEncoderV1
from models.reranker import SpiderASRRerankerV2, SpiderASRReranker_Siamese
from models.reranker_legacy import SpiderASRRerankerV0, SpiderASRRerankerV1
from predictors.reranker_predictor import SpiderASRRerankerPredictor, SpiderASRRerankerPredictor_Siamese

torch.manual_seed(1)

<torch._C.Generator at 0x10dd0e730>

#### In future: prototype things here first; once finished, move the code into Python modules and import it

### Dataset Reader

In [11]:
AUDIO_DIM = 136
AUDIO_DIM_NO_POOLING = 68

In [None]:
# Root directory of the local Spider dataset. It can be overridden with the
# SPIDER_DIR environment variable; the default is the path this notebook was
# originally written against, so existing runs are unaffected.
SPIDER_DIR = os.environ.get('SPIDER_DIR', '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider')
tables_json_fname = os.path.join(SPIDER_DIR, 'tables.json')
dataset_reranker_dir = os.path.join(SPIDER_DIR, 'my')

# A single indexer under the key 'bert'; the same key is used for the token
# embedder of the model further down.
token_indexers = {'bert': TokenIndexer.by_name('pretrained_transformer_mismatched')('bert-base-uncased')}

# Siamese (pairwise) reader used for the train and dev splits.
# NOTE(review): debug=True is passed straight to the reader; presumably it
# limits how much data is read — confirm in dataset_readers.reranker_reader.
dataset_reader = SpiderASRRerankerReaderV2_Siamese(tables_json_fname=tables_json_fname,
                                                   dataset_reranker_dir=dataset_reranker_dir,
                                                   token_indexers=token_indexers,
                                                   debug=True)

train_dataset = dataset_reader.read('train')
dev_dataset = dataset_reader.read('dev')
# The test split is not read here.
# test_dataset = dataset_reader.read('test')

In [None]:
train_dataset[0].fields

In [None]:
# Summary of the "_1" half of the first training instance: sentence length,
# shape and sum of each mask, the metadata, and the size of the audio feature
# list together with the shape of its first array.
_fields_1 = train_dataset[0].fields
_audio_list_1 = _fields_1['audio_feats_1'].field_list
(
    len(_fields_1['sentence_1']),
    _fields_1['text_mask_1'].array.shape,
    _fields_1['text_mask_1'].array.sum(),
    _fields_1['schema_mask_1'].array.shape,
    _fields_1['schema_mask_1'].array.sum(),
    _fields_1['audio_mask_1'].array.shape,
    _fields_1['audio_mask_1'].array.sum(axis=-1),
    _fields_1['metadata_1'].metadata,
    len(_audio_list_1),
    _audio_list_1[0].array.shape,
)

In [None]:
# Summary of the "_2" half of the first training instance: sentence length,
# shape and sum of each mask, the metadata, and the size of the audio feature
# list together with the shape of its first array.
_fields_2 = train_dataset[0].fields
_audio_list_2 = _fields_2['audio_feats_2'].field_list
(
    len(_fields_2['sentence_2']),
    _fields_2['text_mask_2'].array.shape,
    _fields_2['text_mask_2'].array.sum(),
    _fields_2['schema_mask_2'].array.shape,
    _fields_2['schema_mask_2'].array.sum(),
    _fields_2['audio_mask_2'].array.shape,
    _fields_2['audio_mask_2'].array.sum(axis=-1),
    _fields_2['metadata_2'].metadata,
    len(_audio_list_2),
    _audio_list_2[0].array.shape,
)

In [None]:
# Pair the shape of every audio feature array of the first instance with the
# sentence token found at the same position.
_first_instance_fields = train_dataset[0].fields
audio_shapes = [feat.array.shape for feat in _first_instance_fields['audio_feats_1'].field_list]
tokens = list(_first_instance_fields['sentence_1'])
list(zip(audio_shapes, tokens))


### Model

In [None]:
# Text-field embedder with a single token embedder registered under "bert",
# the same key used in `token_indexers`. The older `embedder_to_indexer_map`
# argument ("bert" -> ["bert", "bert-offsets"]) is left disabled.
_bert_token_embedder_cls = TokenEmbedder.by_name("pretrained_transformer_mismatched")
bert_embedder = BasicTextFieldEmbedder(
    token_embedders={"bert": _bert_token_embedder_cls("bert-base-uncased")}
)

In [None]:
bert_embedder.token_embedder_bert.__dict__.keys()

In [None]:
_bert_indexer = token_indexers['bert']
_bert_indexer
# vocab = Vocabulary({'token_bert': vocab_dict})

In [None]:
vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
vocab

In [104]:
# --- Dimensions used to wire the reranker together ---------------------------
EMBEDDING_DIM = 768          # text embedding width used to size the LSTM input
AUDIO_DIM_NO_POOLING = 68    # embedding_dim of the audio CnnEncoder
AUDIO_ENC_DIM = 128          # output_dim of the audio CnnEncoder
LSTM_DIM = 32                # hidden size of the LSTM seq2vec encoder
SA_HEADS = 3                 # only referenced by the disabled self-attention layer in this notebook
SA_DIM = 3 * 32

margin = 0.25                # margin handed to the Siamese wrapper

# A MultiHeadSelfAttention layer (num_heads=SA_HEADS, attention/values/output
# dims = SA_DIM, input_dim = EMBEDDING_DIM + AUDIO_DIM, no attention dropout)
# was prototyped here; it is disabled and its import is commented out too.

# CNN that turns an audio feature sequence into one vector.
audio_s2v = CnnEncoder(
    embedding_dim=AUDIO_DIM_NO_POOLING,
    num_filters=4,
    ngram_filter_sizes=(2, 3, 4, 5),
    output_dim=AUDIO_ENC_DIM,
)

# LSTM over inputs of width EMBEDDING_DIM + AUDIO_ENC_DIM, wrapped as a seq2vec encoder.
_sentence_lstm = torch.nn.LSTM(
    input_size=EMBEDDING_DIM + AUDIO_ENC_DIM,
    hidden_size=LSTM_DIM,
    batch_first=True,
)
lstm_s2v = PytorchSeq2VecWrapper(_sentence_lstm)

# Scoring model for a single candidate.
model = SpiderASRRerankerV2(
    word_embeddings=bert_embedder,
    bert_pretrained_model='bert-base-uncased',
    audio_seq2vec_encoder=audio_s2v,
    audio_attention_layer=CosineMatrixAttention(),
    audio_attention_residual='+',
    seq2seq_encoders=None,
    seq2vec_encoder=lstm_s2v,
    ff_dimension=8,
    concat_audio=True,
    vocab=vocab,
)

# Pairwise wrapper around `model`; this is the module handed to the trainer.
siamese_model = SpiderASRReranker_Siamese(model, margin=margin)

In [105]:
train_dataset[0].fields

{'sentence_1': <allennlp.data.fields.text_field.TextField at 0x183998a00>,
 'text_mask_1': <allennlp.data.fields.array_field.ArrayField at 0x183998690>,
 'schema_mask_1': <allennlp.data.fields.array_field.ArrayField at 0x183998640>,
 'audio_feats_1': <allennlp.data.fields.list_field.ListField at 0x13f5cfe10>,
 'audio_mask_1': <allennlp.data.fields.array_field.ArrayField at 0x183998050>,
 'metadata_1': <allennlp.data.fields.metadata_field.MetadataField at 0x140d85450>,
 'sentence_2': <allennlp.data.fields.text_field.TextField at 0x18399a0f0>,
 'text_mask_2': <allennlp.data.fields.array_field.ArrayField at 0x18399a1e0>,
 'schema_mask_2': <allennlp.data.fields.array_field.ArrayField at 0x18399ae10>,
 'audio_feats_2': <allennlp.data.fields.list_field.ListField at 0x1406866d0>,
 'audio_mask_2': <allennlp.data.fields.array_field.ArrayField at 0x18399ae60>,
 'metadata_2': <allennlp.data.fields.metadata_field.MetadataField at 0x13ddd12d0>}

In [106]:
train_dataset[0]['sentence_1'].__slots__

['tokens', '_token_indexers', '_indexed_tokens']

In [107]:
# Build the optimizer from the module that is actually trained. The trainer
# below receives `siamese_model`, so taking the parameters from the wrapped
# `model` would silently skip any parameter owned by the wrapper itself.
optimizer = optim.SGD(siamese_model.parameters(), lr=0.1)

# (Legacy alternatives — BucketIterator / BucketBatchSampler with sorting key
# ("sentence_1", "num_tokens") — are not used; plain data loaders are used instead.)

# Attach the vocabulary to both datasets (done in place on each dataset object).
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)

# Batches of 8; only the training data is shuffled.
train_data_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
dev_data_loader = DataLoader(dev_dataset, batch_size=8, shuffle=False)

# CPU-only run (cuda_device=-1) for a single epoch.
# NOTE(review): patience=10 cannot trigger early stopping while num_epochs=1;
# raise num_epochs for a real training run.
trainer = GradientDescentTrainer(model=siamese_model,
                                 optimizer=optimizer,
                                 data_loader=train_data_loader,
                                 validation_data_loader=dev_data_loader,
                                 patience=10,
                                 num_epochs=1,
                                 cuda_device=-1)
trainer.train()

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




{'best_epoch': 0,
 'peak_worker_0_memory_MB': 1681.453056,
 'training_duration': '0:01:19.082855',
 'training_start_epoch': 0,
 'training_epochs': 0,
 'epoch': 0,
 'training_loss': 0.24971853467551144,
 'training_reg_loss': 0.0,
 'training_worker_0_memory_MB': 1681.453056,
 'validation_loss': 0.2497857684890429,
 'validation_reg_loss': 0.0,
 'best_validation_loss': 0.2497857684890429,
 'best_validation_reg_loss': 0.0}

In [None]:
train_dataset[0]

In [None]:
vars(train_dataset[0])

In [None]:
# NOTE(review): the instance fields listed earlier in this notebook are
# 'sentence_1' / 'sentence_2'; a plain 'sentence' key only shows up in the
# batch produced from the non-Siamese data. Confirm which reader this cell
# expects before re-running it.
vars(train_dataset[0]['sentence'])

In [None]:
# NOTE(review): no 'score' field appears among the Siamese instance fields
# listed earlier in this notebook; presumably this cell targets instances from
# the non-Siamese reader — confirm before re-running it.
vars(train_dataset[0]['score'])

In [176]:
# NOTE(review): `iterator` is not assigned by any executable code visible in
# this notebook, so this cell (and the `_batch` cells that follow it) only
# works with leftover kernel state and raises NameError on a fresh kernel.
_iter = iterator._create_batches(dev_dataset, shuffle=False)

In [177]:
_batch = next(_iter)

In [178]:
_batch_tensor_dict = _batch.as_tensor_dict()

In [179]:
_batch_tensor_dict.keys()

dict_keys(['sentence', 'text_mask', 'schema_mask', 'audio_feats', 'audio_mask', 'metadata', 'score'])

In [153]:
# Per-example lengths derived from the text mask and from the schema mask of the batch.
get_lengths_from_binary_sequence_mask(_batch_tensor_dict['text_mask']), \
get_lengths_from_binary_sequence_mask(_batch_tensor_dict['schema_mask'])
# TODO (open question): every schema-mask length comes out as 63 for this batch — why?

(tensor([15, 14, 15, 15, 11, 12,  7, 15]),
 tensor([63, 63, 63, 63, 63, 63, 63, 63]))

In [154]:
_batch_tensor_dict['schema_mask'].size()

torch.Size([8, 79])

In [155]:
get_text_field_mask(_batch_tensor_dict['sentence']).size()

torch.Size([8, 79])

In [156]:
_batch_tensor_dict['audio_mask'].size()

torch.Size([8, 79, 68])

In [None]:
all_instance_fields_and_types = [{k: v.__class__.__name__ for k, v in x.fields.items()} for x in train_dataset]

In [None]:
all_instance_fields_and_types

### Predict

In [108]:
class SpiderASRRerankerPredictor(Predictor):
    """Predictor that adds the question text and the original example id to the model output.

    NOTE(review): this notebook-local definition shadows the class of the same
    name imported from ``predictors.reranker_predictor`` in the imports cell.
    """

    def predict_instance(self, instance: Instance) -> JsonDict:
        """Run the model on ``instance`` and return the sanitized output dict.

        Two keys are added to whatever the model returns:

        * ``question`` – the items of the ``sentence`` field rendered with
          ``str`` and joined by single spaces, cut at the first ``' [SEP] '``.
        * ``original_id`` – ``instance.fields['metadata']['original_id']``.
        """
        result = self._model.forward_on_instance(instance)
        fields = instance.fields
        # Keep only what precedes the first separator.
        joined_tokens = ' '.join(str(token) for token in fields['sentence'])
        result['question'] = joined_tokens.partition(' [SEP] ')[0]
        result['original_id'] = fields['metadata']['original_id']
        return sanitize(result)

In [109]:
class SpiderASRRerankerPredictor_Siamese(Predictor):
    """Predictor for a Siamese-wrapped reranker that scores single (non-paired) instances.

    Prediction is delegated to the wrapped ``regression_model`` of the model
    this predictor was built with.

    NOTE(review): this notebook-local definition shadows the class of the same
    name imported from ``predictors.reranker_predictor`` in the imports cell.
    """

    def predict_instance(self, instance: Instance) -> JsonDict:
        """Run ``self._model.regression_model`` on ``instance`` and return the sanitized output.

        Two keys are added to whatever the model returns:

        * ``question`` – the items of the ``sentence`` field rendered with
          ``str`` and joined by single spaces, cut at the first ``' [SEP] '``.
        * ``original_id`` – ``instance.fields['metadata']['original_id']``.
        """
        scorer = self._model.regression_model
        result = scorer.forward_on_instance(instance)
        fields = instance.fields
        # Keep only what precedes the first separator.
        joined_tokens = ' '.join(str(token) for token in fields['sentence'])
        result['question'] = joined_tokens.partition(' [SEP] ')[0]
        result['original_id'] = fields['metadata']['original_id']
        return sanitize(result)

In [110]:
dataset_reader_test = SpiderASRRerankerReaderV2(tables_json_fname=tables_json_fname,
                                         dataset_reranker_dir=dataset_reranker_dir,
                                         token_indexers=token_indexers,
                                         debug=True)
test_dataset = dataset_reader_test.read('test')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [111]:
predictor = SpiderASRRerankerPredictor_Siamese(siamese_model, dataset_reader_test)

In [112]:
predictor.predict_instance(test_dataset[0])

{'score_preds': 0.43298661708831787,
 'loss': 0.3215041756629944,
 'question': 'find the number of pets whose weight is heavier than 10 .',
 'original_id': 45}

In [None]:
vars(test_dataset[0])

In [None]:
vars(_batch.instances[0])

In [None]:
for _inst in _batch.instances:
    print(_inst.fields['sentence'])

#### Evaluate (moved to ratsql-infer.ipynb)

In [213]:
## Evaluate for comparing groups (FIRST, ORACLE, RANDOM)
#
# `golds` is iterated as a list with one entry per question; each entry is a
# list of candidate dicts read with the keys 'original_id',
# 'ratsql_pred_score', 'ratsql_pred_exact', 'question_toks' and
# 'gold_question_toks'.
# NOTE(review): `golds` is not defined in the visible part of this notebook
# (the section header says evaluation moved to ratsql-infer.ipynb).

# gold_scores_list = [] # List[List]: gold scores for each ASR candidate

hyp_first_list = []   # first candidate of every question
hyp_oracle_list = []  # Oracle for rat-sql score, not bleu
hyp_all_list = []     # every candidate of every question
ref_list = []         # one single-reference list per question
ref_all_list = []     # one single-reference list per candidate (parallel to hyp_all_list)

eval_ids = []

# Selection accuracy: is the chosen candidate tied for the best rat-sql score?
first_corr = 0
expect_corr = 0

# Sums of rat-sql prediction scores under each selection strategy.
first_score_sum = 0
expect_score_sum = 0
oracle_score_sum = 0

# Sums of rat-sql exact-match values under each selection strategy.
first_exact_sum = 0
expect_exact_sum = 0
oracle_exact_sum = 0

for curr_golds in golds:

    if len(curr_golds) == 0:
        # skipped
        continue

    o_id = curr_golds[0]['original_id']
    eval_ids.append(o_id)

    # eval
    g = [cand['ratsql_pred_score'] for cand in curr_golds]
    g_ex = [cand['ratsql_pred_exact'] for cand in curr_golds]

    gold_max = np.max(g)
    if np.isclose(g[0], gold_max):
        first_corr += 1
    # Chance of hitting a best-scoring candidate when picking uniformly at random.
    expect_corr += sum(np.isclose(s, gold_max) for s in g) / len(g)

    first_score_sum += g[0]
    expect_score_sum += np.mean(g)
    oracle_score_sum += gold_max

    first_exact_sum += g_ex[0]
    expect_exact_sum += np.mean(g_ex)
    oracle_exact_sum += np.max(g_ex)

    # keep track of selected & ref (original) sentences
    oracle_c_id = np.argmax(g)

    curr_hyps = [[w.lower() for w in cand['question_toks']] for cand in curr_golds]
    ref = [w.lower() for w in curr_golds[0]['gold_question_toks']]

    hyp_first_list.append(curr_hyps[0])
    hyp_oracle_list.append(curr_hyps[oracle_c_id])
    ref_list.append([ref])
    hyp_all_list.extend(curr_hyps)
    ref_all_list.extend([[ref] for _ in curr_golds])

len(hyp_first_list), len(hyp_oracle_list), len(ref_list), len(hyp_all_list), len(ref_all_list), len(eval_ids)

(100, 100, 100, 712, 712, 100)

In [214]:
# Corpus BLEU against the reference questions for the first candidates, the
# oracle candidates, and all candidates.
bleu_first, bleu_oracle = (
    corpus_bleu(list_of_references=ref_list, hypotheses=selected_hyps)
    for selected_hyps in (hyp_first_list, hyp_oracle_list)
)
bleu_all = corpus_bleu(list_of_references=ref_all_list, hypotheses=hyp_all_list)

In [215]:
# Report the statistics accumulated by the evaluation loop. The "Oracle" rows
# use oracle_score_sum / oracle_exact_sum, which are the names that loop
# accumulates; the previously printed gold_*_sum names are not defined by any
# cell visible in this notebook.
print('[Selection @1 accuracy]')
print('First: {}/{} = {:.4f}'.format(first_corr, len(eval_ids), first_corr / len(eval_ids)))
print('Expectation(random): {:.4f}/{} = {:.4f}'.format(expect_corr, len(eval_ids), expect_corr / len(eval_ids)))
print()
print('[Selection ratsql prediction score]')
print('First: {:.4f}/{} = {:.4f}'.format(first_score_sum, len(eval_ids), first_score_sum / len(eval_ids)))
print('Expectation(random): {:.4f}/{} = {:.4f}'.format(expect_score_sum, len(eval_ids), expect_score_sum / len(eval_ids)))
print('Oracle: {:.4f}/{} = {:.4f}'.format(oracle_score_sum, len(eval_ids), oracle_score_sum / len(eval_ids)))
print()
print('[Selection ratsql prediction exact]')
print('First: {:.4f}/{} = {:.4f}'.format(first_exact_sum, len(eval_ids), first_exact_sum / len(eval_ids)))
print('Expectation(random): {:.4f}/{} = {:.4f}'.format(expect_exact_sum, len(eval_ids), expect_exact_sum / len(eval_ids)))
print('Oracle: {:.4f}/{} = {:.4f}'.format(oracle_exact_sum, len(eval_ids), oracle_exact_sum / len(eval_ids)))
print()
print('[BLEU score]')
print(f'First: {bleu_first:.4f}')
print(f'Oracle: {bleu_oracle:.4f}')
print(f'All: {bleu_all:.4f}')


[Selection @1 accuracy]
First: 70/100 = 0.7000
Expectation(random): 69.3730/100 = 0.6937

[Selection ratsql prediction score]
First: 67.7936/100 = 0.6779
Expectation(random): 67.3080/100 = 0.6731
Oracle: 79.7952/100 = 0.7980

[Selection ratsql prediction exact]
First: 35.0000/100 = 0.3500
Expectation(random): 35.0992/100 = 0.3510
Oracle: 49.0000/100 = 0.4900

[BLEU score]
First: 0.6934
Oracle: 0.7070
All: 0.5973


### Analysis

In [57]:
original_dev_fname = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/dev.json'

In [58]:
with open(original_dev_fname, 'r') as f:
    original_dev = json.load(f)
len(original_dev)

1034

In [59]:
[(i, d['question']) for i, d in enumerate(original_dev) if '2 or more friends' in d['question']]

[(911,
  'What are the names of high schoolers who have a grade of over 5 and have 2 or more friends?')]

In [60]:
[(i, p['score_preds'], p['question']) for i, p in enumerate(predicts) if p['original_id'] == 911]

[(3030,
  0.003166439477354288,
  'What are the names of high schoolers who have a great of over five and have two or more friends ?'),
 (3031,
  1.1134460464745644e-08,
  "What are the names of high schoolers who have a great of over five and have to ? We're more friends ."),
 (3032,
  0.00015574961435049772,
  'What are the names of high schoolers who have a great of over five and have two were more friends ?'),
 (3033,
  3.5951459722127765e-05,
  'what are the names of high schoolers who have a great of over five and have to wear more friends ?'),
 (3034,
  1.1044487280287285e-07,
  "What are the names of high schoolers who have a great of over five and have two ? We're more friends ."),
 (3035,
  0.00010368470248067752,
  'What are the names of high schoolers who have a great of over five and have to worm or friends ?'),
 (3036,
  1.2194815326438402e-06,
  'what are the names of high schoolers who have a great of over five and have to were more friends ?'),
 (3037,
  0.000285907473

In [178]:
print('\n'.join([str((i, p['score_preds'], p['question'])) for i, p in enumerate(predicts) if p['original_id'] == 911]))

(2355, 0.49413105845451355, 'What are the names of high schoolers who have a great of over five and have two or more friends ?')
(2356, 0.4690669775009155, "What are the names of high schoolers who have a great of over five and have to ? We're more friends .")
(2357, 0.47279447317123413, 'What are the names of high schoolers who have a great of over five and have two were more friends ?')
(2358, 0.4957186281681061, 'what are the names of high schoolers who have a great of over five and have to wear more friends ?')
(2359, 0.4678690731525421, "What are the names of high schoolers who have a great of over five and have two ? We're more friends .")
(2360, 0.46797066926956177, 'What are the names of high schoolers who have a great of over five and have to worm or friends ?')
(2361, 0.4689886271953583, 'what are the names of high schoolers who have a great of over five and have to were more friends ?')
(2362, 0.4720613658428192, 'What are the names of high schoolers who have a great of over

In [151]:
# Distinct original example ids that have at least one prediction.
{p['original_id'] for p in predicts}

{45,
 46,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 56,
 58,
 59,
 60,
 62,
 64,
 67,
 68,
 72,
 73,
 74,
 76,
 77,
 78,
 79,
 82,
 84,
 85,
 179,
 180,
 181,
 183,
 184,
 185,
 186,
 188,
 195,
 196,
 197,
 198,
 200,
 201,
 202,
 206,
 207,
 208,
 209,
 210,
 211,
 212,
 214,
 217,
 218,
 219,
 220,
 225,
 226,
 227,
 228,
 231,
 235,
 237,
 239,
 240,
 241,
 246,
 247,
 248,
 249,
 251,
 252,
 253,
 254,
 255,
 257,
 258,
 299,
 301,
 302,
 303,
 304,
 305,
 308,
 309,
 310,
 311,
 312,
 313,
 314,
 315,
 316,
 317,
 319,
 320,
 321,
 322,
 323,
 324,
 325,
 327,
 328,
 333,
 335,
 338,
 339,
 345,
 347,
 348,
 349,
 350,
 351,
 353,
 354,
 355,
 356,
 359,
 360,
 361,
 362,
 363,
 364,
 367,
 368,
 369,
 370,
 371,
 372,
 375,
 377,
 378,
 379,
 380,
 381,
 382,
 384,
 385,
 386,
 387,
 388,
 389,
 391,
 392,
 394,
 395,
 396,
 397,
 398,
 399,
 400,
 401,
 402,
 403,
 405,
 406,
 410,
 412,
 413,
 414,
 415,
 416,
 417,
 418,
 420,
 421,
 423,
 424,
 425,
 428,
 429,
 430,
 431,
 432,
 

### AllenNLP features test

In [513]:
TokenIndexer.list_available()

['single_id',
 'openai_transformer_byte_pair',
 'dependency_label',
 'ner_tag',
 'pos_tag',
 'characters',
 'elmo_characters',
 'bert-pretrained',
 'spacy',
 'pretrained_transformer']

In [195]:
instance_sentence = train_dataset[0]['sentence']
vars(instance_sentence)

{'tokens': [listx,
  thex,
  namex,
  bornx,
  statex,
  andx,
  agex,
  ofx,
  thex,
  headsx,
  ofx,
  departmentsx,
  orderedx,
  byx,
  agex,
  .x],
 '_token_indexers': {'bert': <allennlp.data.token_indexers.wordpiece_indexer.PretrainedBertIndexer at 0x152f1e750>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None,
 '_token_index_to_indexer_name': None}

In [196]:
signature(instance_sentence.index)

<Signature (vocab: allennlp.data.vocabulary.Vocabulary)>

In [197]:
bert_indexer = token_indexers['bert']
bert_indexer.__dict__.keys()

dict_keys(['_token_min_padding_length', 'vocab', 'wordpiece_tokenizer', '_namespace', '_added_to_vocabulary', 'max_pieces', 'use_starting_offsets', '_do_lowercase', '_truncate_long_sequences', '_warned_about_truncation', '_never_lowercase', '_start_piece_ids', '_end_piece_ids', '_separator_ids'])

In [198]:
signature(bert_indexer.as_padded_tensor)

<Signature (tokens: Dict[str, List[int]], desired_num_tokens: Dict[str, int], padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]>

In [199]:
vocab = Vocabulary.from_instances(train_dataset + dev_dataset)

100%|██████████| 15/15 [00:00<00:00, 82456.83it/s]


In [200]:
vars(vocab)

{'_padding_token': '@@PADDING@@',
 '_oov_token': '@@UNKNOWN@@',
 '_non_padded_namespaces': {'*labels', '*tags'},
 '_token_to_index': _TokenToIndexDefaultDict(None, {}),
 '_index_to_token': _IndexToTokenDefaultDict(None, {}),
 '_retained_counter': {}}

In [203]:
# For a sentence, token_indexers: {Bert: 1-dim; Word: 1-dim; Char: 2-dim}
instance_sentence.index(vocab)
vars(instance_sentence)

{'tokens': [listx,
  thex,
  namex,
  bornx,
  statex,
  andx,
  agex,
  ofx,
  thex,
  headsx,
  ofx,
  departmentsx,
  orderedx,
  byx,
  agex,
  .x],
 '_token_indexers': {'bert': <allennlp.data.token_indexers.wordpiece_indexer.PretrainedBertIndexer at 0x152f1e750>},
 '_indexed_tokens': {'bert': [101,
   2862,
   2595,
   1996,
   2595,
   2171,
   2595,
   2141,
   2595,
   2110,
   2595,
   1998,
   2595,
   2287,
   2595,
   1997,
   2595,
   1996,
   2595,
   4641,
   2595,
   1997,
   2595,
   7640,
   2595,
   3641,
   2595,
   2011,
   2595,
   2287,
   2595,
   1012,
   2595,
   102],
  'bert-offsets': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32],
  'bert-type-ids': [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  'mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 '_indexer_name_to_indexed_token': 

### BERT test

In [74]:
# ...
bert_model_name = 'bert-base-uncased'
bert_config = BertConfig.from_pretrained(bert_model_name, finetuning_task='reranking')
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
bert_model = BertModel.from_pretrained(bert_model_name, from_tf=False, config=bert_config)
# ...

In [75]:
bert_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [87]:
TokenIndexer.list_available()

['single_id',
 'openai_transformer_byte_pair',
 'dependency_label',
 'ner_tag',
 'pos_tag',
 'characters',
 'elmo_characters',
 'bert-pretrained',
 'spacy',
 'pretrained_transformer']

In [85]:
TokenIndexer.by_name('bert-pretrained')

allennlp.data.token_indexers.wordpiece_indexer.PretrainedBertIndexer

In [97]:
TokenIndexer.by_name('bert-pretrained')('bert-base-uncased')

<allennlp.data.token_indexers.wordpiece_indexer.PretrainedBertIndexer at 0x15bab9490>

In [None]:
# utils_bert_pa.convert_examples_to_features() <- need to modify this (labels?)

In [90]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [92]:
tokenizer.tokenize('A quick brown foxx jumped over the old dogg, ohhh')

['a', 'quick', 'brown', 'fox', '##x', 'jumped', 'over', 'the', 'old', 'dogg']

In [455]:
token_indexer = TokenIndexer.by_name('bert-pretrained')('bert-base-uncased')

In [456]:
token_indexer.__dict__.keys()

dict_keys(['_token_min_padding_length', 'vocab', 'wordpiece_tokenizer', '_namespace', '_added_to_vocabulary', 'max_pieces', 'use_starting_offsets', '_do_lowercase', '_truncate_long_sequences', '_warned_about_truncation', '_never_lowercase', '_start_piece_ids', '_end_piece_ids', '_separator_ids'])

In [508]:
tokens = [Token(w) for w in 'A quick brown foxx jumped over the old dogg , ohhh [SEP] another sentence & another sentence'.split(' ')]
tokens, len(tokens)

([A,
  quick,
  brown,
  foxx,
  jumped,
  over,
  the,
  old,
  dogg,,
  ohhh,
  [SEP],
  another,
  sentence,
  &,
  another,
  sentence],
 16)

In [499]:
token_indexed = token_indexer.tokens_to_indices(tokens, vocab, 'bert')
token_indexed_tensor = {k : torch.LongTensor([v]) for k, v in token_indexed.items()}
token_indexed_tensor

{'bert': tensor([[  101,  1037,  4248,  2829,  4419,  2595,  5598,  2058,  1996,  2214,
          28844,   102,  2178,  6251,  1004,  2178,  6251,   102]]),
 'bert-offsets': tensor([[ 1,  2,  3,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]]),
 'bert-type-ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]),
 'mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [507]:
id2w = vocab.get_index_to_token_vocabulary('bert')
[(id2w[token_indexed['bert'][i]], token_indexed['bert-type-ids'][i]) for i in range(len(token_indexed['bert']))]

[('[CLS]', 0),
 ('a', 0),
 ('quick', 0),
 ('brown', 0),
 ('fox', 0),
 ('##x', 0),
 ('jumped', 0),
 ('over', 0),
 ('the', 0),
 ('old', 0),
 ('dogg', 0),
 ('[SEP]', 0),
 ('another', 1),
 ('sentence', 1),
 ('&', 1),
 ('another', 1),
 ('sentence', 1),
 ('[SEP]', 1)]

In [501]:
embedding = bert_embedder(token_indexed_tensor)
embedding.size()

torch.Size([1, 15, 768])

In [502]:
token_indexed_tensor['bert'].size()

torch.Size([1, 18])

### Schema test

In [7]:
# Load the schemas from the same tables.json the dataset readers use
# (`tables_json_fname`, set in the dataset-reader cell) instead of re-typing
# the absolute path here.
schema = read_dataset_schema(tables_json_fname)
schema

{'perpetrator': {'people': <__main__.Table at 0x13b3b10d0>,
  'perpetrator': <__main__.Table at 0x13b3b1110>},
 'college_2': {'prereq': <__main__.Table at 0x13b3b1850>,
  'classroom': <__main__.Table at 0x13b3b1890>,
  'department': <__main__.Table at 0x13b3b1a50>,
  'course': <__main__.Table at 0x13b3b1c10>,
  'instructor': <__main__.Table at 0x13b3b1e50>,
  'section': <__main__.Table at 0x13b3b40d0>,
  'teaches': <__main__.Table at 0x13b3b44d0>,
  'student': <__main__.Table at 0x13b3b47d0>,
  'takes': <__main__.Table at 0x13b3b4a10>,
  'advisor': <__main__.Table at 0x13b3b4d90>,
  'time_slot': <__main__.Table at 0x13b3b4ed0>},
 'flight_company': {'flight': <__main__.Table at 0x13b3b6590>,
  'airport': <__main__.Table at 0x13b3b65d0>,
  'operate_company': <__main__.Table at 0x13b3b6950>},
 'icfp_1': {'Authorship': <__main__.Table at 0x13b3b80d0>,
  'Inst': <__main__.Table at 0x13b3b8110>,
  'Authors': <__main__.Table at 0x13b3b82d0>,
  'Papers': <__main__.Table at 0x13b3b8490>},
 'bod

In [8]:
schema['perpetrator']

{'people': <__main__.Table at 0x13b3b10d0>,
 'perpetrator': <__main__.Table at 0x13b3b1110>}

In [9]:
vars(schema['perpetrator']['perpetrator'])

{'name': 'perpetrator',
 'text': 'perpetrator',
 'columns': [<__main__.TableColumn at 0x13b3b1190>,
  <__main__.TableColumn at 0x13b3b1210>,
  <__main__.TableColumn at 0x13b3b1290>,
  <__main__.TableColumn at 0x13b3b1310>,
  <__main__.TableColumn at 0x13b3b1390>,
  <__main__.TableColumn at 0x13b3b1450>,
  <__main__.TableColumn at 0x13b3b14d0>,
  <__main__.TableColumn at 0x13b3b1550>]}

In [10]:
# Attribute dict of every column of the 'perpetrator' table in the 'perpetrator' database.
_perpetrator_columns = schema['perpetrator']['perpetrator'].columns
[vars(column) for column in _perpetrator_columns]

[{'name': 'perpetrator_id',
  'text': 'perpetrator id',
  'column_type': 'number',
  'is_primary_key': True,
  'foreign_key': None},
 {'name': 'people_id',
  'text': 'people id',
  'column_type': 'number',
  'is_primary_key': False,
  'foreign_key': 'people:people_id'},
 {'name': 'date',
  'text': 'date',
  'column_type': 'text',
  'is_primary_key': False,
  'foreign_key': None},
 {'name': 'year',
  'text': 'year',
  'column_type': 'number',
  'is_primary_key': False,
  'foreign_key': None},
 {'name': 'location',
  'text': 'location',
  'column_type': 'text',
  'is_primary_key': False,
  'foreign_key': None},
 {'name': 'country',
  'text': 'country',
  'column_type': 'text',
  'is_primary_key': False,
  'foreign_key': None},
 {'name': 'killed',
  'text': 'killed',
  'column_type': 'number',
  'is_primary_key': False,
  'foreign_key': None},
 {'name': 'injured',
  'text': 'injured',
  'column_type': 'number',
  'is_primary_key': False,
  'foreign_key': None}]

In [11]:
# Flatten one database schema into a token list of the form
# "<table> : <column> , <column> . <table> : ..." and show it as one string.
_db_id = 'perpetrator'
_db_tokens = []
for table in schema[_db_id].values():
    table_tokens = [table.text, ':']
    for column in table.columns:
        table_tokens += [column.text, ',']
    # The last token of each table (normally the trailing ',') becomes the terminating '.'.
    table_tokens[-1] = '.'
    _db_tokens += table_tokens
' '.join(_db_tokens)

'people : people id , name , height , weight , home town . perpetrator : perpetrator id , people id , date , year , location , country , killed , injured .'

In [214]:
# Print every database whose token list from dbToTokens has exactly 225 tokens.
# NOTE(review): 225 is a magic number here — presumably a length observed
# elsewhere (e.g. the longest schema); confirm and name it.
for db_id, db_tables in schema.items():
    db_tokens = dbToTokens(db_tables)
    if len(db_tokens) != 225:
        continue
    print(db_id, db_tokens)

department_store ['staff', 'department', 'assignments', ':', 'staff', 'id', ',', 'department', 'id', ',', 'date', 'assigned', 'from', ',', 'job', 'title', 'code', ',', 'date', 'assigned', 'to', '.', 'addresses', ':', 'address', 'id', ',', 'address', 'details', '.', 'staff', ':', 'staff', 'id', ',', 'staff', 'gender', ',', 'staff', 'name', '.', 'suppliers', ':', 'supplier', 'id', ',', 'supplier', 'name', ',', 'supplier', 'phone', '.', 'department', 'store', 'chain', ':', 'department', 'store', 'chain', 'id', ',', 'department', 'store', 'chain', 'name', '.', 'customers', ':', 'customer', 'id', ',', 'payment', 'method', 'code', ',', 'customer', 'code', ',', 'customer', 'name', ',', 'customer', 'address', ',', 'customer', 'phone', ',', 'customer', 'email', '.', 'products', ':', 'product', 'id', ',', 'product', 'type', 'code', ',', 'product', 'name', ',', 'product', 'price', '.', 'supplier', 'addresses', ':', 'supplier', 'id', ',', 'address', 'id', ',', 'date', 'from', ',', 'date', 'to', '.

### Test

In [27]:
# Print each line of the dev split's questions_diff_tags.txt next to the
# matching line of db_id.txt, separated by a tab.
# The directory is derived from `dataset_reranker_dir` (set in the
# dataset-reader cell) instead of re-typing the absolute path.
dir_path = os.path.join(dataset_reranker_dir, 'dev')
question_diff_fname = os.path.join(dir_path, 'questions_diff_tags.txt')
db_id_fname = os.path.join(dir_path, 'db_id.txt')

with open(question_diff_fname, 'r') as f1, open(db_id_fname, 'r') as f2:
    # zip stops at the shorter file, so extra lines in either file are not shown.
    for l1, l2 in zip(f1, f2):
        # Strip both sides: the db id line still carries its newline, which
        # previously added a blank line after every printed row.
        print(l1.strip() + '\t' + l2.strip())

how many singers do we have ?	0 0 0 0 0 0 0	concert_singer

What is the total number of singers ?	0 0 0 0 0 0 0 0	concert_singer

show name Country age for all singers ordered by age from the oldest to the youngest .	0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0	concert_singer

what are the names , countries and ages for every singer in descending order of age ?	0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0	concert_singer

what is the average minimum and maximum age of all singers from France ?	0 0 0 0 0 0 0 0 0 0 0 0 0 0	concert_singer

what is the average minimum and maximum age for all French singers ?	0 0 0 0 0 0 0 0 0 0 0 0 0	concert_singer

show the name and the release year of the song by the youngest singer .	0 0 0 0 0 0 0 0 0 0 0 0 0 0 0	concert_singer

whatever names and release years for all the songs of the youngest singer .	1 0 0 0 0 0 0 0 0 0 0 0 0 0	concert_singer

what are all distinct countries where singers above age 20 year from	0 0 0 0 0 0 0 0 0 0 1 0	concert_singer

what are the different

In [36]:
np.hstack([np.array(1), np.array(2)])

array([1, 2])

In [207]:
# Scratch check: dict.update merges the entries of `b` into `a` in place.
a = dict(a=1)
b = dict(b=2)
a.update(b)
a

{'a': 1, 'b': 2}

In [76]:
tuple([slice(0, x) for x in (68, 0)])

(slice(0, 68, None), slice(0, 0, None))

In [120]:
# Scratch check: view(-1) flattens a 3x3 integer tensor into a 1-D tensor.
_grid_rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
a = torch.LongTensor(np.array(_grid_rows))
a.view(-1)

tensor([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [259]:
json.loads('{"dataset_reader": {"type": "spider_ASR_reranker_reader_v2", "token_indexers": {"bert": {"type": "bert-pretrained", "pretrained_model": "bert-buncased"}}, "tables_json_fname": "SPIDER_DIR/tables.json", "dataset_reranker_dir": "SPIDER_DIR/my", "max_sequence_len": 300, "debug": true}}')


{'dataset_reader': {'type': 'spider_ASR_reranker_reader_v2',
  'token_indexers': {'bert': {'type': 'bert-pretrained',
    'pretrained_model': 'bert-buncased'}},
  'tables_json_fname': 'SPIDER_DIR/tables.json',
  'dataset_reranker_dir': 'SPIDER_DIR/my',
  'max_sequence_len': 300,
  'debug': True}}

In [61]:
import sys

# Names IPython itself puts in the namespace (plus this list), excluded from the report.
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Every public, non-module name of the notebook namespace ...
_user_names = [
    x for x in dir()
    if not x.startswith('_') and x not in sys.modules and x not in ipython_vars
]
# ... paired with the sys.getsizeof of the object it refers to, largest first.
# sys.getsizeof does not follow references, so containers report only their own size.
_name_sizes = [(x, sys.getsizeof(globals().get(x))) for x in _user_names]
_name_sizes.sort(key=lambda pair: pair[1], reverse=True)
_name_sizes

[('predicts', 26744),
 ('original_dev', 9032),
 ('eval_ids', 4856),
 ('eval_score_pairs', 4856),
 ('golds', 4856),
 ('hyp_list', 4856),
 ('ref_list', 4856),
 ('Activation', 1064),
 ('Attention', 1064),
 ('BasicTextFieldEmbedder', 1064),
 ('BertConfig', 1064),
 ('BertModel', 1064),
 ('BertPreTrainedModel', 1064),
 ('BertTokenizer', 1064),
 ('BucketBatchSampler', 1064),
 ('CategoricalAccuracy', 1064),
 ('CnnEncoder', 1064),
 ('CosineMatrixAttention', 1064),
 ('DataLoader', 1064),
 ('DatasetReader', 1064),
 ('Embedding', 1064),
 ('GradientDescentTrainer', 1064),
 ('LinearMatrixAttention', 1064),
 ('MSELoss', 1064),
 ('MatrixAttention', 1064),
 ('MeanAbsoluteError', 1064),
 ('Model', 1064),
 ('ModuleList', 1064),
 ('Params', 1064),
 ('Predictor', 1064),
 ('PretrainedTransformerEmbedder', 1064),
 ('PretrainedTransformerMismatchedEmbedder', 1064),
 ('PytorchSeq2SeqWrapper', 1064),
 ('PytorchSeq2VecWrapper', 1064),
 ('SentenceTaggerPredictor', 1064),
 ('Seq2SeqEncoder', 1064),
 ('Seq2SeqPredi

In [382]:
%load_ext memory_profiler

In [383]:
%memit

peak memory: 891.11 MiB, increment: 0.40 MiB
