In [1]:
from typing import Iterator, List, Dict, Optional, cast
import torch
import torch.optim as optim
from torch.nn import MSELoss, CosineEmbeddingLoss
from torch.nn import functional as F
from torch.nn import ModuleList

import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, ArrayField, MetadataField, ListField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding, TokenEmbedder
from allennlp.modules.token_embedders.pretrained_transformer_embedder import PretrainedTransformerEmbedder
from allennlp.modules.token_embedders.pretrained_transformer_mismatched_embedder import PretrainedTransformerMismatchedEmbedder
# from allennlp.modules.seq2seq_encoders.multi_head_self_attention import MultiHeadSelfAttention
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.seq2vec_encoders.cnn_encoder import CnnEncoder
from allennlp.modules.attention import Attention
from allennlp.modules.matrix_attention.matrix_attention import MatrixAttention
from allennlp.modules.matrix_attention.linear_matrix_attention import LinearMatrixAttention
from allennlp.modules.matrix_attention.cosine_matrix_attention import CosineMatrixAttention
from allennlp.modules.matrix_attention.bilinear_matrix_attention import BilinearMatrixAttention

from allennlp.modules.conditional_random_field import allowed_transitions, ConditionalRandomField

from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits, \
    get_device_of, masked_softmax, weighted_sum, \
    get_mask_from_sequence_lengths, get_lengths_from_binary_sequence_mask, tensors_equal, \
    batched_span_select

from allennlp.training.metrics import BooleanAccuracy, CategoricalAccuracy, MeanAbsoluteError, Average
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.dataloader import DataLoader, PyTorchDataLoader
from allennlp.training.trainer import GradientDescentTrainer
# from allennlp.predictors import Predictor, Seq2SeqPredictor, SimpleSeq2SeqPredictor, SentenceTaggerPredictor
from allennlp.predictors import Predictor, SentenceTaggerPredictor
from allennlp.nn.activations import Activation
from allennlp.common.tqdm import Tqdm
from allennlp.common.params import Params
from allennlp.common.util import JsonDict, sanitize

from allennlp_models.generation.predictors import Seq2SeqPredictor
from allennlp_models.generation.models.simple_seq2seq import SimpleSeq2Seq
from allennlp_models.generation.modules.seq_decoders.seq_decoder import SeqDecoder
from allennlp_models.generation.modules.decoder_nets.decoder_net import DecoderNet

from allennlp.common.util import START_SYMBOL, END_SYMBOL

from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

# from spacy.tokenizer import Tokenizer as SpacyTokenizer
# from spacy.lang.en import English
# nlp = English()
# Create a blank Tokenizer with just the English vocab
# tokenizer = Tokenizer(nlp.vocab)

from tqdm.notebook import tqdm

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import ShortTermFeatures

import os
import sys
import itertools
import json
from collections import defaultdict, OrderedDict
from inspect import signature
import warnings
import pickle
from copy import copy, deepcopy
from overrides import overrides
import importlib
import string

import matplotlib.pyplot as plt

from transformers import BertPreTrainedModel, BertModel, BertConfig, BertTokenizer

from utils.spider import process_sql, evaluation
from utils.schema_gnn.spider_utils import Table, TableColumn, read_dataset_schema
from utils.misc_utils import Postprocess_rewrite_seq

import dataset_readers, models

from dataset_readers.reader_utils import extractAudioFeatures, extractAudioFeatures_NoPooling, dbToTokens, \
    read_DB, Get_align_tags
# from dataset_readers.reranker_reader_legacy import SpiderASRRerankerReaderV1, SpiderASRRerankerReaderV2
# from dataset_readers.reranker_reader import SpiderASRRerankerReaderV2_Siamese
from modules.encoder import SpeakQLEncoder, SpeakQLEncoderV1
# from models.reranker import SpiderASRRerankerV0, SpiderASRRerankerV1, SpiderASRRerankerV2, SpiderASRReranker_Siamese
# from predictors.reranker_predictor import SpiderASRRerankerPredictor, SpiderASRRerankerPredictor_Siamese

# from dataset_readers.rewriter_s2s_tabert_reader import SpiderASRRewriterReader_Seq2seq_TaBERT
# from models.rewriter_s2s_tabert import SpiderASRRewriter_Seq2seq_TaBERT 
# from predictors.rewriter_predictor import SpiderASRRewriterPredictor_Tagger_ILM, SpiderASRRewriterPredictor_Seq2seq
from dataset_readers import SpiderASRRewriterReader_ILM_Combined_new, SpiderASRRewriterReader_Seq2seq_Combined_new
from models import SpiderASRRewriter_ILM_Combined_new, SpiderASRRewriter_Seq2seq_Combined_new

# import dataset_readers.rewriter_reader
# import models.rewriter
# import predictors.rewriter_predictor


# Seed every RNG that is already imported in this notebook so that sampling and
# shuffling repeat across kernel restarts. torch is seeded last so the cell
# still displays the torch Generator as its output.
np.random.seed(1)
torch.manual_seed(1)

<torch._C.Generator at 0x10f938f90>

In [2]:
# importlib.reload(dataset_readers.rewriter_s2s_tabert_reader)
# importlib.reload(models.rewriter_s2s_tabert)
# # importlib.reload(models.rewriter)
# # importlib.reload(predictors.rewriter_predictor)

# from dataset_readers.rewriter_s2s_tabert_reader import SpiderASRRewriterReader_Seq2seq_TaBERT
# from models.rewriter_s2s_tabert import SpiderASRRewriter_Seq2seq_TaBERT 
# # from models.rewriter import SpiderASRRewriter_Tagger_ILM, SpiderASRRewriter_Seq2seq
# # from predictors.rewriter_predictor import SpiderASRRewriterPredictor_Tagger_ILM, SpiderASRRewriterPredictor_Seq2seq

In [3]:
# Audio feature widths. AUDIO_DIM_NO_POOLING is the embedding_dim handed to the
# audio CNN encoder further down.
# NOTE(review): presumably these equal the vector sizes returned by
# extractAudioFeatures_NoPooling / extractAudioFeatures — confirm in reader_utils.
AUDIO_DIM_NO_POOLING = 68
AUDIO_DIM = 136

## Tagger-ILM

### Tagger-ILM - Dataset Reader

In [5]:
# from dataset_readers import SpiderASRRewriterReader_ILM_Combined
# from models import SpiderASRRewriter_ILM_Combined

In [12]:
# Root of the local Spider data. Set the SPIDER_DATA_ROOT environment variable
# to run this notebook on another machine; the default is the location the
# notebook was originally written against.
spider_root = os.environ.get('SPIDER_DATA_ROOT',
                             '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider')
tables_json_fname = os.path.join(spider_root, 'tables.json')
dataset_dir = os.path.join(spider_root, 'my')
databases_dir = os.path.join(spider_root, 'database')

# Source-side indexers: two word-piece ("mismatched") transformer indexers plus
# a character indexer writing into the "token_characters" namespace.
# NOTE(review): the src_text_embedder cell further down only builds "bert" and
# "char" embedders; a BasicTextFieldEmbedder may reject the extra "t5" key —
# confirm before training with this indexer set.
src_token_indexers = {
    "bert": TokenIndexer.by_name('pretrained_transformer_mismatched')('bert-base-uncased'),
    "t5": TokenIndexer.by_name('pretrained_transformer_mismatched')('t5-base'),
    "char": TokenIndexer.by_name('characters')(namespace="token_characters", min_padding_length=5)
}
# Target-side tokens get plain single ids in their own "tgt_tokens" namespace.
tgt_token_indexers = {'tgt_tokens': SingleIdTokenIndexer(namespace='tgt_tokens')}

# NOTE(review): samples_limit=3 presumably caps how much data is read so the
# notebook stays fast — confirm its exact meaning in the reader.
dataset_reader = SpiderASRRewriterReader_ILM_Combined_new(tables_json_fname=tables_json_fname,
                                                    dataset_dir=dataset_dir,
                                                    databases_dir=databases_dir,
                                                    src_token_indexers=src_token_indexers,
                                                    tgt_token_indexers=tgt_token_indexers,
                                                    samples_limit=3)

train_dataset = dataset_reader.read('train')

dev_dataset = dataset_reader.read('dev')


HBox(children=(IntProgress(value=1, bar_style='info', description='reading instances', max=1, style=ProgressSt…

Loading literals failed: wta_1::players
['first_name', 'last_name', 'hand', 'country_code']
Could not decode to UTF-8 column 'last_name' with text 'Treyes Albarrac��N'





HBox(children=(IntProgress(value=1, bar_style='info', description='reading instances', max=1, style=ProgressSt…

Loading literals failed: wta_1::players
['first_name', 'last_name', 'hand', 'country_code']
Could not decode to UTF-8 column 'last_name' with text 'Treyes Albarrac��N'





In [13]:
# Show which fields the reader attached to the first training instance.
train_dataset[0].fields

{'sentence': <allennlp.data.fields.text_field.TextField at 0x147ad8e10>,
 'text_mask': <allennlp.data.fields.array_field.ArrayField at 0x14e814b40>,
 'schema_mask': <allennlp.data.fields.array_field.ArrayField at 0x14e814c80>,
 'schema_column_ids': <allennlp.data.fields.array_field.ArrayField at 0x14e814d20>,
 'audio_feats': <allennlp.data.fields.list_field.ListField at 0x149578b90>,
 'audio_mask': <allennlp.data.fields.array_field.ArrayField at 0x14e814cd0>,
 'source_to_target': <allennlp.data.fields.namespace_swapping_field.NamespaceSwappingField at 0x14e810e60>,
 'rewriter_tags': <allennlp.data.fields.sequence_label_field.SequenceLabelField at 0x14a3523b0>,
 'rewrite_seq': <allennlp.data.fields.text_field.TextField at 0x14e810eb0>,
 'source_token_ids': <allennlp.data.fields.array_field.ArrayField at 0x14e810c80>,
 'target_token_ids': <allennlp.data.fields.array_field.ArrayField at 0x14e810cd0>,
 'metadata': <allennlp.data.fields.metadata_field.MetadataField at 0x145ba8f50>}

In [16]:
# Target side of one sample: the tokens held by its 'rewrite_seq' field.
train_dataset[10].fields['rewrite_seq'].tokens

[@start@, are, [ANS], CCTV, [ANS], ?, [ANS], @end@]

In [17]:
# Source side of the same sample: the tokens held by its 'sentence' field.
train_dataset[10].fields['sentence'].tokens

[which,
 channels,
 air,
 not,
 owned,
 by,
 CC,
 TV,
 .,
 Give,
 me,
 the,
 channel,
 names,
 .,
 [SEP],
 broadcast,
 :,
 channel,
 id,
 ,,
 program,
 id,
 ,,
 time,
 of,
 day,
 .,
 broadcast,
 share,
 :,
 channel,
 id,
 ,,
 program,
 id,
 ,,
 date,
 ,,
 share,
 in,
 percent,
 .,
 channel,
 :,
 channel,
 id,
 ,,
 name,
 ,,
 owner,
 ,,
 share,
 in,
 percent,
 ,,
 rating,
 in,
 percent,
 .,
 program,
 :,
 program,
 id,
 ,,
 name,
 ,,
 origin,
 ,,
 launch,
 ,,
 owner,
 .]

In [18]:
# Build a single vocabulary over the train and dev instances; the bare name on
# the last line displays its namespace summary.
all_instances = train_dataset + dev_dataset
vocab = Vocabulary.from_instances(all_instances)
vocab

HBox(children=(IntProgress(value=0, description='building vocab', max=41, style=ProgressStyle(description_widt…




Vocabulary with namespaces:  token_characters, Size: 54 || rewriter_tags, Size: 7 || tgt_tokens, Size: 28 || Non Padded Namespaces: {'*labels', '*tags'}

In [None]:
# Display the index -> token mapping of the "token_characters" namespace.
vocab.get_index_to_token_vocabulary("token_characters")

#### Exp - Tokenization, Indexer

In [39]:
# Take a copy of one training instance for the indexing experiments below and
# display its fields.
# NOTE(review): copy() is shallow, so the copy presumably shares its Field
# objects with train_dataset[20]; indexing the copy would then also touch the
# dataset's fields — confirm whether full isolation is intended.
instance = copy(train_dataset[20])
instance.fields

{'sentence': <allennlp.data.fields.text_field.TextField at 0x14ea0cbe0>,
 'text_mask': <allennlp.data.fields.array_field.ArrayField at 0x14ea0cb90>,
 'schema_mask': <allennlp.data.fields.array_field.ArrayField at 0x1a75d0f50>,
 'schema_column_ids': <allennlp.data.fields.array_field.ArrayField at 0x1b0b5d050>,
 'audio_feats': <allennlp.data.fields.list_field.ListField at 0x14b854190>,
 'audio_mask': <allennlp.data.fields.array_field.ArrayField at 0x182bb6730>,
 'source_to_target': <allennlp.data.fields.namespace_swapping_field.NamespaceSwappingField at 0x1a75defa0>,
 'rewriter_tags': <allennlp.data.fields.sequence_label_field.SequenceLabelField at 0x14a47a290>,
 'rewrite_seq': <allennlp.data.fields.text_field.TextField at 0x1a75d9050>,
 'source_token_ids': <allennlp.data.fields.array_field.ArrayField at 0x1a75d90f0>,
 'target_token_ids': <allennlp.data.fields.array_field.ArrayField at 0x1a75d9140>,
 'metadata': <allennlp.data.fields.metadata_field.MetadataField at 0x14b854e10>}

In [40]:
# Index the instance's fields against vocab; the following cells read the
# result through TextField._indexed_tokens.
instance.index_fields(vocab)

In [41]:
# First five source tokens, for comparison with the indexer outputs below.
instance.fields['sentence'].tokens[:5]

[What, are, the, chip, model]

In [42]:
# Names of the index arrays stored under the "bert" indexer key.
instance.fields['sentence']._indexed_tokens['bert'].keys()

dict_keys(['token_ids', 'mask', 'type_ids', 'offsets', 'wordpiece_mask'])

In [43]:
# First five entries of the "bert" token_ids array.
instance.fields['sentence']._indexed_tokens['bert']['token_ids'][:5]

[101, 2054, 2024, 1996, 9090]

In [44]:
# First five entries of the "bert" type_ids array.
instance.fields['sentence']._indexed_tokens['bert']['type_ids'][:5]

[0, 0, 0, 0, 0]

In [45]:
# Run the "bert" indexer directly on the first five source tokens to see the
# complete dictionary it produces for a short input.
first_five_tokens = instance.fields['sentence'].tokens[:5]
bert_indexer = src_token_indexers['bert']
bert_indexer.tokens_to_indices(first_five_tokens, vocabulary=vocab)


{'token_ids': [101, 2054, 2024, 1996, 9090, 2944, 102],
 'mask': [True, True, True, True, True],
 'type_ids': [0, 0, 0, 0, 0, 0, 0],
 'offsets': [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5)],
 'wordpiece_mask': [True, True, True, True, True, True, True]}

In [22]:
# Names of the index arrays stored under the "t5" indexer key.
instance.fields['sentence']._indexed_tokens['t5'].keys()

dict_keys(['token_ids', 'mask', 'type_ids', 'offsets', 'wordpiece_mask'])

In [24]:
# First five token ids and type ids stored under the "t5" indexer key,
# displayed together as a tuple.
t5_indexed = instance.fields['sentence']._indexed_tokens['t5']
t5_indexed['token_ids'][:5], t5_indexed['type_ids'][:5]

([125, 130, 8, 336, 3056], [0, 0, 0, 0, 0])

In [25]:
# Run the "t5" indexer directly on the first five source tokens to see the
# complete dictionary it produces for a short input.
first_five_tokens = instance.fields['sentence'].tokens[:5]
t5_indexer = src_token_indexers['t5']
t5_indexer.tokens_to_indices(first_five_tokens, vocabulary=vocab)


{'token_ids': [125, 130, 8, 336, 3056],
 'mask': [True, True, True, True, True],
 'type_ids': [0, 0, 0, 0, 0],
 'offsets': [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)],
 'wordpiece_mask': [True, True, True, True, True]}

In [31]:
# Names of the index arrays stored under the "char" indexer key.
instance.fields['sentence']._indexed_tokens['char'].keys()

dict_keys(['token_characters'])

In [33]:
# Character-id lists of the first five tokens (one inner list per token).
instance.fields['sentence']._indexed_tokens['char']['token_characters'][:5]

[[18, 8, 25],
 [13, 4, 10, 17],
 [13, 2, 13, 24, 2, 7, 9],
 [6, 8, 2, 9],
 [3, 18, 2]]

In [47]:
# Check how the tokenizer behind the "bert" indexer splits and cases a
# mixed-case string.
src_token_indexers['bert']._tokenizer.tokenize('AbcdE FG')

['abc', '##de', 'f', '##g']

#### Exp - T5 indexer / embedder

In [9]:
# Stand-alone "mismatched" transformer indexer for t5-base, used by the T5
# tokenization experiment in the next cell.
mismatched_indexer_cls = TokenIndexer.by_name('pretrained_transformer_mismatched')
t5_token_indexer = mismatched_indexer_cls('t5-base')

In [11]:
# Check how the tokenizer behind the T5 indexer splits an out-of-vocabulary string.
t5_token_indexer._tokenizer.tokenize('abcdef')

['▁ab', 'c', 'de', 'f']

In [28]:
# Copy the first training instance and index its fields against vocab.
# NOTE(review): copy() is shallow, so the Field objects are presumably shared
# with train_dataset[0] — confirm that this is acceptable.
instance = copy(train_dataset[0])
instance.index_fields(vocab)

In [29]:
# Names of the index arrays stored under the "t5" indexer key for this instance.
instance.fields['sentence']._indexed_tokens['t5'].keys()

dict_keys(['token_ids', 'mask', 'type_ids', 'offsets', 'wordpiece_mask'])

In [30]:
# Full output of the "t5" indexer for the first five source tokens of this instance.
src_token_indexers['t5'].tokens_to_indices(instance.fields['sentence'].tokens[:5], vocabulary=vocab)

{'token_ids': [125, 130, 8, 336, 3056],
 'mask': [True, True, True, True, True],
 'type_ids': [0, 0, 0, 0, 0],
 'offsets': [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)],
 'wordpiece_mask': [True, True, True, True, True]}

### Tagger-ILM - Model

In [25]:
# Rebuild the vocabulary from the train and dev instances before constructing
# the model. This repeats the build in the dataset-reader section; the recorded
# namespace sizes differ between the two runs, so the datasets were evidently
# reloaded in between.
vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
vocab

HBox(children=(IntProgress(value=0, description='building vocab', max=111, style=ProgressStyle(description_wid…




Vocabulary with namespaces:  token_characters, Size: 56 || rewriter_tags, Size: 8 || tgt_tokens, Size: 60 || Non Padded Namespaces: {'*labels', '*tags'}

In [30]:
# Display the index -> character mapping that the character embedding will use.
vocab.get_index_to_token_vocabulary("token_characters")

{0: '@@PADDING@@',
 1: '@@UNKNOWN@@',
 2: 'e',
 3: 't',
 4: 'a',
 5: 'i',
 6: 's',
 7: 'n',
 8: 'r',
 9: 'd',
 10: 'o',
 11: ',',
 12: 'c',
 13: 'm',
 14: 'l',
 15: 'p',
 16: 'h',
 17: 'u',
 18: 'g',
 19: '.',
 20: ':',
 21: 'y',
 22: 'f',
 23: 'b',
 24: 'w',
 25: 'v',
 26: 'k',
 27: 'S',
 28: 'z',
 29: 'P',
 30: 'E',
 31: '[',
 32: ']',
 33: 'x',
 34: '1',
 35: '?',
 36: 'j',
 37: "'",
 38: '2',
 39: 'W',
 40: 'A',
 41: 'I',
 42: 'U',
 43: 'T',
 44: '3',
 45: 'B',
 46: 'V',
 47: '8',
 48: '4',
 49: '9',
 50: 'C',
 51: 'H',
 52: 'N',
 53: 'L',
 54: 'F',
 55: 'O'}

In [31]:
# Hyperparameters of the Tagger-ILM rewriter.

# Embedding widths
SRC_EMB_DIM = 768   # BERT
TGT_EMB_DIM = 300
TAG_EMB_DIM = 64
CHAR_EMB_DIM = 128
AUDIO_ENC_DIM = 128

# Encoder / decoder / feed-forward widths
ENCODER_DIM = 256
DECODER_DIM = ENCODER_DIM  # tied to the encoder: it seems not to work otherwise
TAGGING_FF_DIM = 64

In [36]:
# Source-side embedder, assembled piece by piece. The pieces are created in the
# same order as a single nested expression would create them, so random
# initialisation is unaffected.

# Word-level branch: BERT over word pieces, pooled back to the original tokens.
bert_token_embedder = TokenEmbedder.by_name("pretrained_transformer_mismatched")("bert-base-uncased")

# Character-level branch: embed each character, then a CNN reduces every token
# to one vector. "character_encoding" is TokenCharactersEncoder (a subclass of
# TokenEmbedder).
char_embedding = Embedding(embedding_dim=CHAR_EMB_DIM,
                           vocab_namespace="token_characters",
                           num_embeddings=vocab.get_vocab_size('token_characters'))
char_cnn = Seq2VecEncoder.by_name("cnn")(
    embedding_dim=CHAR_EMB_DIM,
    num_filters=4,
    ngram_filter_sizes=[2, 3, 4, 5],
    output_dim=CHAR_EMB_DIM,
)
char_token_embedder = TokenEmbedder.by_name("character_encoding")(
    embedding=char_embedding,
    encoder=char_cnn,
    dropout=0.0,
)

src_text_embedder = BasicTextFieldEmbedder(
    token_embedders={"bert": bert_token_embedder, "char": char_token_embedder}
)

# Target-side embedding is a bare Embedding rather than a text-field embedder.
# Alternative kept for reference:
# tgt_text_embedder = BasicTextFieldEmbedder(
#         token_embedders={
#             "tgt_tokens": Embedding(
#                 embedding_dim=TGT_EMB_DIM,
#                 num_embeddings=vocab.get_vocab_size('tgt_tokens')
#             )
#         })
tgt_text_embedder = Embedding(num_embeddings=vocab.get_vocab_size('tgt_tokens'),
                              embedding_dim=TGT_EMB_DIM)

In [37]:
# Embedding for rewriter tags; its size is taken from the 'rewriter_tags'
# namespace of vocab instead of being passed explicitly.
tag_embedder = Embedding(
    vocab=vocab,
    vocab_namespace='rewriter_tags',
    embedding_dim=TAG_EMB_DIM,
)
# Explicit-size alternative kept for reference:
# tag_embedder = Embedding(embedding_dim=TAG_EMB_DIM,
#                          num_embeddings=vocab.get_vocab_size('rewriter_tags'))

In [38]:
# Auto-regressive decoder producing the rewrite sequence, built inside-out:
# attention -> LSTM-cell decoder net -> sequence decoder.
decoder_attention = Attention.by_name('bilinear')(DECODER_DIM, ENCODER_DIM)

decoder_net = DecoderNet.by_name('lstm_cell')(
    decoding_dim=DECODER_DIM,
    target_embedding_dim=TGT_EMB_DIM,
    attention=decoder_attention,
)

rewrite_decoder = SeqDecoder.by_name('auto_regressive_seq_decoder')(
    vocab=vocab,
    decoder_net=decoder_net,
    max_decoding_steps=100,
    target_embedder=tgt_text_embedder,
    target_namespace='tgt_tokens',
    beam_size=4,
)

In [48]:
# CNN that reduces a sequence of audio feature vectors to a single vector.
audio_s2v = CnnEncoder(
    embedding_dim=AUDIO_DIM_NO_POOLING,
    num_filters=4,
    ngram_filter_sizes=(2, 3, 4, 5),
    output_dim=AUDIO_ENC_DIM,
)

# The sequence encoder's input width is the sum of the BERT, audio, tag and
# character widths.
encoder_input_dim = SRC_EMB_DIM + AUDIO_ENC_DIM + TAG_EMB_DIM + CHAR_EMB_DIM
lstm_s2s = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(encoder_input_dim, ENCODER_DIM, batch_first=True)
)

# TODO: use s2s & s2v, instead of multilayer s2v, since we need sequence representations here

encoder_with_tags = SpeakQLEncoderV1(
    audio_attention_layer=CosineMatrixAttention(),
    audio_attention_residual='+',
    seq2seq_encoders=[lstm_s2s],
    seq2vec_encoder=None,
)

# Earlier variants, kept for reference:
# lstm_s2s_no_tags = PytorchSeq2SeqWrapper(torch.nn.LSTM(SRC_EMB_DIM + AUDIO_ENC_DIM, ENCODER_DIM, batch_first=True))
# lstm_s2s_with_tags = PytorchSeq2SeqWrapper(torch.nn.LSTM(SRC_EMB_DIM + AUDIO_ENC_DIM + TAG_EMB_DIM, ENCODER_DIM, batch_first=True))
# lstm_s2v_no_tags = PytorchSeq2VecWrapper(torch.nn.LSTM(ENCODER_DIM, ENCODER_DIM, batch_first=True))
# lstm_s2v_with_tags = PytorchSeq2VecWrapper(torch.nn.LSTM(ENCODER_DIM, ENCODER_DIM, batch_first=True))
# encoder_no_tags = SpeakQLEncoderV1(
#     audio_attention_layer=CosineMatrixAttention(),
#     audio_attention_residual='+',
#     seq2seq_encoders=[lstm_s2s_no_tags],
#     seq2vec_encoder=lstm_s2v_no_tags
# )

In [49]:
# Earlier separate tagger + ILM model, kept for reference:
# tagger_ILM_model = SpiderASRRewriter_Tagger_ILM(
#     src_text_embedder=src_text_embedder,
#     tag_embedder=tag_embedder,
#     bert_pretrained_model='bert-base-uncased',
#     audio_seq2vec_encoder=audio_s2v,
#     encoder_no_tags=encoder_no_tags,
#     encoder_with_tags=encoder_with_tags,
#     rewrite_decoder=rewrite_decoder,
#     ff_dimension=TAGGING_FF_DIM,
#     concat_audio=True,
#     vocab=vocab
# )

# Assemble the combined ILM rewriter from the components built above.
# The class used here is the one the import cell actually brings into scope
# (SpiderASRRewriter_ILM_Combined_new); the un-suffixed name is only imported in
# a commented-out line, so it raises NameError after a kernel restart.
# NOTE(review): the keyword arguments are unchanged — confirm that the _new
# class accepts the same constructor signature.
ILM_model = SpiderASRRewriter_ILM_Combined_new(
    src_text_embedder=src_text_embedder,
    rewriter_tag_embedder=tag_embedder,
    use_tabert=False,
    audio_seq2vec_encoder=audio_s2v,
    encoder=encoder_with_tags,
    rewrite_decoder=rewrite_decoder,
    concat_audio=True,
    vocab=vocab
)

self._start_index: 3, @start@
self._end_index: 4, @end@
self._pad_index: 0, @@PADDING@@


In [50]:
# Smoke-test training run: one epoch on CPU with plain SGD.

# Attach the vocabulary to both splits.
for split in (train_dataset, dev_dataset):
    split.index_with(vocab)

batch_size = 8
train_data_loader = PyTorchDataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_data_loader = PyTorchDataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

optimizer = optim.SGD(ILM_model.parameters(), lr=0.01)

trainer_config = dict(
    model=ILM_model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    patience=1,
    num_epochs=1,
    grad_norm=0.1,
    cuda_device=-1,
)
trainer = GradientDescentTrainer(**trainer_config)

# The returned metrics dictionary is the cell's displayed output.
trainer.train()

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




{'best_epoch': 0,
 'peak_worker_0_memory_MB': 5002.682368,
 'training_duration': '0:01:38.061549',
 'training_start_epoch': 0,
 'training_epochs': 0,
 'epoch': 0,
 'training_rewrite_seq_NLL': 4.076723575592041,
 'training_loss': 4.076723575592041,
 'training_worker_0_memory_MB': 5002.682368,
 'validation_rewrite_seq_NLL': 4.064238548278809,
 'validation_rewrite_seq_BLEU': 5.293757876399208e-14,
 'validation_loss': 4.064238548278809,
 'best_validation_rewrite_seq_NLL': 4.064238548278809,
 'best_validation_rewrite_seq_BLEU': 5.293757876399208e-14,
 'best_validation_loss': 4.064238548278809}

In [57]:
# Inspect the character-level indices stored for the first dev instance's sentence.
dev_dataset[0].fields['sentence']._indexed_tokens['char']

{'token_characters': [[14, 5, 6, 3],
  [3, 16, 2],
  [12, 10, 6, 3],
  [10, 22],
  [2, 4, 12, 16],
  [3, 8, 2, 4, 3, 13, 2, 7, 3],
  [5, 7],
  [3, 16, 2],
  [12, 10, 8, 8, 2, 6, 15, 10, 7, 9, 5, 7, 18],
  [3, 8, 2, 4, 3, 13, 2, 7, 3],
  [3, 21, 15, 2],
  [9, 2, 6, 12, 8, 5, 15, 3, 5, 10, 7],
  [19],
  [31, 27, 30, 29, 32],
  [23, 8, 2, 2, 9, 6],
  [20],
  [23, 8, 2, 2, 9],
  [12, 10, 9, 2],
  [11],
  [23, 8, 2, 2, 9],
  [7, 4, 13, 2],
  [19],
  [12, 16, 4, 8, 18, 2, 6],
  [20],
  [12, 16, 4, 8, 18, 2],
  [5, 9],
  [11],
  [12, 16, 4, 8, 18, 2],
  [3, 21, 15, 2],
  [11],
  [12, 16, 4, 8, 18, 2],
  [4, 13, 10, 17, 7, 3],
  [19],
  [9, 10, 18, 6],
  [20],
  [9, 10, 18],
  [5, 9],
  [11],
  [10, 24, 7, 2, 8],
  [5, 9],
  [11],
  [4, 23, 4, 7, 9, 10, 7, 2, 9],
  [21, 2, 6],
  [10, 8],
  [7, 10],
  [11],
  [23, 8, 2, 2, 9],
  [12, 10, 9, 2],
  [11],
  [6, 5, 28, 2],
  [12, 10, 9, 2],
  [11],
  [7, 4, 13, 2],
  [11],
  [4, 18, 2],
  [11],
  [9, 4, 3, 2],
  [10, 22],
  [23, 5, 8, 3, 16],
  [11

In [None]:
# Load a previously trained model from its archive. The path is relative to the
# notebook's working directory.

tagger_ILM_model = Model.from_archive('runs/2.0.1/model.tar.gz')

### Tagger-ILM - Predictor

In [None]:
# Option considered: a second dataset_reader that leaves the gold tags and rewrite_seq out of the instances.

# Decision: no. The fields should be removed in the predictor instead, because we might still want to
# evaluate the rewrite-seq predictions given oracle tagging, or even with teacher forcing.

In [17]:
# Read the 'test' split with the same reader and show how many instances it yields.
test_dataset = dataset_reader.read('test')
len(test_dataset)

HBox(children=(IntProgress(value=1, bar_style='info', description='reading instances', max=1, style=ProgressSt…




59

In [18]:
# Build a new Instance from a copy of the first test instance's field dict and
# display its fields. dict.copy() is shallow: the Field objects themselves are
# shared with test_dataset[0].
_test_instance = Instance(test_dataset[0].fields.copy())
_test_instance.fields

{'sentence': <allennlp.data.fields.text_field.TextField at 0x1770fa460>,
 'text_mask': <allennlp.data.fields.array_field.ArrayField at 0x1770fadc0>,
 'schema_mask': <allennlp.data.fields.array_field.ArrayField at 0x1770faaa0>,
 'schema_column_ids': <allennlp.data.fields.array_field.ArrayField at 0x1770fa0a0>,
 'audio_feats': <allennlp.data.fields.list_field.ListField at 0x189ba2090>,
 'audio_mask': <allennlp.data.fields.array_field.ArrayField at 0x1770fa5a0>,
 'rewriter_tags': <allennlp.data.fields.sequence_label_field.SequenceLabelField at 0x14205a830>,
 'rewrite_seq': <allennlp.data.fields.text_field.TextField at 0x1770eb2d0>,
 'metadata': <allennlp.data.fields.metadata_field.MetadataField at 0x189ba2f90>}

In [None]:
# Wrap the loaded model and the dataset reader in a predictor.
# NOTE(review): SpiderASRRewriterPredictor_Tagger_ILM is only imported in a
# commented-out line of the import cell, so this cell raises NameError on a
# fresh kernel until that import is restored.
predictor = SpiderASRRewriterPredictor_Tagger_ILM(model=tagger_ILM_model,
                                                  dataset_reader=dataset_reader)

In [None]:
# Run the predictor on the single test instance and display the raw output.
predictor.predict_instance(_test_instance)

In [None]:
# Same prediction as the previous cell, this time kept in a variable.
predictor_output = predictor.predict_instance(_test_instance)

In [264]:
## Intermediate: make a dataset file with tagger predictor output as rewrite_tags, and feed to ILM predictor
test_path = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/my/dev/test_rewriter.json'
tagger_output_path = '/Users/mac/Desktop/syt/Deep-Learning/Projects-M/SpeakQL/SpeakQL/Allennlp_models/outputs/local-test/output-2.2tL.json'

# The test set is stored as one JSON document.
with open(test_path, 'r') as f:
    test_dataset_json = json.load(f)

# The tagger output is parsed line by line, one JSON object per line. Blank
# lines (for example a trailing newline at the end of the file) are skipped
# instead of crashing json.loads.
with open(tagger_output_path, 'r') as f:
    tagger_output_jsons = [json.loads(line) for line in f if line.strip()]

# Display both sizes as a quick sanity check.
len(test_dataset_json), len(tagger_output_jsons)


(547, 86)

In [265]:
# Group both sources by 'original_id' so they can be matched up in the next cell.
orig_test_samples_by_oid = defaultdict(list)
tagger_output_by_oid = defaultdict(list)

# Tagger outputs form a flat list; each record carries its own original_id.
for output in tagger_output_jsons:
    tagger_output_by_oid[output['original_id']].append(output)

# Each test-set entry is a list of candidates that must all share one
# original_id; empty entries are ignored.
for candidates in test_dataset_json:
    if not candidates:
        continue

    group_id = candidates[0]['original_id']
    for candidate in candidates:
        assert candidate['original_id'] == group_id
        orig_test_samples_by_oid[group_id].append(candidate)

len(orig_test_samples_by_oid), len(tagger_output_by_oid)

(547, 13)

In [266]:
# Attach the tagger's predicted tags to the matching test samples. Only ids
# that have tagger output are kept. The sample dicts are modified in place.
tagger_output_test_dataset = []

for oid, outputs in tagger_output_by_oid.items():
    samples = orig_test_samples_by_oid[oid]
    assert len(samples) == len(outputs)

    merged = []
    for sample, output in zip(samples, outputs):
        question_toks = sample['question_toks']
        # Sanity checks: same question text, and same gold tags over the question span.
        assert ' '.join(question_toks) == output['question']
        n_toks = len(question_toks)
        assert sample['rewriter_tags'][:n_toks] == output['gold_tags'][:n_toks]

        sample['tagger_predicted_rewriter_tags'] = output['tags_prediction']
        merged.append(sample)

    tagger_output_test_dataset.append(merged)

len(tagger_output_test_dataset)

13

In [269]:
# Keys of one merged sample; 'tagger_predicted_rewriter_tags' is the key added
# by the previous cell.
tagger_output_test_dataset[0][0].keys()

dict_keys(['db_id', 'query', 'query_toks', 'query_toks_no_value', 'question', 'question_toks', 'sql', 'span_ranges', 'original_id', 'ratsql_pred_sql', 'gold_question', 'gold_question_toks', 'ratsql_pred_score', 'question_toks_edit_distance', 'alignment_span_pairs', 'alignment_text_pairs', 'rewriter_tags', 'rewriter_edits', 'tagger_predicted_rewriter_tags'])

In [272]:
# output_test_path = '/Users/mac/Desktop/syt/Deep-Learning/Projects-M/SpeakQL/SpeakQL/Allennlp_models/outputs/local-test/test-rewriter-2.2tL.json'

# with open(output_test_path, 'w') as f:
#     json.dump(tagger_output_test_dataset, f, indent=4)

#### Tagger-ILM - Eval (moved to ratsql-infer.ipynb)

### Align tags analysis 


In [4]:
# Load the raw training JSON for the tag analysis and display its length.
train_json_path = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/my/train/train_rewriter.json'

with open(train_json_path, 'r') as train_json_file:
    train_json_list = json.load(train_json_file)

len(train_json_list)

7000

In [5]:
# Run Get_align_tags over every training entry, with a progress bar.
# The return value is discarded; presumably the helper adds an 'align_tags'
# entry to each candidate in place (later cells read c['align_tags']) —
# confirm in dataset_readers.reader_utils.
for d in tqdm(train_json_list):
    Get_align_tags(d)

HBox(children=(IntProgress(value=0, max=7000), HTML(value='')))




In [29]:
# Print the rewriter tags and the align tags of the first candidate of the
# first training entry, one sequence per line, for a side-by-side look.
c = train_json_list[0][0]
print(c['rewriter_tags'])
print(c['align_tags'])

['O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'U-EDIT']
['[SAME]', '[SAME]', '[SAME]', '[SAME]', '[SAME]', '[SAME]', '[DIFF+0]', '[SAME]', '[SAME]', '[SAME]', '[PUNCT]']


In [8]:
# Collect the distinct rewriter tags and align tags over the training data,
# then store each inventory as a sorted list.
rewriter_tag_set = set()
align_tag_set = set()

for group in tqdm(train_json_list):
    for candidate in group:
        rewriter_tag_set |= set(candidate['rewriter_tags'])
        align_tag_set |= set(candidate['align_tags'])

all_rewriter_tags = sorted(rewriter_tag_set)
all_align_tags = sorted(align_tag_set)
print(all_rewriter_tags)
print(all_align_tags)

HBox(children=(IntProgress(value=0, max=7000), HTML(value='')))


['B-DEL', 'B-EDIT', 'I-DEL', 'I-EDIT', 'L-DEL', 'L-EDIT', 'O-KEEP', 'U-DEL', 'U-EDIT']
['[DIFF+0]', '[DIFF+1]', '[DIFF+2]', '[DIFF+3]', '[DIFF+4]', '[DIFF-1]', '[DIFF-2]', '[DIFF-3]', '[DIFF-4]', '[PUNCT]', '[SAME]']


In [9]:
# Map every tag to its position in the sorted tag list; these positions are the
# row / column indices of the co-occurrence matrix.
rewriter_tag2idx = dict(zip(all_rewriter_tags, range(len(all_rewriter_tags))))
align_tag2idx = dict(zip(all_align_tags, range(len(all_align_tags))))
print(rewriter_tag2idx)
print(align_tag2idx)

{'B-DEL': 0, 'B-EDIT': 1, 'I-DEL': 2, 'I-EDIT': 3, 'L-DEL': 4, 'L-EDIT': 5, 'O-KEEP': 6, 'U-DEL': 7, 'U-EDIT': 8}
{'[DIFF+0]': 0, '[DIFF+1]': 1, '[DIFF+2]': 2, '[DIFF+3]': 3, '[DIFF+4]': 4, '[DIFF-1]': 5, '[DIFF-2]': 6, '[DIFF-3]': 7, '[DIFF-4]': 8, '[PUNCT]': 9, '[SAME]': 10}


In [11]:
# Count how often each (rewriter tag, align tag) pair occurs at the same token
# position. Rows are rewriter tags, columns are align tags.
n_rewriter_tags, n_align_tags = len(all_rewriter_tags), len(all_align_tags)
cooccurrence = np.zeros((n_rewriter_tags, n_align_tags), dtype=int)

for group in tqdm(train_json_list):
    for candidate in group:
        tag_pairs = zip(candidate['rewriter_tags'], candidate['align_tags'])
        for rewriter_tag, align_tag in tag_pairs:
            cooccurrence[rewriter_tag2idx[rewriter_tag], align_tag2idx[align_tag]] += 1

cooccurrence

HBox(children=(IntProgress(value=0, max=7000), HTML(value='')))




array([[    49,     50,     29,     36,     82,     74,     59,     74,
           259,    610,    331],
       [   395,    620,    341,    569,   1048,    748,    643,    904,
          2331,   1365,   5708],
       [    10,     27,     12,     21,     41,     13,     16,     25,
           134,    120,    102],
       [    59,     92,     37,     83,    179,     90,     42,     98,
           225,    366,    781],
       [    19,     41,      7,     36,     54,     58,     38,     86,
           270,    732,    312],
       [   433,    668,    417,    477,    837,    704,    598,    915,
          2925,   1360,   5338],
       [  5834,   9289,   9030,   9418,  17670,   5182,   2168,   2206,
          3017,  30047, 394062],
       [   118,    170,     62,    127,    289,    258,    278,    421,
          1902,   7152,    910],
       [  1600,   2426,   1368,   1810,   2849,   3905,   3578,   5565,
         17624,  12040,  14469]])

In [27]:
# Print the co-occurrence matrix as a text table: a header of align tags
# (left-aligned, width 10), then one row per rewriter tag (counts right-aligned,
# width 10).
header_cells = [format(tag, '<10s') for tag in all_align_tags]
print('\t\t' + ''.join(header_cells))
for i, row_label in enumerate(all_rewriter_tags):
    row_cells = [format(count, '10d') for count in cooccurrence[i]]
    print(row_label + '\t' + ''.join(row_cells))
print()

		[DIFF+0]  [DIFF+1]  [DIFF+2]  [DIFF+3]  [DIFF+4]  [DIFF-1]  [DIFF-2]  [DIFF-3]  [DIFF-4]  [PUNCT]   [SAME]    
B-DEL	        49        50        29        36        82        74        59        74       259       610       331
B-EDIT	       395       620       341       569      1048       748       643       904      2331      1365      5708
I-DEL	        10        27        12        21        41        13        16        25       134       120       102
I-EDIT	        59        92        37        83       179        90        42        98       225       366       781
L-DEL	        19        41         7        36        54        58        38        86       270       732       312
L-EDIT	       433       668       417       477       837       704       598       915      2925      1360      5338
O-KEEP	      5834      9289      9030      9418     17670      5182      2168      2206      3017     30047    394062
U-DEL	       118       170        62       127       289       2

In [25]:
# Collapse the rewriter tags by their action suffix (KEEP / DEL / EDIT) and
# print one summed row of counts per action.
rew_keep_ids, rew_del_ids, rew_edit_ids = (
    [idx for idx, tag in enumerate(all_rewriter_tags) if tag.endswith(suffix)]
    for suffix in ('KEEP', 'DEL', 'EDIT')
)

print('\t' + ''.join(format(tag, '>10s') for tag in all_align_tags))
for label, row_ids in (('KEEP', rew_keep_ids), ('DEL', rew_del_ids), ('EDIT', rew_edit_ids)):
    # Sum the selected rows column-wise.
    totals = cooccurrence[row_ids].sum(0)
    print(label + '\t' + ''.join(format(total, '10d') for total in totals))
print()

	  [DIFF+0]  [DIFF+1]  [DIFF+2]  [DIFF+3]  [DIFF+4]  [DIFF-1]  [DIFF-2]  [DIFF-3]  [DIFF-4]   [PUNCT]    [SAME]
KEEP	      5834      9289      9030      9418     17670      5182      2168      2206      3017     30047    394062
DEL	       196       288       110       220       466       403       391       606      2565      8614      1655
EDIT	      2487      3806      2163      2939      4913      5447      4861      7482     23105     15131     26296



## Seq2seq rewriter

### Seq2seq - Dataset Reader

In [None]:
# Pick up on-disk edits to the seq2seq TaBERT reader and model without
# restarting the kernel. The modules are resolved through import_module first,
# so this also works when the package __init__ has not already imported the
# submodule (plain attribute access on the package would raise AttributeError).
importlib.reload(importlib.import_module('dataset_readers.rewriter_s2s_tabert_reader'))
importlib.reload(importlib.import_module('models.rewriter_s2s_tabert'))

# Re-import the names so they point at the freshly reloaded class objects.
from dataset_readers.rewriter_s2s_tabert_reader import SpiderASRRewriterReader_Seq2seq_TaBERT
from models.rewriter_s2s_tabert import SpiderASRRewriter_Seq2seq_TaBERT

In [16]:
# Machine-specific absolute locations of the Spider data and the TaBERT checkpoint
tables_json_fname = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/tables.json'
dataset_dir = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/my'
databases_dir = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/database'
tabert_model_path = '/Users/mac/Desktop/syt/Deep-Learning/Repos/TaBERT/pretrained-models/tabert_base_k1/model.bin'

# Source side is indexed for a BERT (mismatched) embedder, target side by single ids
# in its own 'tgt_tokens' vocabulary namespace.
_bert_indexer_cls = TokenIndexer.by_name('pretrained_transformer_mismatched')
src_token_indexers = {'bert': _bert_indexer_cls('bert-base-uncased')}
tgt_token_indexers = {'tgt_tokens': SingleIdTokenIndexer(namespace='tgt_tokens')}

_reader_kwargs = dict(
    tables_json_fname=tables_json_fname,
    dataset_dir=dataset_dir,
    databases_dir=databases_dir,
    tabert_model_path=tabert_model_path,
    src_token_indexers=src_token_indexers,
    tgt_token_indexers=tgt_token_indexers,
    debug=True,
)
dataset_reader_s2s = SpiderASRRewriterReader_Seq2seq_TaBERT(**_reader_kwargs)

# Read the train split first, then dev (same order as before)
train_dataset_s2s, dev_dataset_s2s = (
    dataset_reader_s2s.read(_split) for _split in ('train', 'dev')
)

HBox(children=(IntProgress(value=1, bar_style='info', description='reading instances', max=1, style=ProgressSt…




HBox(children=(IntProgress(value=1, bar_style='info', description='reading instances', max=1, style=ProgressSt…




In [None]:
len(train_dataset_s2s), len(dev_dataset_s2s)

In [None]:
list(train_dataset_s2s[0].fields['rewrite_seq_s2s'])

### Seq2seq - Model

In [17]:
vocab = Vocabulary.from_instances(train_dataset_s2s + dev_dataset_s2s)
vocab

HBox(children=(IntProgress(value=0, description='building vocab', max=426, style=ProgressStyle(description_wid…




Vocabulary with namespaces:  tgt_tokens, Size: 328 || Non Padded Namespaces: {'*tags', '*labels'}

In [18]:
# Hyperparameters for the seq2seq rewriter

# --- Embedding sizes ---
SRC_EMB_DIM = 768    # BERT
TGT_EMB_DIM = 300
TAG_EMB_DIM = 64

# --- Encoder-side sizes ---
AUDIO_ENC_DIM = 128
ENCODER_DIM = 256
TAGGING_FF_DIM = 64

# --- Decoder ---
# Tied to the encoder size: per the original note, other values did not seem to work.
DECODER_DIM = ENCODER_DIM

In [19]:
# The source-side embedder is disabled (None) here. The BERT-based variant is kept for reference:
# src_text_embedder = BasicTextFieldEmbedder(
#         token_embedders={
#             "bert": TokenEmbedder.by_name("pretrained_transformer_mismatched")("bert-base-uncased")
#         })
src_text_embedder = None

# Target-side embedding table, one row per entry of the 'tgt_tokens' vocabulary namespace
_tgt_vocab_size = vocab.get_vocab_size('tgt_tokens')
tgt_text_embedder = Embedding(num_embeddings=_tgt_vocab_size,
                              embedding_dim=TGT_EMB_DIM)

In [20]:
# Decoder and attention classes are looked up by their registered names
_decoder_cls = SeqDecoder.by_name('speakql_copynet_seq_decoder')
_decoder_attention = Attention.by_name('cosine')()

rewrite_decoder = _decoder_cls(
    vocab=vocab,
    encoder_output_dim=ENCODER_DIM,
    is_bidirectional_input=True,
    attention=_decoder_attention,
    beam_size=4,
    max_decoding_steps=100,
    target_embedder=tgt_text_embedder,
    target_namespace='tgt_tokens',
)

In [21]:
# Audio seq2vec encoder: CNN with 2- to 5-gram filters
# (presumably applied to the un-pooled audio features, going by AUDIO_DIM_NO_POOLING — confirm)
_audio_cnn_kwargs = dict(
    embedding_dim=AUDIO_DIM_NO_POOLING,
    num_filters=4,
    ngram_filter_sizes=(2, 3, 4, 5),
    output_dim=AUDIO_ENC_DIM,
)
audio_s2v = CnnEncoder(**_audio_cnn_kwargs)

# The seq2seq LSTM input is sized for the source embedding concatenated with the audio encoding
_s2s_input_dim = SRC_EMB_DIM + AUDIO_ENC_DIM
lstm_s2s = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(_s2s_input_dim, ENCODER_DIM, batch_first=True)
)
lstm_s2v = PytorchSeq2VecWrapper(
    torch.nn.LSTM(ENCODER_DIM, ENCODER_DIM, batch_first=True)
)

# TODO: use s2s & s2v, instead of multilayer s2v, since we need sequence representations here

_encoder_kwargs = dict(
    audio_attention_layer=CosineMatrixAttention(),
    audio_attention_residual='+',
    seq2seq_encoders=[lstm_s2s],
    seq2vec_encoder=lstm_s2v,
)
encoder_for_s2s = SpeakQLEncoderV1(**_encoder_kwargs)



In [14]:
# Re-execute the seq2seq reader/model modules so on-disk edits take effect without a kernel
# restart, then re-import the class names so they bind to the reloaded definitions.
for _module in (dataset_readers.rewriter_s2s_tabert_reader, models.rewriter_s2s_tabert):
    importlib.reload(_module)

from dataset_readers.rewriter_s2s_tabert_reader import SpiderASRRewriterReader_Seq2seq_TaBERT
from models.rewriter_s2s_tabert import SpiderASRRewriter_Seq2seq_TaBERT

In [22]:
# Assemble the seq2seq rewriter from the components built in the earlier cells.
# NOTE(review): finetune_tabert=False presumably keeps the TaBERT weights frozen — confirm
# against the model class.
_s2s_model_kwargs = dict(
    src_text_embedder=src_text_embedder,
    tabert_model_path=tabert_model_path,
    finetune_tabert=False,
    audio_seq2vec_encoder=audio_s2v,
    encoder=encoder_for_s2s,
    rewrite_decoder=rewrite_decoder,
    concat_audio=True,
    vocab=vocab,
)
s2s_model = SpiderASRRewriter_Seq2seq_TaBERT(**_s2s_model_kwargs)

In [23]:
train_dataset_s2s.index_with(vocab)
dev_dataset_s2s.index_with(vocab)

In [24]:
train_data_loader = PyTorchDataLoader(train_dataset_s2s, batch_size=8, shuffle=True)
dev_data_loader = PyTorchDataLoader(dev_dataset_s2s, batch_size=8, shuffle=False)

In [None]:
# Smoke test: push every training batch through the model once; outputs are discarded.
# The loader is iterated directly instead of via a bare iterator from `.__iter__()`,
# so tqdm can read its length and show a progress total.
for _b in tqdm(train_data_loader):
    s2s_model(**_b)

In [None]:
# Plain SGD and a single-epoch run with validation on the dev loader;
# cuda_device=-1 is AllenNLP's convention for running on CPU.
optimizer = optim.SGD(params=s2s_model.parameters(), lr=0.01)

_trainer_kwargs = dict(
    model=s2s_model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    patience=1,
    num_epochs=1,
    grad_norm=0.1,
    cuda_device=-1,
)
trainer = GradientDescentTrainer(**_trainer_kwargs)
trainer.train()

### Seq2seq - Predictor

In [None]:
test_dataset_s2s = dataset_reader_s2s.read('test')
len(test_dataset_s2s)

In [None]:
_test_instance = Instance(test_dataset_s2s[0].fields.copy())
_test_instance.fields

In [None]:
predictor = SpiderASRRewriterPredictor_Seq2seq(model=s2s_model,
                                               dataset_reader=dataset_reader_s2s)

In [None]:
predictor.predict_instance(_test_instance)

#### Seq2seq - Analysis (moved to ratsql-infer.ipynb)

### Rewriter preds -> Reranker cands

In [226]:
def Generate_rewrt_cand_span_ranges(
    orig_tokens,
    orig_span_ranges,
    tags,
    ILM_tokens):
    """Derive per-token time spans for a question rewritten by the tagger + ILM.

    KEEP-tagged tokens pass through with their original span, DEL-tagged tokens are
    dropped, and every EDIT span (``U-EDIT`` or ``B-EDIT [I-EDIT ...] L-EDIT``) is replaced
    by the tokens of the next ILM edit. The time covered by an EDIT span is divided among
    its replacement tokens in proportion to their number of non-punctuation characters.
    Replacement tokens made only of punctuation get ``(0, 0)``.

    Args:
        orig_tokens: Tokens of the original question; only used in assertion messages.
        orig_span_ranges: One ``(st, ed)`` pair per original token. Both items are numeric
            strings for a timed token, or ``(0, 0)`` for a token without a time span
            (punctuation).
        tags: Rewriter tag per original token. Only the first ``len(orig_span_ranges)``
            tags are read, so trailing padding tags are ignored.
        ILM_tokens: ILM output, where each edit's tokens are terminated by ``'[ANS]'``.
            Reading stops at ``END_SYMBOL``; tokens not followed by ``'[ANS]'`` are discarded.

    Returns:
        A list of ``(st, ed)`` pairs for the rewritten tokens: the original pair for kept
        tokens, ``(0, 0)`` for untimed tokens, and 4-decimal strings for tokens produced
        by splitting an EDIT span.

    Raises:
        AssertionError: If an EDIT span to be split has a non-positive duration, or if the
            internal bookkeeping is inconsistent.
    """
    # ---- 1. Split the ILM output into one token list per edit ----
    _edits = []
    _curr_edit = []
    for tok in ILM_tokens:
        if tok == '[ANS]':
            _edits.append(_curr_edit)
            _curr_edit = []
        elif tok == END_SYMBOL:  # Allennlp END_SYMBOL
            break
        else:
            _curr_edit.append(tok)

    # ---- 2. Collapse every edit span of the original sequence into one coarse range ----
    _coarse_span_ranges = []  # (st, ed) for each kept token / collapsed edit span
    _is_edit_spans = []       # True for edit spans, False for kept tokens
    _n_edit_spans = 0         # running count of True entries in _is_edit_spans

    # Range of the multi-token edit span being assembled; (0, 0) = no timed token seen yet
    _span_st = 0
    _span_ed = 0

    for i, (st, ed) in enumerate(orig_span_ranges):
        tag = tags[i]
        _is_timed = isinstance(st, str) and isinstance(ed, str)

        if tag.endswith('KEEP'):
            # Not an edit span
            _coarse_span_ranges.append((st, ed))
            _is_edit_spans.append(False)

        elif _n_edit_spans >= len(_edits):
            # Is an edit span, but no more edits available, treat as DEL
            # (TODO: in original processing, should treat as KEEP!)
            pass

        elif tag == 'U-EDIT':
            _coarse_span_ranges.append((st, ed))
            _is_edit_spans.append(True)
            _n_edit_spans += 1

        elif tag == 'B-EDIT':
            # A new span always starts from scratch. Without the reset, an earlier
            # unterminated B-EDIT would leak its time range into this span whenever
            # this token is punctuation.
            if _is_timed:
                _span_st, _span_ed = st, ed
            else:
                _span_st, _span_ed = 0, 0

        elif tag in ('I-EDIT', 'L-EDIT'):
            assert type(_span_st) == type(_span_ed)

            if _is_timed:
                if _span_st == _span_ed == 0:
                    # All previous tokens of this span were punctuation: start here
                    _span_st, _span_ed = st, ed
                else:
                    # The span already has a timed token: only extend its end
                    _span_ed = ed

            if tag == 'L-EDIT':
                # Span complete: emit it and reset the accumulator
                _coarse_span_ranges.append((_span_st, _span_ed))
                _is_edit_spans.append(True)
                _n_edit_spans += 1
                _span_st = 0
                _span_ed = 0

        elif tag.endswith('DEL'):
            pass

        else:
            print('Unknown tag: {}'.format(tag))

    assert _n_edit_spans <= len(_edits)

    # ---- 3. Split each edit span range by number of chars (excluding puncts) ----
    out_span_ranges = []
    _puncts = set(string.punctuation)

    _eid = 0
    for (st, ed), _is_edit in zip(_coarse_span_ranges, _is_edit_spans):
        # st, ed are 0 or str!
        if not _is_edit:
            out_span_ranges.append((st, ed))
            continue

        # Edit span: consume the next edit
        _edit = _edits[_eid]
        _eid += 1

        if st == ed == 0:
            # Original span is punctuation only, so there is no time to distribute
            out_span_ranges.extend([(0, 0)] * len(_edit))
            continue

        _token_lens = [sum(1 for _c in tok if _c not in _puncts) for tok in _edit]
        _total_len = sum(_token_lens)

        if _total_len == 0:
            # All edited tokens are punctuation
            out_span_ranges.extend([(0, 0)] * len(_edit))
            continue

        _unit_time = (float(ed) - float(st)) / _total_len
        assert _unit_time > 0, \
            f'{orig_tokens}\n{orig_span_ranges}\n{tags}\n{ILM_tokens}\n\n{_coarse_span_ranges}\n{_is_edit_spans}'

        # Walk through the span, giving each timed token a slice proportional to its length
        _st = _ed = float(st)
        for _l in _token_lens:
            if _l == 0:
                # Punctuation token inside the edit
                out_span_ranges.append((0, 0))
                continue
            _st = _ed
            _ed = _st + _l * _unit_time
            out_span_ranges.append((f'{_st:.4f}', f'{_ed:.4f}'))
        assert np.isclose(_ed, float(ed)), f'{_ed} != {ed}'

    return out_span_ranges


In [227]:
# Toy example: seven tokens (the 2nd is punctuation without a time span), four edit spans
# in the tags but only three '[ANS]'-terminated edits in the ILM output.
orig_tokens = 'a , c d e fgh i'.split(' ')
orig_span_ranges = [
    ("0.1", "0.2"), (0, 0), ("0.2", "0.3"), ("0.3", "0.5"),
    ("0.5", "0.6"), ("0.7", "1.0"), ("1.0", "1.2"),
]
tags = ['O-KEEP', 'U-EDIT', 'B-EDIT', 'I-EDIT', 'L-EDIT', 'U-EDIT', 'U-EDIT']
ILM_tokens = 'b , [ANS] cde & yu-iop [ANS] f g h [ANS] @end@'.split(' ')

# Rewritten tokens first ...
_demo_rewritten_toks = Postprocess_rewrite_seq(
    tags=tags,
    rewrite_seq=ILM_tokens,
    question_toks=orig_tokens,
)
print(_demo_rewritten_toks)
print()

# ... then the time span assigned to each of them
_demo_span_ranges = Generate_rewrt_cand_span_ranges(orig_tokens, orig_span_ranges, tags, ILM_tokens)
print(_demo_span_ranges)


--- Not enough edits ---
Tags: ['O-KEEP', 'U-EDIT', 'B-EDIT', 'I-EDIT', 'L-EDIT', 'U-EDIT', 'U-EDIT']
Edits: [['b', ','], ['cde', '&', 'yu-iop'], ['f', 'g', 'h']]
['a', 'b', ',', 'cde', '&', 'yu-iop', 'f', 'g', 'h']

[('0.1', '0.2'), (0, 0), (0, 0), ('0.2000', '0.3500'), (0, 0), ('0.3500', '0.6000'), ('0.7000', '0.8000'), ('0.8000', '0.9000'), ('0.9000', '1.0000')]


In [273]:
REWRITER_VERSION = '2.5.0.0t-2.4.0.0i' # Has to be taggerILM 
HUMAN_TEST = True

# Three paths are selected together:
#   rewriter_output_path            - rewriter predictions to read
#   reranker_test_path              - original reranker candidates to read
#   reranker_extra_cands_test_path  - where the merged candidates are written
if HUMAN_TEST:
    # Human-recorded test set
    rewriter_output_path = f'/Users/mac/Desktop/syt/Deep-Learning/Projects-M/SpeakQL/SpeakQL/Allennlp_models/outputs/ratsql-test-save/humantest-yshao-{REWRITER_VERSION}.json'
    reranker_test_path = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/my/dev/human_test/human_test_yshao_reranker.json'
    reranker_extra_cands_test_path = f'/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/my/dev/aggreg_extra_cands/human_test_yshao_reranker_with_{REWRITER_VERSION}.json'
else:
    # Regular test set
    rewriter_output_path = f'/Users/mac/Desktop/syt/Deep-Learning/Projects-M/SpeakQL/SpeakQL/Allennlp_models/outputs/ratsql-test-save/{REWRITER_VERSION}.json'
    reranker_test_path = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/my/dev/test_reranker.json'
    reranker_extra_cands_test_path = f'/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/my/dev/aggreg_extra_cands/test_reranker_with_{REWRITER_VERSION}.json'

    

In [274]:
with open(rewriter_output_path, 'r') as f:
    rewriter_outputs = json.load(f)
with open(reranker_test_path, 'r') as f:
    reranker_test_samples = json.load(f)
    
len(rewriter_outputs), len(reranker_test_samples)

(100, 100)

In [275]:
rewriter_outputs[0][0].keys(), reranker_test_samples[0][0].keys()

(dict_keys(['db_id', 'query', 'query_toks', 'query_toks_no_value', 'question', 'question_toks', 'sql', 'original_id', 'span_ranges', 'ratsql_pred_sql', 'gold_question', 'gold_question_toks', 'ratsql_pred_exact', 'ratsql_pred_score', 'question_toks_edit_distance', 'alignment_span_pairs', 'alignment_text_pairs', 'rewriter_tags', 'rewriter_edits', 'pred_tags', 'pred_ILM', 'pred_ILM_cands', 'rewritten_question', 'pred_sql', 'score', 'exact']),
 dict_keys(['db_id', 'query', 'query_toks', 'query_toks_no_value', 'question', 'question_toks', 'sql', 'original_id', 'span_ranges', 'ratsql_pred_sql', 'gold_question', 'gold_question_toks', 'ratsql_pred_exact', 'ratsql_pred_score', 'question_toks_edit_distance']))

In [276]:
# Append the rewriter's ILM candidates to each reranker sample as extra candidates.
# This mutates `reranker_test_samples` in place; it is written so that re-running the cell
# neither duplicates candidates nor loses the `from_rewriter` flag of earlier additions.
for d_out, d_orig in zip(rewriter_outputs, reranker_test_samples):
    # Questions already present in this sample, used to skip duplicate rewrites
    _question_set = {' '.join(c['question_toks']) for c in d_orig}

    for c in d_orig:
        # setdefault (not plain assignment): candidates added by a previous run of this
        # cell must keep from_rewriter == True.
        c.setdefault('from_rewriter', False)

    _add_cands = []
    for i in range(1):
        # Add rewrites from c_out
        # (Currently only 1st cands are predicted)

        c_out = d_out[i]
        c_orig = d_orig[i]

        # for k in c_orig:
        #     assert c_out[k] == c_orig[k], f'{k}, {c_out[k]}, {c_orig[k]}'

        for _pred_ILM_cand in c_out['pred_ILM_cands']:
            _rewritten_question_toks = Postprocess_rewrite_seq(
                tags=c_out['pred_tags'],
                rewrite_seq=_pred_ILM_cand,
                question_toks=c_out['question_toks'],
            )

            _rewritten_question = ' '.join(_rewritten_question_toks)
            if _rewritten_question in _question_set:
                # This cand already exists
                continue

            _rewrt_span_ranges = Generate_rewrt_cand_span_ranges(
                orig_tokens=c_out['question_toks'],
                orig_span_ranges=c_out['span_ranges'],
                tags=c_out['pred_tags'],
                ILM_tokens=_pred_ILM_cand,
            )

            # Both post-processing routines must agree on the rewritten length
            assert len(_rewrt_span_ranges) == len(_rewritten_question_toks), \
                f"{_rewrt_span_ranges}\n{_rewritten_question_toks}"

            # Start from the original candidate and blank out everything that was
            # computed for the un-rewritten question.
            c_add = deepcopy(c_orig)
            c_add['question'] = _rewritten_question
            c_add['question_toks'] = _rewritten_question_toks
            c_add['span_ranges'] = _rewrt_span_ranges
            c_add['ratsql_pred_sql'] = None
            c_add['ratsql_pred_exact'] = None
            c_add['ratsql_pred_score'] = None
            c_add['question_toks_edit_distance'] = None
            c_add['from_rewriter'] = True

            _add_cands.append(c_add)
            _question_set.add(c_add['question'])

    d_orig.extend(_add_cands)

--- Not enough edits ---
Tags: ['O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'B-EDIT', 'I-EDIT', 'I-EDIT', 'I-EDIT', 'L-EDIT', 'U-EDIT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Edits: [['US', 'museum', '?']]
--- Not enough edits ---
Tags: ['O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'O-KEEP', 'U-EDIT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Edits: []


In [277]:
len(rewriter_outputs[3]), len(reranker_test_samples[3])

(7, 10)

In [278]:
[' '.join(c['question_toks']) for c in rewriter_outputs[3]]

['What is the idea off the path owned by the student whose last name is Smith ?',
 "What is the idea off the path owned by the student who's last name is Smith .",
 'What is the idea off the path owned by the students whose last name is Smith ?',
 'What is the idea off the pad owned by the student whose last name is Smith .',
 "What is the idea off the pad owned by the student who's last name is Smith .",
 'What is the idea off the pad owned by the students whose last name is Smith ?',
 "What is the idea off the path owned by the students who's last name is Smith ."]

In [279]:
[' '.join(c['question_toks']) for c in reranker_test_samples[3]]

['What is the idea off the path owned by the student whose last name is Smith ?',
 "What is the idea off the path owned by the student who's last name is Smith .",
 'What is the idea off the path owned by the students whose last name is Smith ?',
 'What is the idea off the pad owned by the student whose last name is Smith .',
 "What is the idea off the pad owned by the student who's last name is Smith .",
 'What is the idea off the pad owned by the students whose last name is Smith ?',
 "What is the idea off the path owned by the students who's last name is Smith .",
 'What is the id off the path owned by the student whose last name is Smith ?',
 'What is the the off the path owned by the student whose last name is Smith ?',
 'What is the and off the path owned by the student whose last name is Smith ?']

In [280]:
with open(reranker_extra_cands_test_path, 'w') as f:
    json.dump(reranker_test_samples, f, indent=4)

In [282]:
# Sanity check on the merged candidates: every span is either the (0, 0) "no time span"
# placeholder or at least 1e-4 long.
for d in reranker_test_samples:
    for c in d:
        _cand_spans = c['span_ranges']
        for st, ed in _cand_spans:
            _is_placeholder = (st == ed == 0)
            assert _is_placeholder or (float(ed) - float(st) >= 1e-4), _cand_spans

### Temp

In [None]:
instance = copy(train_dataset[0])
instance.fields

In [None]:
instance.index_fields(vocab)

In [None]:
list(zip(instance.fields['tags'].labels, instance.fields['tags']._indexed_labels))

In [None]:
instance.fields['sentence']._indexed_tokens

In [None]:
list(zip(instance.fields['rewrite_seq'].tokens, instance.fields['rewrite_seq']._indexed_tokens['tgt_tokens']['tokens']))

In [None]:
instance.fields['rewrite_seq']._indexed_tokens

In [None]:
Instance.add_field

In [None]:
instance = Instance(test_dataset[0].fields.copy())
instance.fields

In [None]:
len(instance.fields['rewrite_seq'].tokens)

In [None]:
l = instance.fields['rewrite_seq'].tokens
len(l)

In [None]:
l[0].text

In [None]:
a = torch.LongTensor([1])[0]
b = torch.LongTensor([2])[0]
a, b

In [None]:
torch.LongTensor([a, b])

In [None]:
schemas = read_dataset_schema('/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/tables.json')
len(schemas)

In [None]:
schemas['perpetrator']

In [None]:
[(c.name, c.text) for c in schemas['perpetrator']['people'].columns]

In [None]:
# schema_gnn.spider.utils.read_dataset_values
# Reads up to 5000 rows of each listed table from one Spider sqlite database.
# `conn` / `cursor` are deliberately left open: the following cells keep querying `cursor`.

import sqlite3

db_id = 'perpetrator'
dataset_path = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/database'
tables = ["perpetrator", "people"]

db = os.path.join(dataset_path, db_id, db_id + ".sqlite")
try:
    conn = sqlite3.connect(db)
except Exception as e:
    # Chain the original error so its traceback is not lost
    raise Exception(f"Can't connect to SQL: {e} in path {db}") from e
conn.text_factory = str
cursor = conn.cursor()

values = {}

for table in tables:
    try:
        cursor.execute(f"SELECT * FROM {table} LIMIT 5000")
        values[table] = cursor.fetchall()
    except Exception:
        # Retry with latin-1 decoding of text columns, on a fresh cursor.
        # `except Exception` rather than a bare `except`, so Ctrl-C is not swallowed.
        conn.text_factory = lambda x: str(x, 'latin1')
        cursor = conn.cursor()
        cursor.execute(f"SELECT * FROM {table} LIMIT 5000")
        values[table] = cursor.fetchall()

In [None]:
values

In [None]:
cursor.execute(f"SELECT * FROM sqlite_master where type='table'")
cursor.fetchall()

In [None]:
cursor.execute(f"SELECT * FROM people LIMIT 5000")
[d[0] for d in cursor.description]

In [None]:
cursor.execute('PRAGMA TABLE_INFO(people)')
cursor.fetchall()

In [None]:
[d[0] for d in cursor.description]

In [None]:
d = OrderedDict({'4': 4, '2': 2, '0': 0})
d

In [None]:
d2 = OrderedDict(sorted(d.items(), key=lambda x : x[1] % 3))
d2

In [None]:
d2.values()

In [None]:
torch.zeros([0, 5])

In [None]:
str(None)

In [391]:
# CosineEmbeddingLoss: toy inputs
# Two batches of two 4-d vectors, shifted by +1
arr1 = 1.0 + np.random.randn(2, 4)
arr2 = 1.0 + np.random.randn(2, 4)
tensor1, tensor2 = (torch.tensor(_arr, dtype=torch.float32) for _arr in (arr1, arr2))

# Pairwise targets: +1 on the diagonal, -1 off the diagonal
_pair_targets = 2 * np.eye(2) - np.ones((2, 2))
y = torch.tensor(_pair_targets, dtype=tensor1.dtype, device=tensor1.device)
tensor1, tensor2, y

(tensor([[1.2787, 1.1783, 2.0894, 0.1343],
         [2.4704, 1.2044, 1.3239, 1.3491]]),
 tensor([[-0.0047,  0.9896,  0.2813,  2.1478],
         [ 0.7637,  2.0747,  0.1790, -1.4233]]),
 tensor([[ 1., -1.],
         [-1.,  1.]]))

In [385]:
y.diag()

tensor([1., 1.], dtype=torch.float64)

In [386]:
tensor1.size(), tensor2.size()

(torch.Size([2, 4]), torch.Size([2, 4]))

In [387]:
cos_loss = CosineEmbeddingLoss(margin=0, reduction='mean')

In [388]:
t1 = tensor1.unsqueeze(0).expand(2, 2, 4).reshape(4, 4)
t2 = tensor2.unsqueeze(1).expand(2, 2, 4).reshape(4, 4)
y_ = y.view(4)
t1, t2, y

(tensor([[2.4972, 2.0322, 1.4485, 1.8441],
         [0.4712, 1.2000, 1.9665, 1.2432],
         [2.4972, 2.0322, 1.4485, 1.8441],
         [0.4712, 1.2000, 1.9665, 1.2432]]),
 tensor([[1.8994, 1.5220, 1.3646, 0.9790],
         [1.8994, 1.5220, 1.3646, 0.9790],
         [1.2207, 0.7970, 1.5540, 0.8197],
         [1.2207, 0.7970, 1.5540, 0.8197]]),
 tensor([[ 1., -1.],
         [-1.,  1.]], dtype=torch.float64))

In [389]:
cos_loss(t1, t2, y_)

tensor(0.4648)

In [390]:
# scipy's `cosine` is the cosine *distance* (1 - cosine similarity), hence the alias
from scipy.spatial.distance import cosine as cos_dist

# Manual re-computation of CosineEmbeddingLoss(margin=0, reduction='mean') over the 2x2 pairs:
#   target +1 (same index):       1 - cos_sim          == cos_dist
#   target -1 (different index):  max(0, cos_sim - 0)  == max(0, 1 - cos_dist)
# Without the clamp the check only agrees with torch when both mismatched pairs
# happen to have a positive similarity.
l = cos_dist(arr1[0], arr2[0]) + cos_dist(arr1[1], arr2[1])
l += max(0.0, 1 - cos_dist(arr1[0], arr2[1])) + max(0.0, 1 - cos_dist(arr1[1], arr2[0]))
l /= 4.0
l

0.46483609747567395

In [93]:
import tarfile

model_ckpt_bin = '/Users/mac/Desktop/syt/Deep-Learning/Projects-M/SpeakQL/SpeakQL/Allennlp_models/runs/3.1.0/model.tar.gz'

with tarfile.open(model_ckpt_bin, 'r:gz') as tar:
    f = tar.extractfile('weights.th')
    ckpt = torch.load(f, map_location=torch.device('cpu'))

type(ckpt)

collections.OrderedDict

In [95]:
list(ckpt.keys())[::10]

['tabert_model._bert_model.bert.embeddings.word_embeddings.weight',
 'tabert_model._bert_model.bert.encoder.layer.0.attention.self.value.bias',
 'tabert_model._bert_model.bert.encoder.layer.0.output.LayerNorm.bias',
 'tabert_model._bert_model.bert.encoder.layer.1.attention.output.LayerNorm.bias',
 'tabert_model._bert_model.bert.encoder.layer.2.attention.self.key.bias',
 'tabert_model._bert_model.bert.encoder.layer.2.output.dense.bias',
 'tabert_model._bert_model.bert.encoder.layer.3.attention.output.dense.bias',
 'tabert_model._bert_model.bert.encoder.layer.4.attention.self.query.bias',
 'tabert_model._bert_model.bert.encoder.layer.4.intermediate.dense.bias',
 'tabert_model._bert_model.bert.encoder.layer.5.attention.self.value.bias',
 'tabert_model._bert_model.bert.encoder.layer.5.output.LayerNorm.bias',
 'tabert_model._bert_model.bert.encoder.layer.6.attention.output.LayerNorm.bias',
 'tabert_model._bert_model.bert.encoder.layer.7.attention.self.key.bias',
 'tabert_model._bert_model.b

In [96]:
del ckpt

In [None]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

In [121]:
%load_ext memory_profiler

In [122]:
%memit

peak memory: 1329.27 MiB, increment: 0.56 MiB


In [128]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
_att_layer = MatrixAttention.by_name('cosine')()
_audio_feats = torch.randn(2, 3, 5)
_att_map_1 = _att_layer(_audio_feats, _audio_feats)
# att_map: (batch, seq_len, seq_len)
_att_map_2 = masked_softmax(_att_map_1, None)
_audio_feats, _att_map_1, _att_map_2

(tensor([[[ 0.7312,  1.1718, -0.9274,  0.5451,  0.0663],
          [-0.4370,  0.7626,  0.4415, -0.0091, -0.8425],
          [ 0.1374,  0.9386, -0.1860, -0.6446,  0.4100]],
 
         [[ 0.4085,  0.2579,  1.0950, -0.5065,  0.0998],
          [-0.6540,  0.7317, -1.4567,  1.6089,  0.0938],
          [-1.2597,  0.2546, -0.5020, -1.0412,  0.7323]]]),
 tensor([[[ 1.0000,  0.0458,  0.4858],
          [ 0.0458,  1.0000,  0.1467],
          [ 0.4858,  0.1467,  1.0000]],
 
         [[ 1.0000, -0.7979, -0.1627],
          [-0.7979,  1.0000,  0.0301],
          [-0.1627,  0.0301,  1.0000]]]),
 tensor([[[0.5043, 0.1942, 0.3015],
          [0.2126, 0.5521, 0.2352],
          [0.2954, 0.2105, 0.4941]],
 
         [[0.6765, 0.1121, 0.2115],
          [0.1072, 0.6473, 0.2454],
          [0.1848, 0.2241, 0.5911]]]))

In [28]:
F.kl_div(torch.log(torch.FloatTensor([[1/6, 1/6, 2/3]])), torch.FloatTensor([[1/3, 1/3, 1/3]]), reduction='batchmean')


tensor(0.2310)

In [27]:
F.kl_div(torch.log(torch.FloatTensor([[1/3, 1/3, 1/3]])), torch.FloatTensor([[1/3, 1/3, 1/3]]), reduction='batchmean')


tensor(0.)

In [30]:
F.kl_div(torch.log(torch.FloatTensor([[1/6, 1/6, 2/3]])), torch.FloatTensor([[0, 0, 1]]), reduction='batchmean')


tensor(0.4055)

In [31]:
F.kl_div(torch.log(torch.FloatTensor([[1/3, 1/3, 1/3]])), torch.FloatTensor([[0, 0, 1]]), reduction='batchmean')


tensor(1.0986)

In [32]:
F.kl_div(torch.log(torch.FloatTensor([[1/3, 1/3, 1/3]])), torch.FloatTensor([[0, 0, 0]]), reduction='batchmean')


tensor(0.)

In [33]:
os.path.exists('/Users/mac/Desktop/syt/')

True

In [34]:
os.path.exists('/Users/mac/Desktop/syt2/')

False