In [2]:
from typing import Iterator, List, Tuple, Dict, Union, Optional, cast
import torch
import torch.optim as optim
from torch.nn import MSELoss, CosineEmbeddingLoss
from torch.nn import functional as F
from torch.nn import ModuleList

import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, ArrayField, MetadataField, ListField, NamespaceSwappingField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.batch import Batch
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding, TokenEmbedder
from allennlp.modules.token_embedders.pretrained_transformer_embedder import PretrainedTransformerEmbedder
from allennlp.modules.token_embedders.pretrained_transformer_mismatched_embedder import PretrainedTransformerMismatchedEmbedder
# from allennlp.modules.seq2seq_encoders.multi_head_self_attention import MultiHeadSelfAttention
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.seq2vec_encoders.cnn_encoder import CnnEncoder
from allennlp.modules.attention import Attention
from allennlp.modules.matrix_attention.matrix_attention import MatrixAttention
from allennlp.modules.matrix_attention.linear_matrix_attention import LinearMatrixAttention
from allennlp.modules.matrix_attention.cosine_matrix_attention import CosineMatrixAttention
from allennlp.modules.matrix_attention.bilinear_matrix_attention import BilinearMatrixAttention

from allennlp.modules.conditional_random_field import allowed_transitions, ConditionalRandomField

from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits, \
    get_device_of, masked_softmax, weighted_sum, \
    get_mask_from_sequence_lengths, get_lengths_from_binary_sequence_mask, tensors_equal, \
    batched_span_select, move_to_device

from allennlp.training.metrics import BooleanAccuracy, CategoricalAccuracy, MeanAbsoluteError, Average
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.dataloader import DataLoader, PyTorchDataLoader
from allennlp.training.trainer import GradientDescentTrainer
# from allennlp.predictors import Predictor, Seq2SeqPredictor, SimpleSeq2SeqPredictor, SentenceTaggerPredictor
from allennlp.predictors import Predictor, SentenceTaggerPredictor
from allennlp.nn.activations import Activation
from allennlp.common.tqdm import Tqdm
from allennlp.common.params import Params
from allennlp.common.util import JsonDict, sanitize

from allennlp_models.generation.predictors import Seq2SeqPredictor
from allennlp_models.generation.models.simple_seq2seq import SimpleSeq2Seq
from allennlp_models.generation.modules.seq_decoders.seq_decoder import SeqDecoder
from allennlp_models.generation.modules.decoder_nets.decoder_net import DecoderNet

from allennlp.common.util import START_SYMBOL, END_SYMBOL

from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

# from spacy.tokenizer import Tokenizer as SpacyTokenizer
# from spacy.lang.en import English
# nlp = English()
# Create a blank Tokenizer with just the English vocab
# tokenizer = Tokenizer(nlp.vocab)

from tqdm.notebook import tqdm

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import ShortTermFeatures

import os
import sys
from sys import modules
import random
import itertools
import json
from collections import defaultdict, OrderedDict
from inspect import signature
import warnings
import pickle
from copy import copy, deepcopy
from overrides import overrides
import importlib
import string
import sqlite3

import matplotlib.pyplot as plt

from transformers import BertPreTrainedModel, BertModel, BertConfig, BertTokenizer

from utils.spider import process_sql, evaluation
from utils.schema_gnn.spider_utils import Table, TableColumn, read_dataset_schema
from utils.misc_utils import Postprocess_rewrite_seq

from dataset_readers.reader_utils import extractAudioFeatures, extractAudioFeatures_NoPooling, \
    dbToTokens, dbToTokens_new, \
    read_DB, Get_align_tags
from dataset_readers.reranker_reader_legacy import SpiderASRRerankerReaderV1, SpiderASRRerankerReaderV2
from dataset_readers.reranker_reader import SpiderASRRerankerReaderV2_Siamese
from modules.encoder import SpeakQLEncoder, SpeakQLEncoderV1
from modules.wav2vec_audio_encoder import SpeakQLAudioEncoder, Wav2vecAudioEncoder
from modules.tabert_embedder import TaBERTEmbedder
from modules.sql_decoder import SQLDecoder, RatsqlSQLDecoder
# from models.reranker import SpiderASRRerankerV0, SpiderASRRerankerV1, SpiderASRRerankerV2, SpiderASRReranker_Siamese
# from predictors.reranker_predictor import SpiderASRRerankerPredictor, SpiderASRRerankerPredictor_Siamese

# import dataset_readers, models
from dataset_readers.end2end_reader import SpeakQLEnd2endReader
from models.end2end import SpeakQLEnd2endModel 
# from predictors.rewriter_predictor import SpiderASRRewriterPredictor_Tagger_ILM, SpiderASRRewriterPredictor_Seq2seq

# import dataset_readers.rewriter_reader
# import models.rewriter
# import predictors.rewriter_predictor

import table_bert
from table_bert import TableBertModel

import _jsonnet

from ratsql.commands.infer import Inferer
from ratsql.datasets.spider import SpiderItem, load_tables
from ratsql.utils import registry
from ratsql.models.spider.spider_enc import SpiderEncoderState, SpiderEncoderV2Preproc, preprocess_schema_uncached
from ratsql.models.nl2code.decoder import NL2CodeDecoderPreprocItem, NL2CodeDecoderPreproc, NL2CodeDecoder
from ratsql.models.spider.spider_beam_search import beam_search_with_heuristics_for_speakql

torch.manual_seed(1)

<torch._C.Generator at 0x10ba4c770>

In [None]:
del modules['dataset_readers.reader_utils']
del modules['modules.encoder']
del modules['ratsql.models.spider.spider_beam_search']
del modules['ratsql.models.nl2code.decoder']

In [None]:
del registry._REGISTRY['decoder']

In [None]:
list(filter(lambda k : k.startswith('ratsql'), modules.keys()))

In [2]:
AUDIO_DIM = 136
AUDIO_DIM_NO_POOLING = 68

## End-to-end model

### Dataset Reader (tag:r)

In [8]:
tables_json_fname = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/tables.json'
dataset_dir = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/my'
databases_dir = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/database'
tabert_model_path = '/Users/mac/Desktop/syt/Deep-Learning/Repos/TaBERT/pretrained-models/tabert_base_k1/model.bin'

src_token_indexers = {'bert': TokenIndexer.by_name('pretrained_transformer_mismatched')('bert-base-uncased')}
tgt_token_indexers = {'tgt_tokens': SingleIdTokenIndexer(namespace='tgt_tokens')}

ratsql_enc_preproc_config = {
    "word_emb": {
        "name": 'glove',
        "kind": '42B',
        "lemmatize": True,
    },
    "min_freq": 4,
    "max_count": 5000,
    "db_path": databases_dir,
    "compute_sc_link": True,
    "compute_cv_link": True,
    "fix_issue_16_primary_keys": True,
    "count_tokens_in_word_emb_for_vocab": True,
    "save_path": '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/nl2code-glove,cv_link=true',
}

ratsql_dec_preproc_config = {
    "grammar": {
        "name": 'spider',
        "output_from": True,
        "use_table_pointer": True,
        "include_literals": False,
        "end_with_from": True,
        "infer_from_conditions": True,
        "clause_order": None,
        "factorize_sketch": 2,
    },
    "min_freq": 4,
    "max_count": 5000,
    "use_seq_elem_rules": True,
    "save_path": '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/nl2code-glove,cv_link=true',
}

ratsql_enc_preproc = SpiderEncoderV2Preproc(**ratsql_enc_preproc_config)
ratsql_dec_preproc = NL2CodeDecoderPreproc(**ratsql_dec_preproc_config)

dataset_reader = SpeakQLEnd2endReader(tables_json_fname=tables_json_fname,
                                      dataset_dir=dataset_dir,
                                      databases_dir=databases_dir,
                                      tabert_model_path=tabert_model_path,
                                      ratsql_enc_preproc=ratsql_enc_preproc,
                                      ratsql_dec_preproc=ratsql_dec_preproc,
                                      src_token_indexers=src_token_indexers,
                                      tgt_token_indexers=tgt_token_indexers)

In [9]:
train_dataset = dataset_reader.read('train')

dev_dataset = dataset_reader.read('dev')

HBox(children=(IntProgress(value=1, bar_style='info', description='reading instances', max=1, style=ProgressSt…

HBox(children=(IntProgress(value=0, description='DB connections', max=166, style=ProgressStyle(description_wid…







HBox(children=(IntProgress(value=1, bar_style='info', description='reading instances', max=1, style=ProgressSt…

HBox(children=(IntProgress(value=0, description='DB connections', max=166, style=ProgressStyle(description_wid…







In [10]:
train_dataset[0].fields['metadata'].as_tensor(0).keys()

dict_keys(['original_id', 'text_len', 'schema_len', 'source_tokens', 'tabert_tables', 'pointer_spans', 'ratsql_items', 'text_tokenized', 'text_offsets', 'target_orig_sql', 'target_written_sql'])

In [None]:
train_dataset[0].fields['metadata'].as_tensor(0)['pointer_spans']

### Model

#### Rat-sql decoder

In [7]:
exp_config_path = '/Users/mac/Desktop/syt/Deep-Learning/Repos/rat-sql/experiments/spider-glove-run.jsonnet'
root_dir = '/Users/mac/Desktop/syt/Deep-Learning/Repos/rat-sql'
model_dir = '/Users/mac/Desktop/syt/Deep-Learning/Repos/rat-sql/logdir/glove_run/bs=20,lr=7.4e-04,end_lr=0e0,att=0'
checkpoint_step = 40000

exp_config = json.loads(_jsonnet.evaluate_file(exp_config_path))

model_config_path = os.path.join(root_dir, exp_config["model_config"])
model_config_args = exp_config.get("model_config_args")

infer_config = json.loads(_jsonnet.evaluate_file(model_config_path, tla_codes={'args': json.dumps(model_config_args)}))

inferer = Inferer(infer_config)
inferer.device = torch.device("cpu")
model = inferer.load_model(model_dir, checkpoint_step)



Loading model from /Users/mac/Desktop/syt/Deep-Learning/Repos/rat-sql/logdir/glove_run/bs=20,lr=7.4e-04,end_lr=0e0,att=0/model_checkpoint-00040000


In [8]:
dataset = registry.construct('dataset', inferer.config['data']['val'])

DB connections: 100%|██████████| 166/166 [00:01<00:00, 118.86it/s]


In [9]:
for _, schema in dataset.schemas.items():
    model.preproc.enc_preproc._preprocess_schema(schema)

In [10]:
dataset

<ratsql.datasets.spider.SpiderDataset at 0x14ad64250>

In [11]:
model.decoder

NL2CodeDecoder(
  (state_update): RecurrentDropoutLSTMCell()
  (desc_attn): MultiHeadedAttention(
    (linears): ModuleList(
      (0): Linear(in_features=512, out_features=256, bias=True)
      (1): Linear(in_features=256, out_features=256, bias=True)
      (2): Linear(in_features=256, out_features=256, bias=True)
      (3): Linear(in_features=256, out_features=256, bias=True)
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (rule_logits): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): Tanh()
    (2): Linear(in_features=128, out_features=94, bias=True)
  )
  (rule_embedding): Embedding(94, 128)
  (gen_logodds): Linear(in_features=512, out_features=1, bias=True)
  (terminal_logits): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): Tanh()
    (2): Linear(in_features=128, out_features=5, bias=True)
  )
  (terminal_embedding): Embedding(5, 128)
  (copy_pointer): BahdanauPointer(
    (compute_scores): Sequential(

In [52]:
_qs = ["Which student has enrolled for the most times in any program? List the id, first name, middle name, last name, the number of enrollments and student id."]
_db_ids = ["student_transcripts_tracking"]
_sqls = [json.loads('''{'except': None, 'from': {'conds': [[False, 2, [0, [0, 32, False], None], [0, 47, False], None]], 'table_units': [['table_unit', 6], ['table_unit', 7]]}, 'groupBy': [[0, 32, False]], 'having': [], 'intersect': None, 'limit': 1, 'orderBy': ['desc', [[0, [3, 0, False], None]]], 'select': [False, [[0, [0, [0, 32, False], None]], [0, [0, [0, 35, False], None]], [0, [0, [0, 36, False], None]], [0, [0, [0, 37, False], None]], [3, [0, [0, 0, False], None]], [0, [0, [0, 32, False], None]]]], 'union': None, 'where': []}''')]

preproc_inputs = []
for q, db_id, sql in zip(_qs, _db_ids, _sqls):
    spider_schema = dataset.schemas[db_id]
    data_item = SpiderItem(
        text=None,  # intentionally None -- should be ignored when the tokenizer is set correctly
        code=sql,
        schema=spider_schema,
        orig_schema=spider_schema.orig,
        orig={"question": q}
    )
    model.preproc.clear_items()
    enc_input = model.preproc.enc_preproc.preprocess_item(data_item, None)
    
    sql_parsed = model.preproc.dec_preproc.grammar.parse(data_item.code, "val")
    model.preproc.dec_preproc.add_item(data_item, "val", sql_parsed)
    dec_output = model.preproc.dec_preproc.items["val"][-1]
    
    preproc_data = enc_input, dec_output
    
    preproc_inputs.append(preproc_data)

preproc_inputs

[({'raw_question': 'Which student has enrolled for the most times in any program? List the id, first name, middle name, last name, the number of enrollments and student id.',
   'question': ['which',
    'student',
    'have',
    'enrol',
    'for',
    'the',
    'most',
    'time',
    'in',
    'any',
    'program',
    '?',
    'list',
    'the',
    'id',
    ',',
    'first',
    'name',
    ',',
    'middle',
    'name',
    ',',
    'last',
    'name',
    ',',
    'the',
    'number',
    'of',
    'enrollment',
    'and',
    'student',
    'id',
    '.'],
   'question_for_copying': ['which',
    'student',
    'has',
    'enrolled',
    'for',
    'the',
    'most',
    'times',
    'in',
    'any',
    'program',
    '?',
    'list',
    'the',
    'id',
    ',',
    'first',
    'name',
    ',',
    'middle',
    'name',
    ',',
    'last',
    'name',
    ',',
    'the',
    'number',
    'of',
    'enrollments',
    'and',
    'student',
    'id',
    '.'],
   'db_id':

In [None]:
type(spider_schema), spider_schema.__dict__.keys()

In [70]:
print(_sqls[0])

{'except': None, 'from': {'conds': [[False, 2, [0, [0, 32, False], None], [0, 47, False], None]], 'table_units': [['table_unit', 6], ['table_unit', 7]]}, 'groupBy': [[0, 32, False]], 'having': [], 'intersect': None, 'limit': 1, 'orderBy': ['desc', [[0, [3, 0, False], None]]], 'select': [False, [[0, [0, [0, 32, False], None]], [0, [0, [0, 35, False], None]], [0, [0, [0, 36, False], None]], [0, [0, [0, 37, False], None]], [3, [0, [0, 0, False], None]], [0, [0, [0, 32, False], None]]]], 'union': None, 'where': []}


In [67]:
print(model.preproc.dec_preproc.validate_item(data_item, 'val'))

(True, {'_type': 'sql', 'select': {'_type': 'select', 'is_distinct': False, 'aggs': [{'_type': 'agg', 'agg_id': {'_type': 'NoneAggOp'}, 'val_unit': {'_type': 'Column', 'col_unit1': {'_type': 'col_unit', 'agg_id': {'_type': 'NoneAggOp'}, 'is_distinct': False, 'col_id': 32}}}, {'_type': 'agg', 'agg_id': {'_type': 'NoneAggOp'}, 'val_unit': {'_type': 'Column', 'col_unit1': {'_type': 'col_unit', 'agg_id': {'_type': 'NoneAggOp'}, 'is_distinct': False, 'col_id': 35}}}, {'_type': 'agg', 'agg_id': {'_type': 'NoneAggOp'}, 'val_unit': {'_type': 'Column', 'col_unit1': {'_type': 'col_unit', 'agg_id': {'_type': 'NoneAggOp'}, 'is_distinct': False, 'col_id': 36}}}, {'_type': 'agg', 'agg_id': {'_type': 'NoneAggOp'}, 'val_unit': {'_type': 'Column', 'col_unit1': {'_type': 'col_unit', 'agg_id': {'_type': 'NoneAggOp'}, 'is_distinct': False, 'col_id': 37}}}, {'_type': 'agg', 'agg_id': {'_type': 'Count'}, 'val_unit': {'_type': 'Column', 'col_unit1': {'_type': 'col_unit', 'agg_id': {'_type': 'NoneAggOp'}, 'is

In [68]:
print(data_item.code)

{'except': None, 'from': {'conds': [[False, 2, [0, [0, 32, False], None], [0, 47, False], None]], 'table_units': [['table_unit', 6], ['table_unit', 7]]}, 'groupBy': [[0, 32, False]], 'having': [], 'intersect': None, 'limit': 1, 'orderBy': ['desc', [[0, [3, 0, False], None]]], 'select': [False, [[0, [0, [0, 32, False], None]], [0, [0, [0, 35, False], None]], [0, [0, [0, 36, False], None]], [0, [0, [0, 37, False], None]], [3, [0, [0, 0, False], None]], [0, [0, [0, 32, False], None]]]], 'union': None, 'where': []}


In [54]:
model.preproc.dec_preproc.ast_wrapper.verify_ast(sql_parsed)

True

In [64]:
with torch.no_grad():
    model_output = model.eval_on_batch(preproc_inputs)
# model.train()
model_output

BiLSTM:
[array([ 0, 33])]
[array([ 0, 33])]
BiLSTM-summ:
[array([  0,   2,   5,   8,  11,  14,  16,  19,  23,  25,  29,  32,  35,
        38,  41,  44,  47,  50,  53,  57,  60,  64,  68,  71,  74,  77,
        80,  83,  86,  89,  92,  95,  98, 101, 105, 109, 112, 115, 118,
       122, 125, 127, 131, 134, 138, 142, 146, 149, 152, 155, 159, 162,
       166, 169, 172, 175, 179, 182])]
[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57]]
BiLSTM-summ:
[array([ 0,  1,  2,  3,  5,  6,  7,  8, 10, 13, 14, 16])]
[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]


KeyError: ('agg*', 6)

In [13]:
enc_output = model.encoder.forward(preproc_inputs)
len(enc_output), vars(enc_output[0]).keys()

BiLSTM:
[array([0, 7]), array([0, 7])]
[array([0, 7]), array([0, 7])]
BiLSTM-summ:
[array([ 0,  2,  5,  7,  9, 11, 13, 15, 17, 20, 22, 24, 27, 31, 33, 36, 39,
       42, 44, 47, 49, 52, 55]), array([  0,   2,   5,   8,  11,  13,  16,  19,  22,  25,  28,  31,  34,
        37,  40,  43,  46,  49,  53,  56,  59,  61,  63,  65,  67,  70,
        73,  76,  79,  82,  85,  88,  91,  94,  97, 100, 103, 107, 110,
       112, 115, 117, 120, 123, 125])]
[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]]
BiLSTM-summ:
[array([0, 1, 2, 3, 6]), array([0, 1, 2, 3])]
[[0, 1, 2, 3, 4], [0, 1, 2, 3]]


(2,
 dict_keys(['state', 'memory', 'question_memory', 'schema_memory', 'words', 'pointer_memories', 'pointer_maps', 'm2c_align_mat', 'm2t_align_mat']))

In [14]:
len(preproc_data[0]['question']), len(preproc_data[0]['tables']), len(preproc_data[0]['columns'])

(7, 3, 44)

In [15]:
enc_output[0].m2c_align_mat.size(), enc_output[0].m2t_align_mat.size()

(torch.Size([33, 22]), torch.Size([33, 4]))

In [16]:
enc_output[0].m2c_align_mat.detach().sum(1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [17]:
enc_output[0].question_memory.size(), enc_output[0].schema_memory.size()

(torch.Size([1, 7, 256]), torch.Size([1, 26, 256]))

In [18]:
enc_output[0].pointer_memories["column"].size(), enc_output[0].pointer_memories["table"].size()

(torch.Size([1, 22, 256]), torch.Size([1, 26, 256]))

In [19]:
enc_output[0].pointer_maps

{'column': {0: [0],
  1: [1],
  2: [2],
  3: [3],
  4: [4],
  5: [5],
  6: [6],
  7: [7],
  8: [8],
  9: [9],
  10: [10],
  11: [11],
  12: [12],
  13: [13],
  14: [14],
  15: [15],
  16: [16],
  17: [17],
  18: [18],
  19: [19],
  20: [20],
  21: [21]},
 'table': {0: [0], 1: [1], 2: [2], 3: [3]}}

In [20]:
enc_output[1].pointer_maps

{'column': {0: [0],
  1: [1],
  2: [2],
  3: [3],
  4: [4],
  5: [5],
  6: [6],
  7: [7],
  8: [8],
  9: [9],
  10: [10],
  11: [11],
  12: [12],
  13: [13],
  14: [14],
  15: [15],
  16: [16],
  17: [17],
  18: [18],
  19: [19],
  20: [20],
  21: [21],
  22: [22],
  23: [23],
  24: [24],
  25: [25],
  26: [26],
  27: [27],
  28: [28],
  29: [29],
  30: [30],
  31: [31],
  32: [32],
  33: [33],
  34: [34],
  35: [35],
  36: [36],
  37: [37],
  38: [38],
  39: [39],
  40: [40],
  41: [41],
  42: [42],
  43: [43]},
 'table': {0: [0], 1: [1], 2: [2]}}

In [None]:
## Speakql model/encoder compatible with ratsql decoder (tag:m)

In [None]:
# importlib.reload(dataset_readers.rewriter_s2s_tabert_reader)
# importlib.reload(models.rewriter_s2s_tabert)

# from dataset_readers.rewriter_s2s_tabert_reader import SpiderASRRewriterReader_Seq2seq_TaBERT
# from models.rewriter_s2s_tabert import SpiderASRRewriter_Seq2seq_TaBERT 

In [12]:
# vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
vocab = Vocabulary.from_instances(train_dataset)
# vocab = None
vocab

HBox(children=(IntProgress(value=0, description='building vocab', max=7, style=ProgressStyle(description_width…




Vocabulary with namespaces:  Non Padded Namespaces: {'*tags', '*labels'}

In [13]:
# Hyperparams 
TAG_EMB_DIM = 64
SRC_EMB_DIM = 768 # BERT 
TGT_EMB_DIM = 300
AUDIO_ENC_DIM = 32

ENCODER_DIM = 64
# TAGGING_FF_DIM = 64
DECODER_DIM = ENCODER_DIM # It seems that otherwise it can't work 

In [None]:
# src_text_embedder = BasicTextFieldEmbedder(
#         token_embedders={
#             "bert": TokenEmbedder.by_name("pretrained_transformer_mismatched")("bert-base-uncased")
#         })
# tgt_text_embedder = BasicTextFieldEmbedder(
#         token_embedders={
#             "tgt_tokens": Embedding(
#                 embedding_dim=TGT_EMB_DIM,
#                 num_embeddings=vocab.get_vocab_size('tgt_tokens')
#             )
#         })
# tgt_text_embedder = Embedding(embedding_dim=TGT_EMB_DIM,
#                               num_embeddings=vocab.get_vocab_size('tgt_tokens'))

In [None]:
# tag_embedder = Embedding(embedding_dim=TAG_EMB_DIM,
#                          vocab_namespace='rewriter_tags',
#                          vocab=vocab)
# tag_embedder = Embedding(embedding_dim=TAG_EMB_DIM,
#                          num_embeddings=vocab.get_vocab_size('rewriter_tags'))

In [None]:
# rewrite_decoder = SeqDecoder.by_name('auto_regressive_seq_decoder')(
#     vocab=vocab,
#     decoder_net=DecoderNet.by_name('lstm_cell')(
#         decoding_dim=DECODER_DIM,
#         target_embedding_dim=TGT_EMB_DIM,
#         attention=Attention.by_name('bilinear')(DECODER_DIM, ENCODER_DIM)
#     ),
#     max_decoding_steps=100,
#     target_embedder=tgt_text_embedder,
#     target_namespace='tgt_tokens',
#     beam_size=4
# )

In [14]:
audio_s2v = CnnEncoder(embedding_dim = AUDIO_DIM_NO_POOLING,
                       num_filters = 4,
                       ngram_filter_sizes = (2, 3, 4, 5),
                       output_dim = AUDIO_ENC_DIM)

lstm_s2s = PytorchSeq2SeqWrapper(torch.nn.LSTM(SRC_EMB_DIM + AUDIO_ENC_DIM, ENCODER_DIM, batch_first=True))
# lstm_s2s_with_tags = PytorchSeq2SeqWrapper(torch.nn.LSTM(SRC_EMB_DIM + AUDIO_ENC_DIM + TAG_EMB_DIM, ENCODER_DIM, batch_first=True))

# lstm_s2v_no_tags = PytorchSeq2VecWrapper(torch.nn.LSTM(ENCODER_DIM, ENCODER_DIM, batch_first=True))
# lstm_s2v_with_tags = PytorchSeq2VecWrapper(torch.nn.LSTM(ENCODER_DIM, ENCODER_DIM, batch_first=True))

# TODO: use s2s & s2v, instead of multilayer s2v, since we need sequence representations here 

speakql_encoder = SpeakQLEncoderV1(
    audio_attention_layer=CosineMatrixAttention(),
    audio_attention_residual='+',
    seq2seq_encoders=[lstm_s2s],
    seq2vec_encoder=None
)
# encoder_with_tags = SpeakQLEncoderV1(
#     audio_attention_layer=CosineMatrixAttention(),
#     audio_attention_residual='+',
#     seq2seq_encoders=[lstm_s2s_with_tags],
#     seq2vec_encoder=lstm_s2v_with_tags
# )

In [15]:
ratsql_decoder_config = {
    "dropout": 0.20687225956012834,
    "desc_attn": 'mha',
    "enc_recurrent_size": ENCODER_DIM,
    "recurrent_size": 64,
    "loss_type": "softmax",
    "use_align_mat": True,
    "use_align_loss": True,
    "enumerate_order": False,
}

ratsql_decoder = NL2CodeDecoder(
    **ratsql_decoder_config,
    device=torch.device('cpu'),
    preproc=ratsql_dec_preproc
)

In [16]:
ratsql_decoder

NL2CodeDecoder(
  (state_update): RecurrentDropoutLSTMCell()
  (desc_attn): MultiHeadedAttention(
    (linears): ModuleList(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): Linear(in_features=64, out_features=64, bias=True)
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Linear(in_features=64, out_features=64, bias=True)
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (rule_logits): Sequential(
    (0): Linear(in_features=64, out_features=128, bias=True)
    (1): Tanh()
    (2): Linear(in_features=128, out_features=94, bias=True)
  )
  (rule_embedding): Embedding(94, 128)
  (gen_logodds): Linear(in_features=64, out_features=1, bias=True)
  (terminal_logits): Sequential(
    (0): Linear(in_features=64, out_features=128, bias=True)
    (1): Tanh()
    (2): Linear(in_features=128, out_features=5, bias=True)
  )
  (terminal_embedding): Embedding(5, 128)
  (copy_pointer): BahdanauPointer(
    (compute_scores): Sequential(
      (0):

In [17]:
# tagger_ILM_model = SpiderASRRewriter_Tagger_ILM(
#     src_text_embedder=src_text_embedder,
#     tag_embedder=tag_embedder,
#     bert_pretrained_model='bert-base-uncased',
#     audio_seq2vec_encoder=audio_s2v,
#     encoder_no_tags=encoder_no_tags,
#     encoder_with_tags=encoder_with_tags,
#     rewrite_decoder=rewrite_decoder,
#     ff_dimension=TAGGING_FF_DIM,
#     concat_audio=True,
#     vocab=vocab
# )

In [18]:
end2end_model = SpeakQLEnd2endModel(
    src_text_embedder = None,
    tabert_model_path = "/Users/mac/Desktop/syt/Deep-Learning/Repos/TaBERT/pretrained-models/tabert_base_k1/model.bin",
    finetune_tabert = False,
    audio_seq2vec_encoder = audio_s2v,
    encoder = speakql_encoder,
    sql_decoder = ratsql_decoder,
    concat_audio = True,
    raw_audio_encoder = None,  # wav2vec
    finetune_raw_audio_encoder = False,
    align_tag_embedder = None,
    vocab = vocab
)

In [33]:
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)

In [34]:
optimizer = optim.SGD(end2end_model.parameters(), lr=0.01)

train_data_loader = PyTorchDataLoader(train_dataset, batch_size=8, shuffle=True)
dev_data_loader = PyTorchDataLoader(dev_dataset, batch_size=8, shuffle=False)

trainer = GradientDescentTrainer(model=end2end_model,
                                 optimizer=optimizer,
                                 data_loader=train_data_loader,
                                 validation_data_loader=dev_data_loader,
                                 patience=1,
                                 num_epochs=1,
                                 grad_norm=0.1,
                                 cuda_device=-1)
trainer.train()

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

Ignored sample with unseen rule: ('agg*', 6)
Ignored sample with unseen rule: ('agg*', 6)
Ignored sample with unseen rule: ('agg*', 6)
Ignored sample with unseen rule: ('agg*', 6)
Ignored sample with unseen rule: ('agg*', 6)
Ignored sample with unseen rule: ('agg*', 6)
Ignored sample with unseen rule: ('agg*', 6)
Ignored sample with unseen rule: ('agg*', 6)
Ignored sample with unseen rule: ('agg*', 6)
Ignored sample with unseen rule: ('agg*', 6)





{'best_epoch': 0,
 'peak_worker_0_memory_MB': 5636.419584,
 'training_duration': '0:03:09.993054',
 'training_start_epoch': 0,
 'training_epochs': 0,
 'epoch': 0,
 'training_loss': 339.1650695800781,
 'training_worker_0_memory_MB': 5636.419584,
 'validation_loss': 0.0,
 'best_validation_loss': 0.0}

In [22]:
_output = end2end_model.forward_on_instance(dev_dataset[0])
_output.keys()

Ignored sample with unseen rule: ('agg*', 6)


dict_keys(['enc_states_nl2code'])

In [None]:
_output['enc_states_nl2code']

In [None]:
# Load trained model 

end2end_model = Model.from_archive('runs/local-test/4.0L/model.tar.gz')
end2end_model

### E2E - Predictor (tag:p)

In [44]:
test_dataset = dataset_reader.read('test')
len(test_dataset)

HBox(children=(IntProgress(value=1, bar_style='info', description='reading instances', max=1, style=ProgressSt…

HBox(children=(IntProgress(value=0, description='DB connections', max=166, style=ProgressStyle(description_wid…







33

In [61]:
class SpeakQLEnd2EndPredictor(Predictor):
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 beam_size=4,
                 max_steps=1000):
        super().__init__(model, dataset_reader)
        
        self.beam_size = beam_size
        self.max_steps = max_steps
        
        self.save_intermediate = False

    def set_save_intermediate(self, save_intermediate: bool):
        self._model.set_save_intermediate(save_intermediate)
        self.save_intermediate = save_intermediate
    
    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        # Input instance has gold rewrite_seq_s2s  
        
        outputs = dict()
        metadata = instance.fields['metadata']
        outputs['question'] = ' '.join(metadata['source_tokens'][:metadata['text_len']])
        outputs['original_id'] = metadata['original_id']
        outputs['gold_sql'] = metadata['target_written_sql']
        
        ## In eval mode, model will forward with beam_search even if gold is provided 
        # _instance = Instance(instance.fields.copy())
        # del _instance.fields['rewrite_seq_s2s']
        
        with torch.no_grad():
            cuda_device = self._model._get_prediction_device()
            _batch = Batch([instance])
            _batch.index_instances(self._model.vocab)
            model_input = move_to_device(_batch.as_tensor_dict(), cuda_device)
            sql_beam_search_outputs = beam_search_with_heuristics_for_speakql(
                model=self._model,
                speakql_input=model_input,
                orig_item=metadata['ratsql_items'][0],
                preproc_item=metadata['ratsql_items'][1],
                beam_size=self.beam_size,
                max_steps=self.max_steps,
            )
            # outputs = self.make_output_human_readable(self(**model_input))
        
        pred_sql = ''
        if len(sql_beam_search_outputs) > 0:
            pred_sql = sql_beam_search_outputs[0]['inferred_code']
        
        outputs = {
            'pred_sql': pred_sql
        }

        if self.save_intermediate:
            outputs['intermediates'] = self._model.get_intermediates()
        
        return sanitize(outputs)

In [62]:
predictor = SpeakQLEnd2EndPredictor(
    model=end2end_model,
    dataset_reader=dataset_reader)

In [63]:
_test_instance = Instance(test_dataset[0].fields.copy())
_test_instance.fields

{'sentence': <allennlp.data.fields.text_field.TextField at 0x30eddf050>,
 'source_to_target': <allennlp.data.fields.namespace_swapping_field.NamespaceSwappingField at 0x30eddff00>,
 'text_mask': <allennlp.data.fields.array_field.ArrayField at 0x1f2750d70>,
 'schema_mask': <allennlp.data.fields.array_field.ArrayField at 0x1f2750c80>,
 'schema_column_ids': <allennlp.data.fields.array_field.ArrayField at 0x1f27505a0>,
 'audio_feats': <allennlp.data.fields.list_field.ListField at 0x13db3c5d0>,
 'audio_mask': <allennlp.data.fields.array_field.ArrayField at 0x1f2750f00>,
 'source_token_ids': <allennlp.data.fields.array_field.ArrayField at 0x25b36e2d0>,
 'metadata': <allennlp.data.fields.metadata_field.MetadataField at 0x13750b090>}

In [64]:
predictor_output = predictor.predict_instance(_test_instance)
predictor_output

{'pred_sql': ''}

In [272]:
# output_test_path = '/Users/mac/Desktop/syt/Deep-Learning/Projects-M/SpeakQL/SpeakQL/Allennlp_models/outputs/local-test/test-rewriter-2.2tL.json'

# with open(output_test_path, 'w') as f:
#     json.dump(tagger_output_test_dataset, f, indent=4)

### Temp

In [7]:
params_path = 'runs/4.0.0.0/model_state_epoch_14.th'
params = torch.load(params_path, map_location=torch.device('cpu'))
list(params.keys())

['tabert_model._bert_model.bert.embeddings.word_embeddings.weight',
 'tabert_model._bert_model.bert.embeddings.position_embeddings.weight',
 'tabert_model._bert_model.bert.embeddings.token_type_embeddings.weight',
 'tabert_model._bert_model.bert.embeddings.LayerNorm.weight',
 'tabert_model._bert_model.bert.embeddings.LayerNorm.bias',
 'tabert_model._bert_model.bert.encoder.layer.0.attention.self.query.weight',
 'tabert_model._bert_model.bert.encoder.layer.0.attention.self.query.bias',
 'tabert_model._bert_model.bert.encoder.layer.0.attention.self.key.weight',
 'tabert_model._bert_model.bert.encoder.layer.0.attention.self.key.bias',
 'tabert_model._bert_model.bert.encoder.layer.0.attention.self.value.weight',
 'tabert_model._bert_model.bert.encoder.layer.0.attention.self.value.bias',
 'tabert_model._bert_model.bert.encoder.layer.0.attention.output.dense.weight',
 'tabert_model._bert_model.bert.encoder.layer.0.attention.output.dense.bias',
 'tabert_model._bert_model.bert.encoder.layer.0.

In [16]:
[(k, v) for k, v in params.items() if '_bert_model' not in k]

[('audio_seq2vec_encoder.conv_layer_0.weight',
  tensor([[[-7.6348e-02,  4.1835e-02],
           [ 2.3982e-02, -8.3926e-02],
           [-2.8561e-02, -8.3756e-02],
           [-5.9972e-02,  5.5599e-02],
           [ 4.8518e-03,  2.4257e-02],
           [ 6.0439e-02, -8.0720e-02],
           [ 1.4134e-02,  4.7279e-03],
           [-3.2942e-02, -8.3919e-02],
           [ 6.3656e-02,  4.9750e-02],
           [-6.1359e-02,  3.3727e-02],
           [ 5.5835e-02,  3.8362e-02],
           [-5.0697e-02,  7.1775e-02],
           [ 2.5095e-03, -3.3541e-02],
           [ 3.7495e-02,  4.6378e-02],
           [ 2.0532e-02, -4.8016e-02],
           [ 1.8072e-02,  2.0122e-02],
           [-5.0043e-02,  8.5643e-02],
           [ 2.0788e-02,  6.8649e-02],
           [ 7.3311e-02,  1.0855e-02],
           [ 5.0526e-02, -7.5090e-02],
           [-5.6216e-02,  2.7156e-02],
           [-1.5779e-02, -3.6399e-02],
           [ 4.2578e-02,  4.0174e-02],
           [-2.0960e-02,  5.3475e-02],
           [ 3.54

In [14]:
torch.equal(
    params['tabert_model._bert_model.bert.encoder.layer.6.attention.self.value.weight'],
    params['tabert_embedder.tabert_model._bert_model.bert.encoder.layer.6.attention.self.value.weight']
)

True

In [17]:
train_params_path = 'runs/4.0.0.0/training_state_epoch_14.th'
train_params = torch.load(train_params_path, map_location=torch.device('cpu'))
list(train_params.keys())

['metric_tracker',
 'optimizer',
 'batch_num_total',
 'learning_rate_scheduler',
 'epoch']

In [18]:
train_params

{'metric_tracker': {'best_so_far': 23.62405014038086,
  'patience': 50,
  'epochs_with_no_improvement': 13,
  'is_best_so_far': False,
  'should_decrease': True,
  'best_epoch_metrics': {'loss': 23.62405014038086},
  'epoch_number': 15,
  'best_epoch': 1},
 'optimizer': {'state': {204: {'step': 77085,
    'exp_avg': tensor([[[ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.0000e+00],
             [ 0.0000e+00,  0.

In [None]:
instance = copy(train_dataset[0])
instance.fields

In [None]:
instance.index_fields(vocab)

In [None]:
list(zip(instance.fields['tags'].labels, instance.fields['tags']._indexed_labels))

In [None]:
instance.fields['sentence']._indexed_tokens

In [None]:
list(zip(instance.fields['rewrite_seq'].tokens, instance.fields['rewrite_seq']._indexed_tokens['tgt_tokens']['tokens']))

In [None]:
instance.fields['rewrite_seq']._indexed_tokens

In [None]:
Instance.add_field

In [None]:
instance = Instance(test_dataset[0].fields.copy())
instance.fields

In [None]:
len(instance.fields['rewrite_seq'].tokens)

In [None]:
l = instance.fields['rewrite_seq'].tokens
len(l)

In [None]:
l[0].text

In [None]:
a = torch.LongTensor([1])[0]
b = torch.LongTensor([2])[0]
a, b

In [None]:
torch.LongTensor([a, b])

In [None]:
schemas = read_dataset_schema('/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/tables.json')
len(schemas)

In [None]:
schemas['perpetrator']

In [None]:
[(c.name, c.text) for c in schemas['perpetrator']['people'].columns]

In [None]:
# schema_gnn.spider.utils.read_dataset_values

import sqlite3

db_id = 'perpetrator'
dataset_path = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/spider/database'
tables = ["perpetrator", "people"]

db = os.path.join(dataset_path, db_id, db_id + ".sqlite")
try:
    conn = sqlite3.connect(db)
except Exception as e:
    raise Exception(f"Can't connect to SQL: {e} in path {db}")
conn.text_factory = str
cursor = conn.cursor()

values = {}

for table in tables:
    try:
        cursor.execute(f"SELECT * FROM {table} LIMIT 5000")
        values[table] = cursor.fetchall()
    except:
        conn.text_factory = lambda x: str(x, 'latin1')
        cursor = conn.cursor()
        cursor.execute(f"SELECT * FROM {table} LIMIT 5000")
        values[table] = cursor.fetchall()

In [None]:
values

In [None]:
cursor.execute(f"SELECT * FROM sqlite_master where type='table'")
cursor.fetchall()

In [None]:
cursor.execute(f"SELECT * FROM people LIMIT 5000")
[d[0] for d in cursor.description]

In [None]:
cursor.execute('PRAGMA TABLE_INFO(people)')
cursor.fetchall()

In [None]:
[d[0] for d in cursor.description]

In [None]:
d = OrderedDict({'4': 4, '2': 2, '0': 0})
d

In [None]:
d2 = OrderedDict(sorted(d.items(), key=lambda x : x[1] % 3))
d2

In [None]:
d2.values()

In [None]:
torch.zeros([0, 5])

In [None]:
str(None)

In [25]:
# CosineEmbeddingLoss 
arr1 = np.random.randn(2, 4) + np.ones((2, 4))
arr2 = np.random.randn(2, 4) + np.ones((2, 4))
tensor1 = torch.tensor(arr1, dtype=torch.float32)
tensor2 = torch.tensor(arr2, dtype=torch.float32)
y = torch.tensor(np.eye(2) * 2 - np.ones((2, 2)), dtype=tensor1.dtype, device=tensor1.device)
tensor1, tensor2, y

(tensor([[ 0.1050,  1.0382,  0.5860,  1.2960],
         [ 1.2995, -0.6261,  1.3097,  2.1744]]),
 tensor([[-0.0796,  2.7626,  1.7253,  0.4309],
         [ 0.4299,  2.4990,  1.1491,  0.2155]]),
 tensor([[ 1., -1.],
         [-1.,  1.]]))

In [26]:
y.diag()

tensor([1., 1.])

In [27]:
tensor1.size(), tensor2.size()

(torch.Size([2, 4]), torch.Size([2, 4]))

In [30]:
torch.mean(tensor1[10:15]), torch.sum(tensor1[10:15])

(tensor(nan), tensor(0.))

In [32]:
torch.mean(tensor1[1:10]), torch.mean(tensor1[1:2])

(tensor(1.0394), tensor(1.0394))

In [387]:
cos_loss = CosineEmbeddingLoss(margin=0, reduction='mean')

In [388]:
t1 = tensor1.unsqueeze(0).expand(2, 2, 4).reshape(4, 4)
t2 = tensor2.unsqueeze(1).expand(2, 2, 4).reshape(4, 4)
y_ = y.view(4)
t1, t2, y

(tensor([[2.4972, 2.0322, 1.4485, 1.8441],
         [0.4712, 1.2000, 1.9665, 1.2432],
         [2.4972, 2.0322, 1.4485, 1.8441],
         [0.4712, 1.2000, 1.9665, 1.2432]]),
 tensor([[1.8994, 1.5220, 1.3646, 0.9790],
         [1.8994, 1.5220, 1.3646, 0.9790],
         [1.2207, 0.7970, 1.5540, 0.8197],
         [1.2207, 0.7970, 1.5540, 0.8197]]),
 tensor([[ 1., -1.],
         [-1.,  1.]], dtype=torch.float64))

In [389]:
cos_loss(t1, t2, y_)

tensor(0.4648)

In [390]:
from scipy.spatial.distance import cosine as cos_sim
l = cos_sim(arr1[0], arr2[0]) + cos_sim(arr1[1], arr2[1])
l += (1 - cos_sim(arr1[0], arr2[1])) + (1 - cos_sim(arr1[1], arr2[0]))
l /= 4.0
l

0.46483609747567395

In [93]:
import tarfile

model_ckpt_bin = '/Users/mac/Desktop/syt/Deep-Learning/Projects-M/SpeakQL/SpeakQL/Allennlp_models/runs/3.1.0/model.tar.gz'

with tarfile.open(model_ckpt_bin, 'r:gz') as tar:
    f = tar.extractfile('weights.th')
    ckpt = torch.load(f, map_location=torch.device('cpu'))

type(ckpt)

collections.OrderedDict

In [95]:
list(ckpt.keys())[::10]

['tabert_model._bert_model.bert.embeddings.word_embeddings.weight',
 'tabert_model._bert_model.bert.encoder.layer.0.attention.self.value.bias',
 'tabert_model._bert_model.bert.encoder.layer.0.output.LayerNorm.bias',
 'tabert_model._bert_model.bert.encoder.layer.1.attention.output.LayerNorm.bias',
 'tabert_model._bert_model.bert.encoder.layer.2.attention.self.key.bias',
 'tabert_model._bert_model.bert.encoder.layer.2.output.dense.bias',
 'tabert_model._bert_model.bert.encoder.layer.3.attention.output.dense.bias',
 'tabert_model._bert_model.bert.encoder.layer.4.attention.self.query.bias',
 'tabert_model._bert_model.bert.encoder.layer.4.intermediate.dense.bias',
 'tabert_model._bert_model.bert.encoder.layer.5.attention.self.value.bias',
 'tabert_model._bert_model.bert.encoder.layer.5.output.LayerNorm.bias',
 'tabert_model._bert_model.bert.encoder.layer.6.attention.output.LayerNorm.bias',
 'tabert_model._bert_model.bert.encoder.layer.7.attention.self.key.bias',
 'tabert_model._bert_model.b

In [96]:
del ckpt

In [187]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('predicts', 26744),
 ('golds', 4856),
 ('hyp_list', 4856),
 ('ref_list', 4856),
 ('Activation', 1064),
 ('Attention', 1064),
 ('Average', 1064),
 ('BasicTextFieldEmbedder', 1064),
 ('BertConfig', 1064),
 ('BertModel', 1064),
 ('BertPreTrainedModel', 1064),
 ('BertTokenizer', 1064),
 ('BilinearMatrixAttention', 1064),
 ('BooleanAccuracy', 1064),
 ('BucketBatchSampler', 1064),
 ('CategoricalAccuracy', 1064),
 ('CnnEncoder', 1064),
 ('ConditionalRandomField', 1064),
 ('CosineEmbeddingLoss', 1064),
 ('CosineMatrixAttention', 1064),
 ('DataLoader', 1064),
 ('DatasetReader', 1064),
 ('DecoderNet', 1064),
 ('Embedding', 1064),
 ('GradientDescentTrainer', 1064),
 ('LinearMatrixAttention', 1064),
 ('MSELoss', 1064),
 ('MatrixAttention', 1064),
 ('MeanAbsoluteError', 1064),
 ('Model', 1064),
 ('ModuleList', 1064),
 ('Params', 1064),
 ('Predictor', 1064),
 ('PretrainedTransformerEmbedder', 1064),
 ('PretrainedTransformerMismatchedEmbedder', 1064),
 ('PyTorchDataLoader', 1064),
 ('PytorchSeq2Seq

In [121]:
%load_ext memory_profiler

In [122]:
%memit

peak memory: 1329.27 MiB, increment: 0.56 MiB


In [128]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [89]:
d = dict()
try:
    _ = d[('agg*', 6)]
except KeyError as e:
    print(e)
    print(type(e), type(e.with_traceback(None)), e.args)
    assert False
except Exception as e:
    print('xxx')

('agg*', 6)
<class 'KeyError'> <class 'KeyError'> (('agg*', 6),)


AssertionError: 