In [1]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import sys

# Make the `backend` folder that sits next to the current working directory importable.
# NOTE(review): the path is derived from os.getcwd(), so it depends on where the kernel was started.
ROOT = os.path.join(os.path.dirname(os.getcwd()), 'backend')
if ROOT not in sys.path:
    sys.path.append(ROOT)

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
    AutoTokenizer
)

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``.

    When CUDA is available, the generators of all CUDA devices are seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every random number generator once, before any data or model code runs.
seed_number = 42
set_seed(seed_number)

In [2]:
# Project-local modules; these are expected to resolve through ROOT, which was added to sys.path earlier.
from app.dataService import globalVariable as GV
# from app.dataService.dataService import DataService
print(f"GV.SPIDER_FOLDER: {GV.SPIDER_FOLDER}")
from app.dataService.utils.processSQL import process_sql
from app.dataService.utils.processSQL import decode_sql
from app.dataService import sql2sql

# Read the schema information of every database listed in <SPIDER_FOLDER>/tables.json.
db_schema, db_names, tables = process_sql.get_schemas_from_json(os.path.join(GV.SPIDER_FOLDER, "tables.json"))
# dataService = DataService("spider")

GV.SPIDER_FOLDER: /data2/xingbo/chi2022/seqNLI/backend/app/dataService/../data/dataset/spider


In [3]:
# f_name selects the sequence file to read; f_type is the split label used later
# when naming the generated pairs file (<f_name>_<f_type>.json).
f_name = "query_seq"
f_type = "train"
# Alternative configuration:
# f_name = "query_seq_1"
# f_type = "dev"
# Load the raw query sequences from <SPIDER_FOLDER>/<f_name>.json.
with open(os.path.join(GV.SPIDER_FOLDER , f"{f_name}.json"), "r") as f:
    seq_data = json.load(f)

In [4]:
# Keyword vocabulary: a leading "*", the aggregate functions, then the arithmetic operators.
# NOTE(review): "*" appears twice (presumably once as the wildcard column and once as
# multiplication) — confirm the duplicate is intended.
keywords = ["*"] + ["avg", "min", "max", "count"] + ["+", "-", "*", "/"]
", ".join(keywords)

'*, avg, min, max, count, +, -, *, /'

In [5]:
from termcolor import colored
# Convert each SQL sequence into (current query, next query) pairs.
def col_unit_2_string(col_unit):
    """Render a decoded column unit as text.

    Args:
        col_unit: ``None`` or a 3-item sequence ``(agg_id, col, isDistinct)`` of strings.

    Returns:
        ``""`` for ``None``. Otherwise ``"<isDistinct> <agg_id>(<col>)"`` when ``agg_id``
        is not ``"none"``, or ``"<isDistinct> <col>"`` when it is, with surrounding
        whitespace stripped.
    """
    if col_unit is None:
        return ""
    agg_id, col, isDistinct = col_unit
    # Only wrap the column in a call when an aggregate is actually present.
    body = col if agg_id == "none" else agg_id + "(" + col + ")"
    return " ".join([isDistinct, body]).strip()
    
def extract_filters(sql, table):
    """Placeholder for filter extraction; not implemented.

    Both arguments are ignored and ``None`` is always returned.
    """
    return None

def extract_groupby(sql, table):
    """Collect item ``[1]`` of every decoded GROUP BY entry of ``sql``.

    ``sql["groupBy"]`` is decoded against ``table`` with ``decode_sql.decode_groupby``
    and the second item of each decoded entry is returned, in order
    (presumably the column name — confirm against ``decode_groupby``).
    """
    groupby_cols = []
    for decoded_unit in decode_sql.decode_groupby(sql["groupBy"], table):
        groupby_cols.append(decoded_unit[1])
    return groupby_cols
    
def extract_select_ent(select):
    """Render every entry of a decoded SELECT clause as a string.

    Args:
        select: decoded select clause ``(isDistinct, [(agg_id, val_unit), ...])`` where
            ``val_unit`` is ``(unit_op, col_unit1, col_unit2)`` and each ``col_unit`` is
            ``(agg_id, col, isDistinct)`` or ``None``.

    Returns:
        One string per selected entry: the first column unit, followed by
        ``"<unit_op> <col_unit2>"`` when a second column unit exists, and wrapped as
        ``"<agg_id> (...)"`` when the entry-level ``agg_id`` is not ``"none"``.
    """
    # NOTE(review): the clause-level distinct flag (select[0]) is not reflected in the
    # output — confirm that dropping it is intended.
    cols_ents = []
    for entry in select[1]:
        agg_id, val_unit = entry[0], entry[1]
        unit_op, col_unit1, col_unit2 = val_unit
        # col_unit_2_string already copes with a missing column unit, so the column
        # unit is not unpacked here (the unpacked parts were never used anyway).
        ent = col_unit_2_string(col_unit1)

        if col_unit2 is not None:
            ent = " ".join([ent, unit_op, col_unit_2_string(col_unit2)])

        if agg_id != "none":
            ent = agg_id + " (" + ent + ")"
        cols_ents.append(ent)
    return cols_ents

def organize_meta(table):
    """Describe each table of a schema as ``"<table name> (<col>, <col>, ...)"``.

    Args:
        table: mapping with ``"table_names"`` (list of names) and ``"column_names"``
            (entries whose item 0 is the index of the owning table and whose item 1
            is the column name).

    Returns:
        One description per table, in ``table_names`` order. Column entries whose
        table index matches no table position are left out.
    """
    table_names = table["table_names"]
    col_names = table["column_names"]

    def describe(table_id, table_name):
        owned = (entry[1] for entry in col_names if entry[0] == table_id)
        return table_name + " (" + ", ".join(owned) + ")"

    return [describe(idx, name) for idx, name in enumerate(table_names)]

def _encode_query(sql, schema, table):
    """Render one SQL string as the ``select (...) groupby (...)`` text used in the pairs."""
    sql_label = process_sql.get_sql(schema, sql)
    select_ents = ", ".join(extract_select_ent(decode_sql.decode_select(sql_label, table)))
    groupby_ents = ", ".join(extract_groupby(sql_label, table))
    return f"select ({select_ents})" + f" groupby ({groupby_ents})"


# Build (current query -> next query) pairs from every SQL sequence, together with the
# table description of the database each pair belongs to.
sources = []
targets = []
metas = []
for db_name, db_entry in seq_data.items():
    db_table = tables[db_name]
    # The schema and the table description depend only on the database, so build them
    # once per database rather than once per query.
    schema = process_sql.Schema(db_schema[db_name], db_table)
    meta = organize_meta(db_table)
    for sqlseq in db_entry["sql"]:
        # Encode every query exactly once, then pair each one with its successor.
        encoded = [_encode_query(sql, schema, db_table) for sql in sqlseq]
        for curr_query, next_query in zip(encoded, encoded[1:]):
            sources.append(curr_query)
            targets.append(next_query)
            metas.append(meta)

print("len(sources): ", len(sources), len(targets), len(metas))

# Guard the preview so an input without any pair does not raise IndexError.
if sources:
    print(sources[0])

# Persist the pairs as <SPIDER_FOLDER>/<f_name>_<f_type>.json.
with open(os.path.join(GV.SPIDER_FOLDER ,f"{f_name}_{f_type}.json"), "w") as f:
    json.dump({
        "meta": metas,
        "source": sources,
        "target": targets
    }, f)

len(sources):  5170 5170 5170
select (activity: activity name) groupby ()


In [6]:
# Read back the generated pairs file (<f_name>_<f_type>.json) as a sanity check.
with open(os.path.join(GV.SPIDER_FOLDER ,f"{f_name}_{f_type}.json"), "r") as f:
    pairs_data = json.load(f)

In [7]:
# Show the top-level keys of the loaded pairs file.
pairs_data.keys()

dict_keys(['meta', 'source', 'target'])

In [8]:
# Tokenizer of the pretrained 't5-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('t5-base')

In [9]:
# Build the "train" dataset through the project helper.
# NOTE(review): presumably this reads the pairs file generated earlier — confirm in sql2sql.
dataset = sql2sql.get_spider_dataset(tokenizer,"train")

In [10]:
# Peek at the first encoded target to check its token ids and padding.
print(f"dataset.targets[0]: {dataset.targets[0]}")

dataset.targets[0]: {'input_ids': tensor([[ 1738,    41, 13362,    41, 21661,    10,  1429,    61,    61,   563,
           969,    41,    61,     1,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   

## Train

In [14]:
# Create the output directory used for the fine-tuned model (no error if it already exists).
!mkdir -p t5_seq

In [15]:
# Start from the project's default hyper-parameters and override the run-specific ones.
# NOTE: `update` modifies sql2sql.args_dict in place, so the overrides are visible
# through the module attribute as well.
args_dict = sql2sql.args_dict
args_dict.update({
    'output_dir': 't5_seq',
    'num_train_epochs': 10,
    'vocab_file': 'tokenizer_config.json',
})
args = argparse.Namespace(**args_dict)

# Keyword arguments for pl.Trainer, derived from the hyper-parameters above.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    #early_stop_callback=False,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
}

In [16]:
# Instantiate the project's fine-tuning module and a Lightning trainer configured from train_params.
model = sql2sql.T5FineTuner(args)
trainer = pl.Trainer(**train_params)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [17]:
# Run fine-tuning.
# NOTE(review): no dataloaders are passed here, so the module is presumably expected
# to provide them itself — confirm in sql2sql.T5FineTuner.
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [26]:
# Save the wrapped T5 model with save_pretrained so it can be reloaded later via
# T5ForConditionalGeneration.from_pretrained('t5_seq').
model.model.save_pretrained('t5_seq')
# Switch to evaluation mode before generating.
# NOTE(review): as the last expression of the cell this echoes the full module repr;
# a trailing `;` would suppress that output.
model.model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [27]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

In [28]:
# Draw one shuffled batch of `batch_size` examples and show the shape of its encoded inputs.
batch_size = 16
loader = DataLoader(dataset, batch_size = batch_size, shuffle = True)
it = iter(loader)
batch = next(it)
batch["source_ids"].shape

torch.Size([16, 512])

In [29]:
# Move the T5 model to the GPU and generate predictions for the sampled batch
# (generation is capped at max_length=512).
cudamodel = model.model.to('cuda')
outs = cudamodel.generate(input_ids=batch['source_ids'].cuda(), 
                              attention_mask=batch['source_mask'].cuda(), 
                              max_length=512)

In [30]:
# Decode predictions, inputs and references back to text, dropping special tokens.
# NOTE(review): `sources` and `targets` rebind the names of the pair lists built
# earlier in the notebook, so those lists are no longer reachable after this cell.
dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
sources = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['source_ids']]
targets = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['target_ids']]

In [31]:
# Print every example of the batch: wrapped input, reference and prediction.
# Iterating over the decoded lists (instead of range(batch_size)) keeps this correct
# when the batch holds fewer than batch_size examples.
for source_text, target_text, predicted_text in zip(sources, targets, dec):
    # textwrap.wrap collapses the embedded newlines, so the input prints as "Input: ..." lines.
    lines = textwrap.wrap("Input:\n%s\n" % source_text, width=100)
    print("\n".join(lines))
    print("\nActual: %s" % target_text)
    print("Predicted: %s" % predicted_text)
    print("=====================================================================\n")

Input: select (country: country name) groupby () *, avg, min, max, count, +, -, /, select, groupby,
country (country id, country name, capital, official native language), team (team id, name), match
season (season, player, position, country, team, draft pick number, draft class, college), player
(player id, player, years played, total wl, singles wl, doubles wl, team)

Actual: select (team: name) groupby ()
Predicted: select (match season: position) groupby ()

Input: select (reference calendar: day number, all documents: date stored) groupby () *, avg, min,
max, count, +, -, /, select, groupby, reference document types (document type code, document type
name, document type description), reference calendar (calendar date, day number), reference
locations (location code, location name, location description), roles (role code, role name, role
description), all documents (document id, date stored, document type code, document name, document
description, other details), employees (employee

In [None]:
# Generate a prediction for every example of the dataset, batch by batch, and collect
# the decoded predictions and references side by side.
loader = DataLoader(dataset, batch_size=16, num_workers=4, shuffle=True)
model.model.eval()
outputs = []
targets = []
cudamodel = model.model.to('cuda')
for batch in tqdm(loader):
    outs = cudamodel.generate(
        input_ids=batch['source_ids'].cuda(),
        attention_mask=batch['source_mask'].cuda(),
        max_length=512,
    )

    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    target = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch["target_ids"]]

    # Predictions and references are appended in the same order, so they stay aligned.
    outputs += dec
    targets += target

  0%|          | 0/324 [00:00<?, ?it/s]

In [None]:
# Report macro- and micro-averaged F1 between references and predictions.
# NOTE(review): both lists hold decoded strings, so every distinct string is treated
# as its own class label — confirm that this exact-match style of scoring is intended.
for average in ("macro", "micro"):
    print(f"f1-{average}: {metrics.f1_score(targets, outputs, average=average)}")