In [1]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import sys

# ROOT is "<parent of the current working directory>/backend"; it is appended
# to sys.path once (the membership test keeps re-runs of this cell from
# adding duplicates). Because it is derived from os.getcwd(), the result
# depends on where the kernel was started.
ROOT = os.path.join(os.path.dirname(os.getcwd()), 'backend')
if ROOT not in sys.path:
    sys.path.append(ROOT)

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
    AutoTokenizer
)

def set_seed(seed):
  """Seed Python's `random`, NumPy and PyTorch with the same value.

  Args:
    seed: integer seed handed unchanged to every generator.

  The CUDA generators are seeded as well, but only when CUDA is available.
  """
  for seeder in (random.seed, np.random.seed, torch.manual_seed):
    seeder(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

# Seed every generator handled by set_seed with one fixed value for this run.
seed_number = 42
set_seed(seed_number)

In [2]:
# Project-local modules; presumably resolved through the ROOT entry that the
# first cell appended to sys.path, so that cell has to run before this one.
from app.dataService import globalVariable as GV
# from app.dataService.dataService import DataService
print(f"GV.SPIDER_FOLDER: {GV.SPIDER_FOLDER}")
from app.dataService.utils.processSQL import process_sql
from app.dataService.utils.processSQL import decode_sql
from app.dataService import sql2sql

# Read the schema description from tables.json in the Spider folder.
# `db_schema` and `tables` are indexed by database name in the pair-building
# cell below; `db_names` is not used in the visible cells.
db_schema, db_names, tables = process_sql.get_schemas_from_json(os.path.join(GV.SPIDER_FOLDER, "tables.json"))
# dataService = DataService("spider")

GV.SPIDER_FOLDER: /data2/xingbo/chi2022/seqNLI/backend/app/dataService/../data/dataset/spider


In [3]:
# Which sequence file to read (f_name) and which split label the derived
# pair file carries (f_type).
f_name, f_type = "query_seq", "train"
# Alternative selection for the dev split:
# f_name, f_type = "query_seq_1", "dev"
with open(os.path.join(GV.SPIDER_FOLDER, f"{f_name}.json"), "r") as f:
    seq_data = json.loads(f.read())

In [4]:
# convert sequences to pairwise pairs
def extract_select_ent(select):
    """Collect the column text of every item in a decoded SELECT clause.

    Expected layout (taken from the structure this function unpacks):
        select:   (isDistinct, [(agg_id, val_unit), ...])
        val_unit: (unit_op, col_unit1, col_unit2)
        col_unit: (agg_id, col, isDistinct)

    Args:
        select: decoded SELECT clause in the layout above.

    Returns:
        One entry per select item: the column of col_unit1, followed by
        ", " and the column of col_unit2 when a second column unit exists.
    """
    entities = []
    for item in select[1]:
        _unit_op, first_unit, second_unit = item[1]
        _agg, first_col, _distinct = first_unit
        if second_unit is None:
            entities.append(first_col)
            continue
        _agg2, second_col, _distinct2 = second_unit
        entities.append(first_col + (", " + second_col))
    return entities

def organize_meta(table):
    """Describe every table as "<table name> (<col>, <col>, ...)".

    Args:
        table: mapping with "table_names" (list of names) and "column_names"
            (list of entries whose item 0 is a table index and item 1 the
            column name).

    Returns:
        One string per table, in "table_names" order. Column entries whose
        table index matches no table position are left out.
    """
    names, columns = table["table_names"], table["column_names"]
    return [
        name + " (" + ", ".join(col[1] for col in columns if col[0] == idx) + ")"
        for idx, name in enumerate(names)
    ]

# Build (current SELECT entities -> next SELECT entities) pairs from every two
# consecutive queries of each query sequence, plus the table description of
# the database each pair belongs to.
sources = []
targets = []
metas = []
for db_name in seq_data.keys():
    sqlseqs = seq_data[db_name]["sql"]
    if not any(sqlseqs):
        # No query to parse for this database, so nothing is looked up for it.
        continue
    # The schema and the table description depend only on the database, so
    # they are built once here instead of once per query.
    db_tables = tables[db_name]
    schema = process_sql.Schema(db_schema[db_name], db_tables)
    meta = organize_meta(db_tables)
    for sqlseq in sqlseqs:
        # Parse and decode each query exactly once (previously every query
        # except the last was processed twice: as "current" and as "next").
        select_ents = [
            extract_select_ent(
                decode_sql.decode_select(process_sql.get_sql(schema, sql), db_tables)
            )
            for sql in sqlseq
        ]
        # Pair each query with its successor; a sequence with fewer than two
        # queries contributes nothing. The appended lists are shared objects
        # (the same `meta` per database, the same entity list as target of
        # one pair and source of the next), so treat them as read-only.
        for curr_select_ents, next_select_ents in zip(select_ents, select_ents[1:]):
            sources.append(curr_select_ents)
            targets.append(next_select_ents)
            metas.append(meta)

print("len(sources): ", len(sources), len(targets), len(metas))

# Preview the first pair only when there is one, instead of raising IndexError.
if sources:
    print(sources[0])

# NOTE: the next cell reads f"{f_name}_{f_type}.json"; re-enable this dump
# whenever that file has to be (re)generated.
# with open(os.path.join(GV.SPIDER_FOLDER ,f"{f_name}_{f_type}.json"), "w") as f:
#     json.dump({
#         "meta": metas,
#         "source": sources,
#         "target": targets
#     }, f)

len(sources):  5170 5170 5170
['activity: activity name']


In [5]:
# Load the pair file for the selected f_name / f_type.
# NOTE: no active code in this notebook writes this file; the dump that
# produces it is commented out in the previous cell, so it must already
# exist on disk.
with open(os.path.join(GV.SPIDER_FOLDER ,f"{f_name}_{f_type}.json"), "r") as f:
    pairs_data = json.load(f)

In [6]:
# Show the top-level keys of the loaded pair file.
pairs_data.keys()

dict_keys(['meta', 'source', 'target'])

In [7]:
# Load the tokenizer of the pretrained 't5-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('t5-base')

In [8]:
# Build the dataset through the project helper, handing it the tokenizer.
# NOTE(review): which file/split the helper reads is decided inside sql2sql
# and is not visible here — confirm it matches f_name / f_type above.
dataset = sql2sql.get_spider_dataset(tokenizer)

len(self.sources): 5170


In [9]:
# Print the first target entry of the dataset as a quick sanity check.
print(f"dataset.targets[0]: {dataset.targets[0]}")

dataset.targets[0]: {'input_ids': tensor([[1756,   10, 1429,    1,    1,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 

## Train

In [10]:
# Create the output directory from Python rather than through a shell escape,
# so the cell does not depend on a POSIX `mkdir`. exist_ok=True keeps the
# "no error if it already exists" behaviour of `mkdir -p`.
os.makedirs('t5_seq', exist_ok=True)

In [11]:
# Take the project's hyper-parameter dict and override the run-specific
# entries. update() modifies sql2sql.args_dict itself (no copy is made).
args_dict = sql2sql.args_dict
args_dict.update({'output_dir': 't5_seq', 'num_train_epochs': 10, 'vocab_file': 'tokenizer_config.json'})
args = argparse.Namespace(**args_dict)

# Keyword arguments later unpacked into pl.Trainer(...).
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    #early_stop_callback=False,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
}

In [12]:
# Instantiate the project's fine-tuning module from the Namespace built above
# and create the Lightning Trainer from the prepared keyword arguments.
model = sql2sql.T5FineTuner(args)
trainer = pl.Trainer(**train_params)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
# Run fine-tuning. Only the model is passed, so the data loaders presumably
# come from the model itself — verify in sql2sql.T5FineTuner.
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

len(self.sources): 5170
len(self.sources): 5170


Training: -1it [00:00, ?it/s]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


In [None]:
# Save the wrapped model into 't5_seq' with save_pretrained so that it can be
# reloaded later via T5ForConditionalGeneration.from_pretrained, then switch
# it to evaluation mode.
model.model.save_pretrained('t5_seq')
model.model.eval()

In [None]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

In [21]:
# Draw a single shuffled batch of `batch_size` examples to inspect by hand.
batch_size = 16
loader = DataLoader(dataset, batch_size = batch_size, shuffle = True)
it = iter(loader)
batch = next(it)
# Displayed as the cell result: shape of the source ids of this batch.
batch["source_ids"].shape

torch.Size([16, 512])

In [22]:
# Generate predictions for the sampled batch. The device is chosen at run
# time: CUDA when available (same behaviour as before), otherwise the CPU,
# so the cell no longer fails on a host without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cudamodel = model.model.to(device)
outs = cudamodel.generate(input_ids=batch['source_ids'].to(device),
                          attention_mask=batch['source_mask'].to(device),
                          max_length=512)

In [23]:
# Turn generated ids, source ids and target ids of the batch back into text,
# dropping special tokens. Note that `sources` and `targets` are rebound here
# to lists of decoded strings.
dec = tokenizer.batch_decode(outs, skip_special_tokens=True)
sources = tokenizer.batch_decode(batch['source_ids'], skip_special_tokens=True)
targets = tokenizer.batch_decode(batch['target_ids'], skip_special_tokens=True)

In [25]:
# Print input, expected output and prediction for each decoded example.
# Iterating over the decoded lists themselves (instead of range(batch_size))
# avoids an IndexError when the batch holds fewer than batch_size examples.
for source_text, target_text, predicted_text in zip(sources, targets, dec):
    # textwrap.wrap returns the text as a list of lines of at most 100 characters.
    lines = textwrap.wrap("Review:\n%s\n" % source_text, width=100)
    print("\n".join(lines))
    print("\nActual: %s" % target_text)
    print("Predicted: %s" % predicted_text)
    print("=====================================================================\n")

Review: mountain: name *, mountain (mountain id, name, height, prominence, range, country), climber
(climber id, name, country, time, points, mountain id)

Actual: climber: name, mountain: height
Predicted: mountain: name

Review: customer orders: actual delivery date *, reference payment methods (payment method code,
payment method description), reference service types (service type code, parent service type code,
service type description), addresses (address id, line 1, line 2, city town, state county, other
details), products (product id, product name, product price, product description, other product
service details), marketing regions (marketing region code, marketing region name, marketing region
descriptrion, other details), clients (client id, address id, customer email address, customer name,
customer phone, other details), drama workshop groups (workshop group id, address id, currency code,
marketing region code, store name, store phone, store email address, other details), p

In [26]:
# Generate a prediction for every example of `dataset` and collect decoded
# predictions (`outputs`) next to decoded references (`targets`).
# shuffle=False: evaluation gains nothing from shuffling, and a fixed order
# keeps outputs[i] / targets[i] aligned with dataset[i] for error analysis.
loader = DataLoader(dataset, batch_size=16, num_workers=4, shuffle=False)
model.model.eval()
outputs = []
targets = []
# Use CUDA when available (same behaviour as before), otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cudamodel = model.model.to(device)
for batch in tqdm(loader):
    outs = cudamodel.generate(input_ids=batch['source_ids'].to(device),
                              attention_mask=batch['source_mask'].to(device),
                              max_length=512)

    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    target = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch["target_ids"]]

    outputs.extend(dec)
    targets.extend(target)

  0%|          | 0/324 [00:00<?, ?it/s]

In [27]:
# `targets` and `outputs` are lists of decoded strings, so every distinct
# string acts as one class label here: a prediction only counts as correct
# when it equals the reference string exactly.
# NOTE(review): both lists were produced from `dataset` above — confirm
# whether that is a held-out split or the data the model was trained on.
print("f1-macro: {}".format(metrics.f1_score(targets, outputs, average='macro')))
print("f1-micro: {}".format(metrics.f1_score(targets, outputs, average='micro')))

f1-macro: 0.06327901523557042
f1-micro: 0.19709864603481628
