FineTuning WikiSQL  
https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb

FineTuning Summarization  
https://towardsdatascience.com/fine-tuning-a-t5-transformer-for-any-summarization-task-82334c64c81

In [1]:
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
)

In [2]:
model = T5ForConditionalGeneration.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
tokenizer = T5Tokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [3]:
def print_encoding(model_inputs, indent=4):
    indent_str = " " * indent
    print("{")
    for k, v in model_inputs.items():
        print(indent_str + k + ":")
        print(indent_str + indent_str + str(v))
    print("}")

In [4]:
# You can pass multiple strings into the tokenizer and pad them as you need
model_inputs = tokenizer(["number of patients grouped by race.",
                         "How many patients does each race group contain?",
                          "Counts of patients taking drug <ARG-DRUG><0> and <ARG-DRUG><1> within <ARG-TIMEDAYS><0> days.",
                          "Counts of patients with condition <ARG-CONDITION><0>, <ARG-CONDITION><1>, and <ARG-CONDITION><2>.",
                         ],
                         return_tensors="pt",
                         padding="max_length",
                         max_length = 128,
                         truncation=True)
print(f"Pad token: {tokenizer.pad_token} | Pad token id: {tokenizer.pad_token_id}")
print("Padding:")
print_encoding(model_inputs)

Pad token: <pad> | Pad token id: 0
Padding:
{
    input_ids:
        tensor([[  381,    13,  1221,     3, 31801,    57,  1964,     5,     1,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     

In [5]:
# You can also decode a whole batch at once:
print("Batch Decode:")
print(tokenizer.batch_decode(model_inputs.input_ids))
print()
print("Batch Decode: (no special characters)")
print(tokenizer.batch_decode(model_inputs.input_ids, skip_special_tokens=True))

Batch Decode:
['number of patients grouped by race.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>', 'How many patients does each race group contain?</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <

In [6]:
model_inputs.input_ids[0]

tensor([  381,    13,  1221,     3, 31801,    57,  1964,     5,     1,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [7]:
tokens = tokenizer.convert_ids_to_tokens(model_inputs.input_ids[0])
print(tokens)

['▁number', '▁of', '▁patients', '▁', 'grouped', '▁by', '▁race', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad

In [8]:
output = model.generate(**model_inputs,
            max_length=512,
            use_cache=True,
            num_beams=2,
        )

output

tensor([[    0, 32099,  1221,     3, 31801,    57,  1964,     5,  7720,    13,
          1221,     3, 31801,    57,  1964,     5,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [    0, 32099,  1221,   399,  1964,   563,    58, 32098,  1221,   399,
       

In [9]:
tokenizer.batch_decode(output[0])

['<pad>',
 '<extra_id_0>',
 'patients',
 '',
 'grouped',
 'by',
 'race',
 '.',
 'Number',
 'of',
 'patients',
 '',
 'grouped',
 'by',
 'race',
 '.',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '

#### Using translate English to SQL:

https://huggingface.co/mrm8488/t5-base-finetuned-wikiSQL?text=translate+English+to+SQL%3A+How+many+models+were+finetuned+using+BERT+as+base+model%3F

In [10]:
model_inputs = tokenizer(["translate English to SQL: number of patients grouped by race.",
                         "translate English to SQL: How many patients does each race group contain?",
                          "translate English to SQL: Counts of patients taking drug <ARG-DRUG><0> and <ARG-DRUG><1> within <ARG-TIMEDAYS><0> days.",
                          "translate English to SQL: Counts of patients with condition <ARG-CONDITION><0>, <ARG-CONDITION><1>, and <ARG-CONDITION><2>.",
                         ],
                         return_tensors="pt",
                         padding="max_length", 
                         max_length=128,
                         truncation=True)
print(f"Pad token: {tokenizer.pad_token} | Pad token id: {tokenizer.pad_token_id}")
print("Padding:")
print_encoding(model_inputs)

Pad token: <pad> | Pad token id: 0
Padding:
{
    input_ids:
        tensor([[13959,  1566,    12, 12558,    10,   381,    13,  1221,     3, 31801,
            57,  1964,     5,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     

In [11]:
# print("Batch Decode:")
# print(tokenizer.batch_decode(model_inputs.input_ids))
# print()
print("Batch Decode: (no special characters)")
print(tokenizer.batch_decode(model_inputs.input_ids, skip_special_tokens=True))

Batch Decode: (no special characters)
['translate English to SQL: number of patients grouped by race.', 'translate English to SQL: How many patients does each race group contain?', 'translate English to SQL: Counts of patients taking drug ARG-DRUG>0> and ARG-DRUG>1> within ARG-TIMEDAYS>0> days.', 'translate English to SQL: Counts of patients with condition ARG-CONDITION>0>, ARG-CONDITION>1>, and ARG-CONDITION>2>.']


In [12]:
output = model.generate(**model_inputs,
            max_length=512,
            use_cache=True,
            num_beams=2,
        )

output

tensor([[    0,     3, 23143, 14196,  2847, 17161, 17656, 21680,   953,   549,
         17444,   427, 10949,  3274,  1531,    15,    26,     1,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [    0,     3, 23143, 14196,  2847, 17161, 17656, 21680,   953,   549,
         17444,   427, 10949,  1531,  3274, 10949,  1531,     1,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [    0,     3, 23143, 14196,     3, 10628,     7,    13,  1221,   838,

In [13]:
tokenizer.batch_decode(model_inputs.input_ids[0], skip_special_tokens=False, clean_up_tokenization_spaces=True)[:20]

['translate',
 'English',
 'to',
 'SQL',
 ':',
 'number',
 'of',
 'patients',
 '',
 'grouped',
 'by',
 'race',
 '.',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [14]:
tokenizer.batch_decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

['<pad>',
 '',
 'SEL',
 'ECT',
 'CO',
 'UNT',
 'Patient',
 'FROM',
 'table',
 'W',
 'HER',
 'E',
 'Race',
 '=',
 'Group',
 'e',
 'd',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [15]:
tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

'SELECT COUNT Patient FROM table WHERE Race = Grouped'

In [16]:
model.config

T5Config {
  "_name_or_path": "mrm8488/t5-base-finetuned-wikiSQL",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English 

In [17]:
Paths = {
    'Data': './../../Data/',
    'Train': './../../Data/PreparedText2SQL/train.csv',
    'Validation': './../../Data/PreparedText2SQL/validation.csv',
    'Test': './../../Data/PreparedText2SQL/test.csv',
}

In [18]:
import pandas as pd

df_train = pd.read_csv(Paths['Train'])
df_train

Unnamed: 0.1,Unnamed: 0,Base_Question,Folded_Question,Query_Generated
0,0,Count of patients grouped by race.,How many patients are in each race group?,"SELECT race, COUNT(DISTINCT pe1.person_id) AS ..."
1,1,Count of patients grouped by race.,How many patients fit into each race group?,"SELECT race, COUNT(DISTINCT pe1.person_id) AS ..."
2,2,Count of patients grouped by race.,How many patients end up in each race group?,"SELECT race, COUNT(DISTINCT pe1.person_id) AS ..."
3,3,Count of patients grouped by race.,How many patients were in each race group?,"SELECT race, COUNT(DISTINCT pe1.person_id) AS ..."
4,4,Count of patients grouped by race.,How many patients fitted into each race group?,"SELECT race, COUNT(DISTINCT pe1.person_id) AS ..."
...,...,...,...,...
596956,596956,counts of patients with taking drug <ARG-DRUG>...,tell me how many patients are taking <ARG-DRUG...,SELECT DATE_PART_YEAR(drug_exposure_start_date...
596957,596957,counts of patients with taking drug <ARG-DRUG>...,tell me how many people are taking <ARG-DRUG><...,SELECT DATE_PART_YEAR(drug_exposure_start_date...
596958,596958,counts of patients with taking drug <ARG-DRUG>...,tell me how many persons are taking <ARG-DRUG>...,SELECT DATE_PART_YEAR(drug_exposure_start_date...
596959,596959,counts of patients with taking drug <ARG-DRUG>...,tell me how many individuals are taking <ARG-D...,SELECT DATE_PART_YEAR(drug_exposure_start_date...


In [24]:
max_folded = 0
max_query = 0
for i, f in df_train.iterrows():
    folded = len(f['Folded_Question']) #f['Folded_Question']
    query = len(f['Query_Generated'])  #f['Query_Generated']
    
    if folded > max_folded:
        max_folded = folded
    if query > max_query:
        max_query = query
        
max_folded, max_query

(198, 1190)

In [25]:
b = [ 3, 23143, 14196,  2847, 17161,   599,     3, 15438, 25424,  6227,
    3,    26,    52,  5411,  6075,   834,    23,    26,    61, 21680,
   41,  6306, 20557, 20211,  4275,    26, 13534,   834, 19300,  4334,
    3,    26,    52,   536,     3, 15355,  3162,    41,   599,  6306,
 3913, 19046,    18, 20050,  5329,  6048,   908,  6306,  4280,   517,
   18,  3913, 19046,   908,  6306,   632,   908,  4417,  9215,   784,
 3913, 19046,    18, 20050,  5329,  6048,   908,  6306,  4280,   517,
   18,  3913, 19046,   908,  6306,   536,   908,    61,  4417,  9215,
  784,  3913, 19046,    18, 20050,  5329,  6048,   908,  6306,  4280,
  517,    18,  3913, 19046,   908,  6306,   357,   908,    61,  9191,
    3,    26,    52,  5411,    26, 13534,   834,  1018,  6873,   834,
   23,    26,  2423,  1018,  6873,   834,    23,    26,    61,     3,
  117,     1]

In [26]:
c = [    3, 23143, 14196,  2847, 17161,   599,     3, 15438, 25424,  6227,
    3,    26,    52,  5411,  6075,   834,    23,    26,    61, 21680,
   41,  6306, 20557, 20211,  4275,    26, 13534,   834, 19300,  4334,
    3,    26,    52,   536,     3, 15355,  3162,    41,   599,  6306,
 3913, 19046,    18, 20050,  5329,  6048,   908,  6306,  4280,   517,
   18,  3913, 19046,   908,  6306,   632,   908,  4417,  9215,   784,
 3913, 19046,    18, 20050,  5329,  6048,   908,  6306,  4280,   517,
   18,  3913, 19046,   908,  6306,   536,   908,    61,  4417,  9215,
  784,  3913, 19046,    18, 20050,  5329,  6048,   908,  6306,  4280,
  517,    18,  3913, 19046,   908,  6306,   357,   908,    61,  9191,
    3,    26,    52,  5411,    26, 13534,   834,  1018,  6873,   834,
   23,    26,  2423,  1018,  6873,   834,    23,    26,    61,     3,
  117,     1]

In [27]:
b == c

True