In [1]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import sys

ROOT = os.path.join(os.path.dirname(os.getcwd()), 'backend')
if ROOT not in sys.path:
    sys.path.append(ROOT)

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
    AutoTokenizer
)

def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

seed_number = 42
set_seed(seed_number)

In [2]:
from app.dataService import sql2sql

## Explore T5
- https://huggingface.co/t5-base

In [3]:
tokenizer = AutoTokenizer.from_pretrained('t5-base')
# add special tokens
# special_tokens = {'bos_token': '<BOS>',
#                   'cls_token': '<CSL>',
#                   'additional_special_tokens': ['<MY_NEW_TOKEN>', '<ANOTHER_TOKEN>']}
# tokenizer.add_special_tokens(special_tokens_dict=special_tokens)

In [4]:
args_dict = sql2sql.args_dict

## IMDB sentiment analysis

In [5]:
from datasets import load_dataset
dataset = sql2sql.ImdbDataset(tokenizer, load_dataset("imdb"), 'train', ['negative</s>', 'positive</s>'], max_len=512)
len(dataset)

Reusing dataset imdb (/home/xingbo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)
Loading cached shuffled indices for dataset at /home/xingbo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-8354e7761f356f2f.arrow


len(self.dataset_split): 25000


6278

In [17]:
len(dataset)

6162

In [6]:
print(f"dataset.targets[0]: {dataset.targets[0]}")

dataset.targets[0]: {'input_ids': tensor([[2841,    1,    1,    0,    0,    0,    0,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}


### Train

In [None]:
!mkdir -p t5_base_imdb_sentiment

In [8]:
args_dict.update({'output_dir': 't5_base_imdb_sentiment', 'num_train_epochs': 1, 'vocab_file': 'tokenizer_config.json'})
args = argparse.Namespace(**args_dict)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    #early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
)

In [11]:
model = sql2sql.T5FineTuner(args)
trainer = pl.Trainer(**train_params)
trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Reusing dataset imdb (/home/xingbo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)
Loading cached shuffled indices for dataset at /home/xingbo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-de8fb117c43e3917.arrow
Reusing dataset imdb (/home/xingbo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)
Loading cached shuffled indices for dataset at /home/xingbo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-4136b8a3420daa1d.arrow


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [14]:
## save the model this way so next time you can load it using T5ForConditionalGeneration.from_pretrained
model.model.save_pretrained('t5_base_imdb_sentiment')
model.model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

### Evaluation

In [15]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

In [18]:
dataset = sql2sql.ImdbDataset(tokenizer, load_dataset("imdb"), 'test', ['negative</s>', 'positive</s>'], max_len=512)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

Reusing dataset imdb (/home/xingbo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)
Loading cached shuffled indices for dataset at /home/xingbo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-de8fb117c43e3917.arrow


In [19]:
it = iter(loader)
batch = next(it)
batch["source_ids"].shape

torch.Size([32, 512])

In [20]:
cudamodel = model.model.to('cuda')
outs = cudamodel.generate(input_ids=batch['source_ids'].cuda(), 
                              attention_mask=batch['source_mask'].cuda(), 
                              max_length=11)

In [21]:
dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['source_ids']]
targets = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['target_ids']]

In [22]:
for i in range(32):
    lines = textwrap.wrap("Review:\n%s\n" % texts[i], width=100)
    print("\n".join(lines))
    print("\nActual sentiment: %s" % targets[i])
    print("Predicted sentiment: %s" % dec[i])
    print("=====================================================================\n")

Review: Okay so there were the odd hole in the plot you could drive a zeppelin through but how well
was the emotional stuff handled It would have been so easy to descend into cheesiness but the writer
pulled it off The image of the ex female cyberman making crying noises as sheit saw her reflection
after regaining her emotions is one that will stay with me forever Thats twice now the monsters have
shown a soft side and been presented fleetingly sympathetically the previous being the last Dalek
from series one but by Jove its worked Add to that the other exfemale who had been upgraded on the
eve of her wedding and Jackie Tyler recognising her husband after she had become cyber and you have
a permanent throat lump Keep it up

Actual sentiment: positive
Predicted sentiment: positive

Review: Nightscream is a TV Movie so its bound to be pretty dire especially as its a supposed horror
film This young girl is haunted by dreams as she arrives in a small town where there was a murder of
a woma

In [24]:
loader = DataLoader(dataset, batch_size=32, num_workers=4, shuffle=True)
model.model.eval()
outputs = []
targets = []
cudamodel = model.model.to('cuda')
for batch in tqdm(loader):
  outs = cudamodel.generate(input_ids=batch['source_ids'].cuda(), 
                              attention_mask=batch['source_mask'].cuda(), 
                              max_length=11)
 
  dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
  target = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch["target_ids"]]
  
  outputs.extend(dec)
  targets.extend(target)

  0%|          | 0/196 [00:00<?, ?it/s]

In [26]:
print(metrics.classification_report(targets, outputs))

              precision    recall  f1-score   support

    negative       0.89      0.96      0.92      3130
    positive       0.96      0.88      0.92      3118

    accuracy                           0.92      6248
   macro avg       0.92      0.92      0.92      6248
weighted avg       0.92      0.92      0.92      6248

