In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/transformer-drg-style-transfer-master/

/content/drive/.shortcut-targets-by-id/1gvgdEyQQFFN43xnL2_DdI4rUtMI5gnmU/transformer-drg-style-transfer-master


In [3]:
!nvidia-smi

Sat Apr 22 00:39:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    46W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
! pip install SentencePiece==0.1.94

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SentencePiece==0.1.94
  Downloading sentencepiece-0.1.94-cp39-cp39-manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SentencePiece
Successfully installed SentencePiece-0.1.94


In [5]:
!pip install transformers
!pip install pytorch_lightning==0.7.5 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://u

In [6]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
def set_seed(seed):
  """Seed Python's `random`, NumPy and PyTorch (plus every CUDA device, when CUDA is available)."""
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

set_seed(42)

# Model

In [8]:
class T5FineTuner(pl.LightningModule):
  """LightningModule wrapping a pretrained T5 model and tokenizer for fine-tuning.

  `hparams` is any attribute-style object (an `argparse.Namespace` in this
  notebook). Attributes read by this class: model_name_or_path,
  tokenizer_name_or_path, weight_decay, learning_rate, adam_epsilon,
  warmup_steps, train_batch_size, eval_batch_size, n_gpu,
  gradient_accumulation_steps and num_train_epochs. The same object is also
  forwarded to `get_dataset` when the dataloaders are built.

  The hook signatures follow the pytorch_lightning release pinned by this
  notebook (0.7.5).
  """

  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()

    # Kept under a custom attribute name instead of `self.hparams`.
    self.my_hparams = hparams

    self.model = T5ForConditionalGeneration.from_pretrained(self.my_hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(self.my_hparams.tokenizer_name_or_path, max_length=128)

  def is_logger(self):
    """Return True when the trainer's `proc_rank` is <= 0 (used to gate logging)."""
    return self.trainer.proc_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    """Delegate to the wrapped T5 model and return its output unchanged."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    """Run one forward pass on `batch` and return the model's loss.

    `batch` must provide "source_ids", "source_mask", "target_ids" and
    "target_mask".
    """
    # Clone first: the pad -> -100 rewrite below must not modify the caller's
    # batch tensor in place.
    labels = batch["target_ids"].clone()
    # Pad positions are replaced with -100, the label value the Hugging Face
    # loss skips (see the T5ForConditionalGeneration `labels` documentation).
    labels[labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    # When labels are supplied the first element of the model output is the loss.
    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    """Compute the training loss and expose it under "train_loss" for logging."""
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}

  # NOTE(review): Lightning's epoch-end hook is spelled `training_epoch_end`; under
  # the current name the trainer presumably never calls this method. Confirm before
  # renaming, since enabling it changes what gets logged at the end of each epoch.
  def train_epoch_end(self, outputs):
    """Average the "loss" entries of `outputs` into "avg_train_loss"."""
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    """Compute the loss for one validation batch."""
    loss = self._step(batch)
    return {"val_loss": loss}

  def validation_epoch_end(self, outputs):
    """Average the per-batch "val_loss" values and report the mean as "val_loss"."""
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    """Build the AdamW optimizer with separate decay / no-decay parameter groups.

    Only the optimizer is returned; the linear warmup/decay schedule is created
    later in `train_dataloader` (it needs the dataset size) and stepped manually
    in `optimizer_step`.
    """
    model = self.model
    # Parameters whose name contains one of these fragments are exempt from weight
    # decay. "layer_norm.weight" is the spelling used by T5's normalisation layers;
    # the BERT-style "LayerNorm.weight" alone does not match them.
    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.my_hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.my_hparams.learning_rate, eps=self.my_hparams.adam_epsilon)
    # Remembered so train_dataloader can attach the LR scheduler to it.
    self.opt = optimizer
    return [optimizer]

  def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
    """Apply one optimizer update, clear gradients and advance the LR schedule."""
    if self.trainer.use_tpu:
      # NOTE(review): `xm` is not imported anywhere in this notebook, so this branch
      # would raise NameError on TPU; it presumably refers to
      # torch_xla.core.xla_model - confirm and import it before running on TPU.
      xm.optimizer_step(optimizer)
    else:
      optimizer.step()
    optimizer.zero_grad()
    self.lr_scheduler.step()

  def get_tqdm_dict(self):
    """Return the progress-bar entries: formatted average loss and current learning rate."""
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the training DataLoader and, as a side effect, the LR scheduler.

    Reads `self.opt`, so `configure_optimizers` must already have run.
    """
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.my_hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.my_hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    # Total optimizer updates: batches per epoch, divided by the accumulation
    # factor, times the number of epochs.
    t_total = (
        (len(dataloader.dataset) // (self.my_hparams.train_batch_size * max(1, self.my_hparams.n_gpu)))
        // self.my_hparams.gradient_accumulation_steps
        * float(self.my_hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.my_hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Build the validation DataLoader (no shuffling)."""
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.my_hparams)
    return DataLoader(val_dataset, batch_size=self.my_hparams.eval_batch_size, num_workers=4)
    

In [9]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Callback that reports the trainer's callback metrics through the module logger."""

  def on_validation_end(self, trainer, pl_module):
    """Log each callback metric (except "log" / "progress_bar") after validation."""
    logger.info("***** Validation results *****")
    if not pl_module.is_logger():
      return

    results = trainer.callback_metrics
    for name in sorted(results):
      if name in ["log", "progress_bar"]:
        continue
      logger.info(f"{name} = {results[name]!s}\n")

  def on_test_end(self, trainer, pl_module):
    """Log each callback metric after testing and mirror it into test_results.txt."""
    logger.info("***** Test results *****")
    if not pl_module.is_logger():
      return

    results = trainer.callback_metrics
    # The results file is written into the run's output_dir.
    results_path = os.path.join(pl_module.my_hparams.output_dir, "test_results.txt")
    with open(results_path, "w") as writer:
      for name in sorted(results):
        if name in ["log", "progress_bar"]:
          continue
        line = f"{name} = {results[name]!s}\n"
        logger.info(line)
        writer.write(line)

In [10]:
# Default hyper-parameters for a run; later cells override the paths and wrap
# the mapping in an argparse.Namespace.
args_dict = {
    "data_dir": "",  # folder holding the data files
    "output_dir": "",  # folder where checkpoints are saved
    "model_name_or_path": 't5-base',
    "tokenizer_name_or_path": 't5-base',
    "max_seq_length": 128,
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    "early_stop_callback": False,
    "fp_16": False,  # set to True for 16-bit training (requires apex to be installed)
    "opt_level": 'O1',  # apex optimisation level, see https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "max_grad_norm": 1.0,  # with 16-bit training pick a sensible value; 0.5 is a good default
    "seed": 42,
}
     

# Yelp review data preparation

In [None]:
! pip install SentencePiece==0.1.94

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration

In [12]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
df = pd.read_csv('/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/t5/train.csv', on_bad_lines='skip', delimiter='\t', header=None, index_col=None)

In [None]:
df[0]

0                                    i was sadly mistaken .
1         so on to the hoagies , the italian is general ...
2              minimal meat and a ton of shredded lettuce .
3         nothing really special & not worthy of the $ _...
4             second , the steak hoagie , it is atrocious .
                                ...                        
443254                                  love these donuts !
443255                       lots of smiles from everyone .
443256                           the donuts are delicious .
443257       fresh in the morning and conveniently priced .
443258    rainbow donuts offers amazing donuts in variou...
Name: 0, Length: 443259, dtype: object

In [13]:
from tqdm.auto import tqdm

In [14]:
class YelpDataset(Dataset):
  """Yelp sentences and their labels, pre-tokenized for T5 fine-tuning.

  The source is a headerless, tab-separated file: column 0 is the input text,
  column 1 the target label (converted to a string before tokenizing). Every
  row is tokenized eagerly when the dataset is constructed.

  Args:
    tokenizer: tokenizer exposing `batch_encode_plus` (a T5Tokenizer in this notebook).
    data_dir: directory expected to contain `<type_path>.csv`.
    type_path: split name, e.g. "train" or "val".
    max_len: length every input sequence is padded (and, if enabled, truncated) to.
    truncation: forwarded to the tokenizer's `truncation` argument.
  """

  # Previously hard-coded location; still used when the requested split file is missing.
  DEFAULT_FILE_PATH = '/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/t5/train.csv'
  # Targets are padded/truncated to this many tokens.
  TARGET_MAX_LEN = 2

  def __init__(self, tokenizer, data_dir, type_path,  max_len=512, truncation = True):
    self.file_path = self._resolve_file_path(data_dir, type_path)

    self.truncation = truncation

    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []

    self._build()

  @classmethod
  def _resolve_file_path(cls, data_dir, type_path):
    """Return `<data_dir>/<type_path>.csv` if it exists, else DEFAULT_FILE_PATH.

    Falling back (with a warning) keeps notebooks that only have train.csv
    working, while letting a real validation split be picked up when present.
    """
    if data_dir and type_path:
      candidate = os.path.join(data_dir, "{}.csv".format(type_path))
      if os.path.isfile(candidate):
        return candidate
      logging.getLogger(__name__).warning(
          "%s not found; falling back to %s", candidate, cls.DEFAULT_FILE_PATH
      )
    return cls.DEFAULT_FILE_PATH

  def __len__(self):
    return len(self.inputs)

  def __getitem__(self, index):
    """Return the tensors for one example as a dict of ids and attention masks."""
    source = self.inputs[index]
    target = self.targets[index]

    # Each example was encoded as a batch of one, so only the leading batch
    # dimension is dropped; squeeze(0) keeps length-1 sequences one-dimensional.
    return {
        "source_ids": source["input_ids"].squeeze(0),
        "source_mask": source["attention_mask"].squeeze(0),
        "target_ids": target["input_ids"].squeeze(0),
        "target_mask": target["attention_mask"].squeeze(0),
    }

  def _build(self):
    self._buil_examples_from_files(self.file_path)

  def _encode(self, text, max_length):
    """Tokenize a single string, padded to `max_length`, as PyTorch tensors."""
    return self.tokenizer.batch_encode_plus(
        [text],
        max_length=max_length,
        padding="max_length",  # replaces the deprecated pad_to_max_length=True
        truncation=self.truncation,
        return_tensors="pt",
    )

  def _buil_examples_from_files(self, files):
    """Read the TSV at `files` and append one tokenized input/target pair per row."""
    df = pd.read_csv(files, on_bad_lines='skip', delimiter='\t', header=None, index_col=None)
    # Rows with any missing field are dropped.
    df = df.dropna()
    texts = df[0]
    labels = df[1].astype(str)

    # Passing `total` lets tqdm render a real progress bar.
    for text, label in tqdm(zip(texts, labels), total=len(df)):
      self.inputs.append(self._encode(text, self.max_len))
      self.targets.append(self._encode(label, self.TARGET_MAX_LEN))
     

In [None]:
#dataset = YelpDataset(tokenizer, 'hAGdcghdwe', 'val',  max_len=128, truncation=True)
#len(dataset)

In [None]:
#dataset = dataset[0]
#dataset

In [None]:
# data = dataset[-1]
# print(tokenizer.decode(data['source_ids']))
# print(tokenizer.decode(data['target_ids']))

In [15]:
# Point the run at the Yelp T5 data / checkpoint folders, then freeze the
# settings into a namespace so they can be read as attributes.
args_dict.update(
    data_dir='/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/t5/',
    output_dir='/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/',
    num_train_epochs=2,
)
args = argparse.Namespace(**args_dict)

# Checkpoints go to output_dir with the "checkpoint" prefix; the callback
# monitors "val_loss" (mode "min") with save_top_k=5.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=args.output_dir,
    prefix="checkpoint",
    monitor="val_loss",
    mode="min",
    save_top_k=5,
)

# Keyword arguments for pl.Trainer, derived from the run settings.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    "early_stop_callback": False,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
    "checkpoint_callback": checkpoint_callback,
    "callbacks": [LoggingCallback()],
}



In [16]:
def get_dataset(tokenizer, type_path, args):
  """Construct a YelpDataset for the given split.

  Forwards `tokenizer` and `type_path` unchanged and takes `data_dir` and
  `max_seq_length` (passed as `max_len`) from `args`.
  """
  dataset_kwargs = dict(
      tokenizer=tokenizer,
      data_dir=args.data_dir,
      type_path=type_path,
      max_len=args.max_seq_length,
  )
  return YelpDataset(**dataset_kwargs)

# Initialize model

In [17]:
args

Namespace(data_dir='/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/t5/', output_dir='/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/', model_name_or_path='t5-base', tokenizer_name_or_path='t5-base', max_seq_length=128, learning_rate=0.0003, weight_decay=0.0, adam_epsilon=1e-08, warmup_steps=0, train_batch_size=8, eval_batch_size=8, num_train_epochs=2, gradient_accumulation_steps=16, n_gpu=1, early_stop_callback=False, fp_16=False, opt_level='O1', max_grad_norm=1.0, seed=42)

In [18]:
model = T5FineTuner(args)

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Initialize model

In [None]:
checkpoint = torch.load('/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/checkpointepoch=0.ckpt')
print(checkpoint.keys())

dict_keys(['epoch', 'global_step', 'checkpoint_callback_best', 'optimizer_states', 'lr_schedulers', 'state_dict', 'hparams', 'hparams_type'])


In [19]:
trainer = pl.Trainer(**train_params)

In [20]:
trainer.fit(model)



0it [00:00, ?it/s]



Validation sanity check: 0it [00:00, ?it/s]

0it [00:00, ?it/s]



0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

In [None]:
# Resume training: build a fresh module, then let the Trainer load the saved
# checkpoint before fitting.
model = T5FineTuner(args)
#trainer = pl.Trainer(**train_params)

# automatically restores model, epoch, step, LR schedulers, etc...
# NOTE(review): T5FineTuner.configure_optimizers returns only the optimizer and the
# module builds its own `self.lr_scheduler` inside train_dataloader, so that
# scheduler is presumably NOT among the states restored here - confirm.

trainer = pl.Trainer(**train_params, resume_from_checkpoint= 
                     "/content/drive/MyDrive/transformer-drg-style-transfer-master/data/yelp/checkpointepoch=0.ckpt")

# train the model
trainer.fit(model)

0it [00:00, ?it/s]



Validation sanity check: 0it [00:00, ?it/s]

0it [00:00, ?it/s]



0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

In [None]:
!mkdir t5_base_yelp_sentiment

In [21]:
model.model.save_pretrained('t5_base_yelp_sentiment')