In [1]:
import argparse
import os
import logging
import pytorch_lightning as pl
from modelT5 import T5Model


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danielsmith/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local modelT5) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Module-level logger; LoggingCallback below writes its metric reports to it.
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Report the trainer's callback metrics when validation or testing ends.

  Both hooks always log a banner line, then stop unless
  ``pl_module.is_logger()`` returns a truthy value (that method belongs to
  the LightningModule and is not defined in this notebook). The entries
  ``"log"`` and ``"progress_bar"`` of ``trainer.callback_metrics`` are
  never reported; all other entries are emitted in sorted key order.
  """

  def on_validation_end(self, trainer, pl_module):
    """Log every reportable metric after a validation run."""
    logger.info("***** Validation results *****")
    if not pl_module.is_logger():
      return

    results = trainer.callback_metrics
    for name in sorted(results):
      # Skip the two bookkeeping entries.
      if name in ("log", "progress_bar"):
        continue
      logger.info("{} = {}\n".format(name, str(results[name])))

  def on_test_end(self, trainer, pl_module):
    """Log every reportable metric after a test run and save them to disk.

    The metrics are written to ``test_results.txt`` inside
    ``pl_module.hparams.output_dir``; the file is opened in ``"w"`` mode,
    so an existing file is overwritten.
    """
    logger.info("***** Test results *****")
    if not pl_module.is_logger():
      return

    results = trainer.callback_metrics
    results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
    with open(results_path, "w") as writer:
      for name in sorted(results):
        if name in ("log", "progress_bar"):
          continue
        # One formatted line serves both the log record and the file.
        line = "{} = {}\n".format(name, str(results[name]))
        logger.info(line)
        writer.write(line)

In [4]:
# Run configuration: paths, model/tokenizer names and hyperparameters.
# A later cell turns this mapping into an argparse.Namespace.
args_dict = {
    # --- data and output locations ---
    "data_dir": "data/COMBINED",  # where the data files live
    "output_dir": "checkpoints/T5Model",  # where checkpoints are saved
    # --- model and tokenizer ---
    "model_name_or_path": "t5-base",
    "tokenizer_name_or_path": "t5-base",
    # --- dataset names ---
    "train_dataset": "train",
    "test_dataset": "test",
    # --- optimisation settings ---
    "max_seq_length": 512,
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 6,
    "eval_batch_size": 6,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    "early_stop_callback": False,
    # Set to True for 16-bit training (requires apex to be installed).
    "fp_16": False,
    # apex optimisation level; see
    # https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": "O1",
    # With 16-bit training enabled, pick a sensible value; 0.5 is a good default.
    "max_grad_norm": 1.0,
    "seed": 42,
}

In [5]:
# Make sure the checkpoint directory exists. exist_ok=True keeps the cell safe
# to re-run and avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(args_dict['output_dir'], exist_ok=True)

# Overrides for this run: a single epoch and shorter sequences.
# Note that this mutates args_dict in place.
args_dict.update({'num_train_epochs':1, 'max_seq_length':256})

# Expose the settings as attributes (args.output_dir, args.fp_16, ...),
# which is how the following cells read them.
args = argparse.Namespace(**args_dict)
print(args)

Namespace(adam_epsilon=1e-08, data_dir='data/COMBINED', early_stop_callback=False, eval_batch_size=6, fp_16=False, gradient_accumulation_steps=16, learning_rate=0.0003, max_grad_norm=1.0, max_seq_length=256, model_name_or_path='t5-base', n_gpu=1, num_train_epochs=1, opt_level='O1', output_dir='checkpoints/T5Model', seed=42, test_dataset='test', tokenizer_name_or_path='t5-base', train_batch_size=6, train_dataset='train', warmup_steps=0, weight_decay=0.0)


In [6]:
# Checkpointing: monitor "val_loss" (lower is better) and keep the five best
# checkpoints under args.output_dir, named by epoch and validation loss.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir,
    filename='{epoch}-{val_loss:.2f}',
    monitor="val_loss",
    mode="min",
    save_top_k=5,
)

# Keyword arguments forwarded to pl.Trainer.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    max_epochs=args.num_train_epochs,
    precision=16 if args.fp_16 else 32,
    gradient_clip_val=args.max_grad_norm,
    # The ModelCheckpoint instance is registered through `callbacks`. The
    # Trainer's `checkpoint_callback` argument only accepts a bool in recent
    # Lightning releases (deprecated in 1.5, removed in 1.7), so handing the
    # instance to it triggers a deprecation warning and the settings above
    # are not applied.
    callbacks=[LoggingCallback(), checkpoint_callback],
)

In [7]:
# Instantiate the project's T5Model (imported from modelT5) with the run settings.
model = T5Model(args)

In [8]:
# Build the Lightning Trainer from the keyword arguments collected in train_params.
trainer = pl.Trainer(**train_params)

  rank_zero_deprecation(
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [9]:
# Run training. The recorded run of this cell stopped with
# "TypeError: optimizer_step() got an unexpected keyword argument 'on_tpu'".
# NOTE(review): presumably an optimizer_step override in T5Model (modelT5, not
# shown here) has a signature that does not match the installed Lightning
# version - confirm and update it there.
trainer.fit(model)

  rank_zero_warn(
Missing logger folder: /Users/danielsmith/Documents/Current Classes/dl/dl-group-project/lightning_logs

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

data/COMBINED/test.tsv




data/COMBINED/train.tsv


Training: 0it [00:00, ?it/s]

TypeError: optimizer_step() got an unexpected keyword argument 'on_tpu'

In [None]:
# Save the wrapped network (model.model, listed as T5ForConditionalGeneration
# in the training summary) under the name 't5model_weights'.
# NOTE(review): this cell has no execution count; if it is run after a failed
# fit it would save whatever weights the model currently holds - confirm that
# training completed first.
model.model.save_pretrained('t5model_weights')