In [1]:
from __future__ import absolute_import

import sys
import os

try:
    from dotenv import find_dotenv, load_dotenv
except:
    pass

import argparse

try:
    sys.path.append(os.path.join(os.path.dirname(__file__), '../src'))
except:
    sys.path.append(os.path.join(os.getcwd(), '../src'))
    
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torchcontrib.optim import SWA
from torch.optim import Adam, SGD 
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau, CyclicLR, \
                                     CosineAnnealingWarmRestarts

from consNLP.data import load_data, data_utils, fetch_dataset
from consNLP.models import transformer_models, activations, layers, losses, scorers
from consNLP.visualization import visualize
from consNLP.trainer.trainer import BasicTrainer, PLTrainer, test_pl_trainer
from consNLP.trainer.trainer_utils import set_seed, _has_apex, _torch_lightning_available, _has_wandb, _torch_gpu_available, _num_gpus, _torch_tpu_available
from consNLP.preprocessing.custom_tokenizer import BERTweetTokenizer

if _has_apex:
    #from torch.cuda import amp
    from apex import amp

if _torch_tpu_available:
    import torch_xla
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.xla_multiprocessing as xmp

if _has_wandb:
    import wandb
    try:
        load_dotenv(find_dotenv())
        wandb.login(key=os.environ['WANDB_API_KEY'])
    except:
        _has_wandb = False

if _torch_lightning_available:
    import pytorch_lightning as pl
    from pytorch_lightning import Trainer, seed_everything
    from pytorch_lightning.loggers import WandbLogger
    from pytorch_lightning.metrics.metric import NumpyMetric
    from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, Callback

import tokenizers
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, AutoConfig

I0806 14:58:45.963731 4539198912 file_utils.py:41] PyTorch version 1.5.0 available.
I0806 14:58:53.984734 4539198912 file_utils.py:57] TensorFlow version 2.2.0-rc3 available.
I0806 14:58:56.724100 4539198912 modeling.py:230] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
wandb: Appending key for api.wandb.ai to your netrc file: /Users/victor/.netrc
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
I0806 14:58:59.111385 4539198912 textcleaner.py:37] 'pattern' package not found; tag filters are not available for English
W0806 14:58:59.377578 4539198912 deprecation.py:323] From /Users/victor/anaconda3/lib/python3.7/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term
wandb: Appending

In [2]:
load_dotenv(find_dotenv())

True

In [3]:
fetch_dataset(project_dir='../',download_from_kaggle=True,\
              kaggle_dataset='lakshmi25npathi/imdb-dataset-of-50k-movie-reviews')

I0806 14:59:06.972804 4539198912 fetch_dataset.py:16] making final data set from raw data
I0806 14:59:06.973808 4539198912 fetch_dataset.py:21] project directory ../
I0806 14:59:06.974637 4539198912 fetch_dataset.py:30] output path ../data/raw
I0806 14:59:13.335819 4539198912 fetch_dataset.py:95] download complete


In [4]:
parser = argparse.ArgumentParser(prog='Torch trainer function',conflict_handler='resolve')

parser.add_argument('--train_data', type=str, default='../data/raw/IMDB Dataset.csv', required=False,
                    help='train data')
parser.add_argument('--val_data', type=str, default='', required=False,
                    help='validation data')
parser.add_argument('--test_data', type=str, default=None, required=False,
                    help='test data')

parser.add_argument('--task_type', type=str, default='binary_sequence_classification', required=False,
                    help='type of task')

parser.add_argument('--transformer_model_pretrained_path', type=str, default='roberta-base', required=False,
                    help='transformer model pretrained path or huggingface model name')
parser.add_argument('--transformer_config_path', type=str, default='roberta-base', required=False,
                    help='transformer config file path or huggingface model name')
parser.add_argument('--transformer_tokenizer_path', type=str, default='roberta-base', required=False,
                    help='transformer tokenizer file path or huggingface model name')
parser.add_argument('--bpe_vocab_path', type=str, default='', required=False,
                    help='bytepairencoding vocab file path')
parser.add_argument('--bpe_merges_path', type=str, default='', required=False,
                    help='bytepairencoding merges file path')
parser.add_argument('--berttweettokenizer_path', type=str, default='', required=False,
                    help='BERTweet tokenizer path')

parser.add_argument('--max_text_len', type=int, default=100, required=False,
                    help='maximum length of text')
parser.add_argument('--epochs', type=int, default=5, required=False,
                    help='number of epochs')
parser.add_argument('--lr', type=float, default=.00003, required=False,
                    help='learning rate')
parser.add_argument('--loss_function', type=str, default='bcelogit', required=False,
                    help='loss function')
parser.add_argument('--metric', type=str, default='f1', required=False,
                    help='scorer metric')

parser.add_argument('--use_lightning_trainer', type=bool, default=False, required=False,
                    help='if lightning trainer needs to be used')
parser.add_argument('--use_torch_trainer', type=bool, default=True, required=False,
                    help='if custom torch trainer needs to be used')
parser.add_argument('--use_apex', type=bool, default=False, required=False,
                    help='if apex needs to be used')
parser.add_argument('--use_gpu', type=bool, default=False, required=False,
                    help='GPU mode')
parser.add_argument('--use_TPU', type=bool, default=False, required=False,
                    help='TPU mode')
parser.add_argument('--num_gpus', type=int, default=0, required=False,
                    help='Number of GPUs')
parser.add_argument('--num_tpus', type=int, default=0, required=False,
                    help='Number of TPUs')

parser.add_argument('--train_batch_size', type=int, default=16, required=False,
                    help='train batch size')
parser.add_argument('--eval_batch_size', type=int, default=16, required=False,
                    help='eval batch size')

parser.add_argument('--model_save_path', type=str, default='../models/sentiment_classification/', required=False,
                    help='seed')

parser.add_argument('--wandb_logging', type=bool, default=False, required=False,
                    help='wandb logging needed')

parser.add_argument('--seed', type=int, default=42, required=False,
                    help='seed')

args, _ = parser.parse_known_args()

print ("Wandb Logging: {}, GPU: {}, Pytorch Lightning: {}, TPU: {}, Apex: {}".format(\
            _has_wandb and args.wandb_logging, _torch_gpu_available,\
            _torch_lightning_available and args.use_lightning_trainer, _torch_tpu_available, _has_apex))

Wandb Logging: False, GPU: False, Pytorch Lightning: False, TPU: False, Apex: False


In [5]:
reshape = False
final_activation = None
convert_output = None

if args.task_type == 'binary_sequence_classification':
    if args.metric != 'roc_auc_score': 
        convert_output = 'round'
    if args.loss_function == 'bcelogit':
        final_activation = 'sigmoid'
        
elif args.task_type == 'multiclass_sequence_classification':
    convert_output = 'max'
    
elif args.task_type == 'binary_token_classification':
    reshape = True
    if args.metric != 'roc_auc_score': 
        convert_output = 'round'
    if args.loss_function == 'bcelogit':
        final_activation = 'sigmoid'
        
elif args.task_type == 'multiclass_token_classification':
    reshape = True
    convert_output = 'max'

In [6]:
df = load_data.load_pandas_df(args.train_data,sep=',')
df = df.iloc[:1000]

In [7]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
model_save_dir = args.model_save_path
try:
    os.makedirs(model_save_dir)
except OSError:
    pass

In [9]:
df.sentiment, label2idx = data_utils.convert_categorical_label_to_int(df.sentiment, \
                                                             save_path=os.path.join(model_save_dir,'label2idx.pkl'))

In [10]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [11]:
from sklearn.model_selection import KFold

kf = KFold(5)

for train_index, val_index in kf.split(df.review, df.sentiment):
    break
    
train_df = df.iloc[train_index].reset_index(drop=True)
val_df = df.iloc[val_index].reset_index(drop=True)

In [12]:
train_df.shape, val_df.shape

((800, 2), (200, 2))

In [13]:
if args.berttweettokenizer_path:
    tokenizer = BERTweetTokenizer(args.berttweettokenizer_path)
else:
    tokenizer = AutoTokenizer.from_pretrained(args.transformer_model_pretrained_path)

if not args.berttweettokenizer_path:
    try:
        bpetokenizer = tokenizers.ByteLevelBPETokenizer(args.bpe_vocab_path, \
                                        args.bpe_merges_path)
    except:
        bpetokenizer = None 
else:
    bpetokenizer = None

I0806 14:59:18.932564 4539198912 configuration_utils.py:283] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at /Users/victor/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.117c81977c5979de8c088352e74ec6e70f5c66096c28b61d3c50101609b39690
I0806 14:59:18.933578 4539198912 configuration_utils.py:319] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL

In [14]:
train_dataset = data_utils.TransformerDataset(train_df.review, bpetokenizer=bpetokenizer, tokenizer=tokenizer, MAX_LEN=args.max_text_len, \
              target_label=train_df.sentiment, sequence_target=False, target_text=None, conditional_label=None, conditional_all_labels=None)

val_dataset = data_utils.TransformerDataset(val_df.review, bpetokenizer=bpetokenizer, tokenizer=tokenizer, MAX_LEN=args.max_text_len, \
              target_label=val_df.sentiment, sequence_target=False, target_text=None, conditional_label=None, conditional_all_labels=None)

In [15]:
config = AutoConfig.from_pretrained(args.transformer_config_path, output_hidden_states=True, output_attentions=True)
basemodel = AutoModel.from_pretrained(args.transformer_model_pretrained_path,config=config)
model = transformer_models.TransformerWithCLS(basemodel)

I0806 14:59:30.627707 4539198912 configuration_utils.py:283] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at /Users/victor/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.117c81977c5979de8c088352e74ec6e70f5c66096c28b61d3c50101609b39690
I0806 14:59:30.628615 4539198912 configuration_utils.py:319] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL

In [16]:
if _torch_tpu_available and args.use_TPU:
    train_sampler = torch.utils.data.distributed.DistributedSampler(
      train_dataset,
      num_replicas=xm.xrt_world_size(),
      rank=xm.get_ordinal(),
      shuffle=True
    )

    val_sampler = torch.utils.data.distributed.DistributedSampler(
      val_dataset,
      num_replicas=xm.xrt_world_size(),
      rank=xm.get_ordinal(),
      shuffle=False
    )

if _torch_tpu_available and args.use_TPU:
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size, sampler=train_sampler,
        drop_last=True,num_workers=2)

    val_data_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.eval_batch_size, sampler=val_sampler,
        drop_last=False,num_workers=1)
else:
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size)

    val_data_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.eval_batch_size)

### Run with Pytorch Trainer

In [18]:
if args.use_torch_trainer:
    device = torch.device("cuda" if _torch_gpu_available and args.use_gpu else "cpu")

    if _torch_tpu_available and args.use_TPU:
        device=xm.xla_device()

    print ("Device: {}".format(device))
    
    if args.use_TPU and _torch_tpu_available and args.num_tpus > 1:
        train_data_loader = torch_xla.distributed.parallel_loader.ParallelLoader(train_data_loader, [device])
        train_data_loader = train_data_loader.per_device_loader(device)


    trainer = BasicTrainer(model, train_data_loader, val_data_loader, device, args.transformer_model_pretrained_path, \
                               final_activation=final_activation, \
                               test_data_loader=val_data_loader)

    param_optimizer = list(trainer.model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(train_data_loader) * args.epochs)

    if _torch_tpu_available and args.use_TPU:
        optimizer = AdamW(optimizer_parameters, lr=args.lr*xm.xrt_world_size())
    else:
        optimizer = AdamW(optimizer_parameters, lr=args.lr)

    if args.use_apex and _has_apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")


    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    
    loss = losses.get_loss(args.loss_function)
    scorer = scorers.SKMetric(args.metric, convert=convert_output, reshape=reshape) 
    
    def _mp_fn(rank, flags, trainer, epochs, lr, metric, loss_function, optimizer, scheduler, model_save_path, num_gpus, num_tpus,  \
                max_grad_norm, early_stopping_rounds, snapshot_ensemble, is_amp, use_wandb, seed):
        torch.set_default_tensor_type('torch.FloatTensor')
        a = trainer.train(epochs, lr, metric, loss_function, optimizer, scheduler, model_save_path, num_gpus, num_tpus,  \
                max_grad_norm, early_stopping_rounds, snapshot_ensemble, is_amp, use_wandb, seed)

    FLAGS = {}
    if _torch_tpu_available and args.use_TPU:
        xmp.spawn(_mp_fn, args=(FLAGS, trainer, args.epochs, args.lr, scorer, loss, optimizer, scheduler, args.model_save_path, args.num_gpus, args.num_tpus, \
                 1, 3, False, args.use_apex, False, args.seed), nprocs=8, start_method='fork')
    else:
        use_wandb = _has_wandb and args.wandb_logging
        trainer.train(args.epochs, args.lr, scorer, loss, optimizer, scheduler, args.model_save_path, args.num_gpus, args.num_tpus,  \
                max_grad_norm=1, early_stopping_rounds=3, snapshot_ensemble=False, is_amp=args.use_apex, use_wandb=use_wandb, seed=args.seed)

elif args.use_lightning_trainer and _torch_lightning_available:
    from pytorch_lightning import Trainer, seed_everything
    seed_everything(args.seed)
    
    loss = losses.get_loss(args.loss_function)
    scorer = scorers.PLMetric(args.metric, convert=convert_output, reshape=reshape)
    
    log_args = {'description': args.transformer_model_pretrained_path, 'loss': loss.__class__.__name__, 'epochs': args.epochs, 'learning_rate': args.lr}

    if _has_wandb and not _torch_tpu_available and args.wandb_logging:
        wandb.init(project="Project",config=log_args)
        wandb_logger = WandbLogger()

    checkpoint_callback = ModelCheckpoint(
                filepath=args.model_save_path,
                save_top_k=1,
                verbose=True,
                monitor='val_loss',
                mode='min'
                )
    earlystop = EarlyStopping(
                monitor='val_loss',
                patience=3,
               verbose=False,
               mode='min'
               )

    if args.use_gpu and _torch_gpu_available:
        print ("using GPU")
        if args.wandb_logging:
            if _has_apex:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, logger=wandb_logger, precision=16, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, logger=wandb_logger, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            if _has_apex:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, precision=16, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    elif args.use_TPU and _torch_tpu_available:
        print ("using TPU")
        if _has_apex:
            trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    else:
        print ("using CPU")
        if args.wandb_logging:
            if _has_apex:
                trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            if _has_apex:
                trainer = Trainer(max_epochs=args.epochs, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(max_epochs=args.epochs, checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    num_train_steps = int(len(train_data_loader) * args.epochs)

    pltrainer = PLTrainer(num_train_steps, model, scorer, loss, args.lr, \
                          final_activation=final_activation, seed=42)

    #try:
    #    print ("Loaded model from previous checkpoint")
    #    pltrainer = PLTrainer.load_from_checkpoint(args.model_save_path)
    #except:
    #    pass

    trainer.fit(pltrainer, train_data_loader, val_data_loader) 


  0%|          | 0/50 [00:00<?, ?it/s][A

Device: cpu
[LOG] Total number of parameters to learn 124646401


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)

Current training Loss 0.704:   0%|          | 0/50 [00:08<?, ?it/s][A
Current training Loss 0.704:   2%|▏         | 1/50 [00:08<07:17,  8.92s/it][A
Current training Loss 0.653:   2%|▏         | 1/50 [00:17<07:17,  8.92s/it][A
Current training Loss 0.653:   4%|▍         | 2/50 [00:17<06:56,  8.67s/it][A
Current training Loss 0.692:   4%|▍         | 2/50 [00:25<06:56,  8.67s/it][A
Current training Loss 0.692:   6%|▌         | 3/50 [00:25<06:40,  8.52s/it][A
Current training Loss 0.757:   6%|▌         | 3/50 [00:31<06:40,  8.52s/it][A
Current training Loss 0.757:   8%|▊         | 4/50 [00:31<06:00,  7.85s/it][A
Current training Loss 0.704:   8%|▊         | 4/50 [00:38<06:00,  7.85s/it][A
Current training Loss 0.704:  10%|█         | 5/50 [00:38<05:37,  7.49s/it][A
Current training Loss 0.637:  10%|█         | 5/50 [00:44<05:37,  7.49s/it][A
Current tra

Running evaluation on whole training data



Current eval Loss 0.47:   0%|          | 0/50 [00:01<?, ?it/s][A
Current eval Loss 0.47:   2%|▏         | 1/50 [00:01<01:04,  1.31s/it][A
Current eval Loss 0.279:   2%|▏         | 1/50 [00:02<01:04,  1.31s/it][A
Current eval Loss 0.279:   4%|▍         | 2/50 [00:02<01:03,  1.31s/it][A
Current eval Loss 0.275:   4%|▍         | 2/50 [00:03<01:03,  1.31s/it][A
Current eval Loss 0.275:   6%|▌         | 3/50 [00:03<01:01,  1.31s/it][A
Current eval Loss 0.612:   6%|▌         | 3/50 [00:05<01:01,  1.31s/it][A
Current eval Loss 0.612:   8%|▊         | 4/50 [00:05<01:00,  1.31s/it][A
Current eval Loss 0.322:   8%|▊         | 4/50 [00:06<01:00,  1.31s/it][A
Current eval Loss 0.322:  10%|█         | 5/50 [00:06<00:59,  1.32s/it][A
Current eval Loss 0.341:  10%|█         | 5/50 [00:07<00:59,  1.32s/it][A
Current eval Loss 0.341:  12%|█▏        | 6/50 [00:07<00:57,  1.31s/it][A
Current eval Loss 0.364:  12%|█▏        | 6/50 [00:09<00:57,  1.31s/it][A
Current eval Loss 0.364:  14%|█▍  

Running evaluation on validation data



Current eval Loss 0.189:   0%|          | 0/13 [00:01<?, ?it/s][A
Current eval Loss 0.189:   8%|▊         | 1/13 [00:01<00:15,  1.32s/it][A
Current eval Loss 0.484:   8%|▊         | 1/13 [00:02<00:15,  1.32s/it][A
Current eval Loss 0.484:  15%|█▌        | 2/13 [00:02<00:14,  1.33s/it][A
Current eval Loss 0.221:  15%|█▌        | 2/13 [00:03<00:14,  1.33s/it][A
Current eval Loss 0.221:  23%|██▎       | 3/13 [00:03<00:13,  1.32s/it][A
Current eval Loss 0.2:  23%|██▎       | 3/13 [00:05<00:13,  1.32s/it]  [A
Current eval Loss 0.2:  31%|███       | 4/13 [00:05<00:11,  1.32s/it][A
Current eval Loss 0.46:  31%|███       | 4/13 [00:06<00:11,  1.32s/it][A
Current eval Loss 0.46:  38%|███▊      | 5/13 [00:06<00:10,  1.34s/it][A
Current eval Loss 0.438:  38%|███▊      | 5/13 [00:07<00:10,  1.34s/it][A
Current eval Loss 0.438:  46%|████▌     | 6/13 [00:07<00:09,  1.33s/it][A
Current eval Loss 0.556:  46%|████▌     | 6/13 [00:09<00:09,  1.33s/it][A
Current eval Loss 0.556:  54%|█████▍

Train loss = 0.309 Train metric = 0.882 Val loss = 0.414 Val metric = 0.79



  0%|          | 0/50 [00:00<?, ?it/s][A
Current training Loss 0.471:   0%|          | 0/50 [00:05<?, ?it/s][A
Current training Loss 0.471:   2%|▏         | 1/50 [00:05<04:28,  5.47s/it][A
Current training Loss 0.275:   2%|▏         | 1/50 [00:10<04:28,  5.47s/it][A
Current training Loss 0.275:   4%|▍         | 2/50 [00:10<04:18,  5.40s/it][A
Current training Loss 0.421:   4%|▍         | 2/50 [00:16<04:18,  5.40s/it][A
Current training Loss 0.421:   6%|▌         | 3/50 [00:16<04:14,  5.41s/it][A
Current training Loss 0.617:   6%|▌         | 3/50 [00:23<04:14,  5.41s/it][A
Current training Loss 0.617:   8%|▊         | 4/50 [00:23<04:42,  6.14s/it][A
Current training Loss 0.437:   8%|▊         | 4/50 [00:31<04:42,  6.14s/it][A
Current training Loss 0.437:  10%|█         | 5/50 [00:31<05:00,  6.69s/it][A
Current training Loss 0.4:  10%|█         | 5/50 [00:39<05:00,  6.69s/it]  [A
Current training Loss 0.4:  12%|█▏        | 6/50 [00:39<05:09,  7.03s/it][A
Current training Lo

Running evaluation on whole training data



Current eval Loss 0.309:   0%|          | 0/50 [00:01<?, ?it/s][A
Current eval Loss 0.309:   2%|▏         | 1/50 [00:01<01:14,  1.52s/it][A
Current eval Loss 0.097:   2%|▏         | 1/50 [00:02<01:14,  1.52s/it][A
Current eval Loss 0.097:   4%|▍         | 2/50 [00:02<01:11,  1.50s/it][A
Current eval Loss 0.102:   4%|▍         | 2/50 [00:04<01:11,  1.50s/it][A
Current eval Loss 0.102:   6%|▌         | 3/50 [00:04<01:09,  1.48s/it][A
Current eval Loss 0.337:   6%|▌         | 3/50 [00:06<01:09,  1.48s/it][A
Current eval Loss 0.337:   8%|▊         | 4/50 [00:06<01:09,  1.51s/it][A
Current eval Loss 0.135:   8%|▊         | 4/50 [00:07<01:09,  1.51s/it][A
Current eval Loss 0.135:  10%|█         | 5/50 [00:07<01:07,  1.51s/it][A
Current eval Loss 0.203:  10%|█         | 5/50 [00:08<01:07,  1.51s/it][A
Current eval Loss 0.203:  12%|█▏        | 6/50 [00:08<01:05,  1.48s/it][A
Current eval Loss 0.319:  12%|█▏        | 6/50 [00:10<01:05,  1.48s/it][A
Current eval Loss 0.319:  14%|█▍

Running evaluation on validation data



Current eval Loss 0.135:   0%|          | 0/13 [00:01<?, ?it/s][A
Current eval Loss 0.135:   8%|▊         | 1/13 [00:01<00:17,  1.49s/it][A
Current eval Loss 0.371:   8%|▊         | 1/13 [00:02<00:17,  1.49s/it][A
Current eval Loss 0.371:  15%|█▌        | 2/13 [00:02<00:16,  1.48s/it][A
Current eval Loss 0.103:  15%|█▌        | 2/13 [00:04<00:16,  1.48s/it][A
Current eval Loss 0.103:  23%|██▎       | 3/13 [00:04<00:14,  1.48s/it][A
Current eval Loss 0.22:  23%|██▎       | 3/13 [00:05<00:14,  1.48s/it] [A
Current eval Loss 0.22:  31%|███       | 4/13 [00:05<00:13,  1.47s/it][A
Current eval Loss 0.517:  31%|███       | 4/13 [00:07<00:13,  1.47s/it][A
Current eval Loss 0.517:  38%|███▊      | 5/13 [00:07<00:12,  1.55s/it][A
Current eval Loss 0.272:  38%|███▊      | 5/13 [00:09<00:12,  1.55s/it][A
Current eval Loss 0.272:  46%|████▌     | 6/13 [00:09<00:10,  1.53s/it][A
Current eval Loss 0.283:  46%|████▌     | 6/13 [00:10<00:10,  1.53s/it][A
Current eval Loss 0.283:  54%|███

Train loss = 0.17 Train metric = 0.949 Val loss = 0.319 Val metric = 0.847



  0%|          | 0/50 [00:00<?, ?it/s][A
Current training Loss 0.338:   0%|          | 0/50 [00:06<?, ?it/s][A
Current training Loss 0.338:   2%|▏         | 1/50 [00:06<05:33,  6.80s/it][A
Current training Loss 0.127:   2%|▏         | 1/50 [00:12<05:33,  6.80s/it][A
Current training Loss 0.127:   4%|▍         | 2/50 [00:12<05:10,  6.47s/it][A
Current training Loss 0.104:   4%|▍         | 2/50 [00:18<05:10,  6.47s/it][A
Current training Loss 0.104:   6%|▌         | 3/50 [00:18<04:53,  6.24s/it][A
Current training Loss 0.292:   6%|▌         | 3/50 [00:23<04:53,  6.24s/it][A
Current training Loss 0.292:   8%|▊         | 4/50 [00:23<04:38,  6.06s/it][A
Current training Loss 0.171:   8%|▊         | 4/50 [00:29<04:38,  6.06s/it][A
Current training Loss 0.171:  10%|█         | 5/50 [00:29<04:25,  5.91s/it][A
Current training Loss 0.251:  10%|█         | 5/50 [00:35<04:25,  5.91s/it][A
Current training Loss 0.251:  12%|█▏        | 6/50 [00:35<04:18,  5.86s/it][A
Current training 

Running evaluation on whole training data



Current eval Loss 0.238:   0%|          | 0/50 [00:01<?, ?it/s][A
Current eval Loss 0.238:   2%|▏         | 1/50 [00:01<01:12,  1.47s/it][A
Current eval Loss 0.073:   2%|▏         | 1/50 [00:02<01:12,  1.47s/it][A
Current eval Loss 0.073:   4%|▍         | 2/50 [00:02<01:10,  1.47s/it][A
Current eval Loss 0.066:   4%|▍         | 2/50 [00:04<01:10,  1.47s/it][A
Current eval Loss 0.066:   6%|▌         | 3/50 [00:04<01:08,  1.46s/it][A
Current eval Loss 0.129:   6%|▌         | 3/50 [00:05<01:08,  1.46s/it][A
Current eval Loss 0.129:   8%|▊         | 4/50 [00:05<01:07,  1.46s/it][A
Current eval Loss 0.101:   8%|▊         | 4/50 [00:07<01:07,  1.46s/it][A
Current eval Loss 0.101:  10%|█         | 5/50 [00:07<01:05,  1.46s/it][A
Current eval Loss 0.148:  10%|█         | 5/50 [00:08<01:05,  1.46s/it][A
Current eval Loss 0.148:  12%|█▏        | 6/50 [00:08<01:05,  1.49s/it][A
Current eval Loss 0.212:  12%|█▏        | 6/50 [00:10<01:05,  1.49s/it][A
Current eval Loss 0.212:  14%|█▍

Running evaluation on validation data



Current eval Loss 0.203:   0%|          | 0/13 [00:01<?, ?it/s][A
Current eval Loss 0.203:   8%|▊         | 1/13 [00:01<00:17,  1.45s/it][A
Current eval Loss 0.563:   8%|▊         | 1/13 [00:02<00:17,  1.45s/it][A
Current eval Loss 0.563:  15%|█▌        | 2/13 [00:02<00:15,  1.45s/it][A
Current eval Loss 0.057:  15%|█▌        | 2/13 [00:04<00:15,  1.45s/it][A
Current eval Loss 0.057:  23%|██▎       | 3/13 [00:04<00:14,  1.47s/it][A
Current eval Loss 0.257:  23%|██▎       | 3/13 [00:05<00:14,  1.47s/it][A
Current eval Loss 0.257:  31%|███       | 4/13 [00:05<00:13,  1.48s/it][A
Current eval Loss 0.494:  31%|███       | 4/13 [00:07<00:13,  1.48s/it][A
Current eval Loss 0.494:  38%|███▊      | 5/13 [00:07<00:11,  1.50s/it][A
Current eval Loss 0.463:  38%|███▊      | 5/13 [00:08<00:11,  1.50s/it][A
Current eval Loss 0.463:  46%|████▌     | 6/13 [00:08<00:10,  1.49s/it][A
Current eval Loss 0.662:  46%|████▌     | 6/13 [00:10<00:10,  1.49s/it][A
Current eval Loss 0.662:  54%|██

Train loss = 0.099 Train metric = 0.972 Val loss = 0.401 Val metric = 0.816



Current training Loss 0.241:   0%|          | 0/50 [00:05<?, ?it/s][A
Current training Loss 0.241:   2%|▏         | 1/50 [00:05<04:48,  5.88s/it][A
Current training Loss 0.079:   2%|▏         | 1/50 [00:11<04:48,  5.88s/it][A
Current training Loss 0.079:   4%|▍         | 2/50 [00:11<04:41,  5.87s/it][A
Current training Loss 0.081:   4%|▍         | 2/50 [00:17<04:41,  5.87s/it][A
Current training Loss 0.081:   6%|▌         | 3/50 [00:17<04:32,  5.80s/it][A
Current training Loss 0.133:   6%|▌         | 3/50 [00:23<04:32,  5.80s/it][A
Current training Loss 0.133:   8%|▊         | 4/50 [00:23<04:26,  5.79s/it][A
Current training Loss 0.108:   8%|▊         | 4/50 [00:28<04:26,  5.79s/it][A
Current training Loss 0.108:  10%|█         | 5/50 [00:28<04:18,  5.75s/it][A
Current training Loss 0.167:  10%|█         | 5/50 [00:34<04:18,  5.75s/it][A
Current training Loss 0.167:  12%|█▏        | 6/50 [00:34<04:11,  5.72s/it][A
Current training Loss 0.308:  12%|█▏        | 6/50 [00:40<0

Running evaluation on whole training data



Current eval Loss 0.284:   0%|          | 0/50 [00:01<?, ?it/s][A
Current eval Loss 0.284:   2%|▏         | 1/50 [00:01<01:08,  1.41s/it][A
Current eval Loss 0.168:   2%|▏         | 1/50 [00:02<01:08,  1.41s/it][A
Current eval Loss 0.168:   4%|▍         | 2/50 [00:02<01:07,  1.41s/it][A
Current eval Loss 0.22:   4%|▍         | 2/50 [00:04<01:07,  1.41s/it] [A
Current eval Loss 0.22:   6%|▌         | 3/50 [00:04<01:05,  1.40s/it][A
Current eval Loss 0.608:   6%|▌         | 3/50 [00:05<01:05,  1.40s/it][A
Current eval Loss 0.608:   8%|▊         | 4/50 [00:05<01:04,  1.41s/it][A
Current eval Loss 0.138:   8%|▊         | 4/50 [00:07<01:04,  1.41s/it][A
Current eval Loss 0.138:  10%|█         | 5/50 [00:07<01:02,  1.40s/it][A
Current eval Loss 0.021:  10%|█         | 5/50 [00:08<01:02,  1.40s/it][A
Current eval Loss 0.021:  12%|█▏        | 6/50 [00:08<01:01,  1.39s/it][A
Current eval Loss 0.135:  12%|█▏        | 6/50 [00:09<01:01,  1.39s/it][A
Current eval Loss 0.135:  14%|█▍ 

Running evaluation on validation data



Current eval Loss 0.675:   0%|          | 0/13 [00:01<?, ?it/s][A
Current eval Loss 0.675:   8%|▊         | 1/13 [00:01<00:16,  1.40s/it][A
Current eval Loss 0.411:   8%|▊         | 1/13 [00:02<00:16,  1.40s/it][A
Current eval Loss 0.411:  15%|█▌        | 2/13 [00:02<00:15,  1.42s/it][A
Current eval Loss 0.422:  15%|█▌        | 2/13 [00:04<00:15,  1.42s/it][A
Current eval Loss 0.422:  23%|██▎       | 3/13 [00:04<00:14,  1.42s/it][A
Current eval Loss 0.726:  23%|██▎       | 3/13 [00:05<00:14,  1.42s/it][A
Current eval Loss 0.726:  31%|███       | 4/13 [00:05<00:12,  1.42s/it][A
Current eval Loss 0.683:  31%|███       | 4/13 [00:07<00:12,  1.42s/it][A
Current eval Loss 0.683:  38%|███▊      | 5/13 [00:07<00:11,  1.42s/it][A
Current eval Loss 0.965:  38%|███▊      | 5/13 [00:08<00:11,  1.42s/it][A
Current eval Loss 0.965:  46%|████▌     | 6/13 [00:08<00:09,  1.43s/it][A
Current eval Loss 0.358:  46%|████▌     | 6/13 [00:09<00:09,  1.43s/it][A
Current eval Loss 0.358:  54%|██

Train loss = 0.136 Train metric = 0.949 Val loss = 0.61 Val metric = 0.808



Current training Loss 0.284:   0%|          | 0/50 [00:05<?, ?it/s][A
Current training Loss 0.284:   2%|▏         | 1/50 [00:05<04:43,  5.80s/it][A
Current training Loss 0.252:   2%|▏         | 1/50 [00:11<04:43,  5.80s/it][A
Current training Loss 0.252:   4%|▍         | 2/50 [00:11<04:35,  5.74s/it][A
Current training Loss 0.128:   4%|▍         | 2/50 [00:18<04:35,  5.74s/it][A
Current training Loss 0.128:   6%|▌         | 3/50 [00:18<04:48,  6.13s/it][A
Current training Loss 0.416:   6%|▌         | 3/50 [00:25<04:48,  6.13s/it][A
Current training Loss 0.416:   8%|▊         | 4/50 [00:25<04:54,  6.40s/it][A
Current training Loss 0.084:   8%|▊         | 4/50 [00:31<04:54,  6.40s/it][A
Current training Loss 0.084:  10%|█         | 5/50 [00:31<04:43,  6.30s/it][A
Current training Loss 0.033:  10%|█         | 5/50 [00:37<04:43,  6.30s/it][A
Current training Loss 0.033:  12%|█▏        | 6/50 [00:37<04:26,  6.06s/it][A
Current training Loss 0.129:  12%|█▏        | 6/50 [00:42<0

Running evaluation on whole training data



Current eval Loss 0.256:   0%|          | 0/50 [00:01<?, ?it/s][A
Current eval Loss 0.256:   2%|▏         | 1/50 [00:01<01:02,  1.27s/it][A
Current eval Loss 0.042:   2%|▏         | 1/50 [00:02<01:02,  1.27s/it][A
Current eval Loss 0.042:   4%|▍         | 2/50 [00:02<01:01,  1.28s/it][A
Current eval Loss 0.014:   4%|▍         | 2/50 [00:03<01:01,  1.28s/it][A
Current eval Loss 0.014:   6%|▌         | 3/50 [00:03<01:00,  1.28s/it][A
Current eval Loss 0.034:   6%|▌         | 3/50 [00:05<01:00,  1.28s/it][A
Current eval Loss 0.034:   8%|▊         | 4/50 [00:05<00:59,  1.29s/it][A
Current eval Loss 0.057:   8%|▊         | 4/50 [00:06<00:59,  1.29s/it][A
Current eval Loss 0.057:  10%|█         | 5/50 [00:06<00:58,  1.29s/it][A
Current eval Loss 0.032:  10%|█         | 5/50 [00:07<00:58,  1.29s/it][A
Current eval Loss 0.032:  12%|█▏        | 6/50 [00:07<00:56,  1.29s/it][A
Current eval Loss 0.047:  12%|█▏        | 6/50 [00:09<00:56,  1.29s/it][A
Current eval Loss 0.047:  14%|█▍

Running evaluation on validation data



Current eval Loss 0.251:   0%|          | 0/13 [00:01<?, ?it/s][A
Current eval Loss 0.251:   8%|▊         | 1/13 [00:01<00:16,  1.40s/it][A
Current eval Loss 0.557:   8%|▊         | 1/13 [00:02<00:16,  1.40s/it][A
Current eval Loss 0.557:  15%|█▌        | 2/13 [00:02<00:15,  1.40s/it][A
Current eval Loss 0.088:  15%|█▌        | 2/13 [00:04<00:15,  1.40s/it][A
Current eval Loss 0.088:  23%|██▎       | 3/13 [00:04<00:14,  1.40s/it][A
Current eval Loss 0.307:  23%|██▎       | 3/13 [00:05<00:14,  1.40s/it][A
Current eval Loss 0.307:  31%|███       | 4/13 [00:05<00:12,  1.41s/it][A
Current eval Loss 0.588:  31%|███       | 4/13 [00:07<00:12,  1.41s/it][A
Current eval Loss 0.588:  38%|███▊      | 5/13 [00:07<00:11,  1.42s/it][A
Current eval Loss 0.604:  38%|███▊      | 5/13 [00:08<00:11,  1.42s/it][A
Current eval Loss 0.604:  46%|████▌     | 6/13 [00:08<00:09,  1.42s/it][A
Current eval Loss 0.44:  46%|████▌     | 6/13 [00:09<00:09,  1.42s/it] [A
Current eval Loss 0.44:  54%|███

Train loss = 0.048 Train metric = 0.992 Val loss = 0.453 Val metric = 0.816



  8%|▊         | 1/13 [00:01<00:16,  1.38s/it][A
 15%|█▌        | 2/13 [00:02<00:15,  1.40s/it][A
 23%|██▎       | 3/13 [00:04<00:14,  1.41s/it][A
 31%|███       | 4/13 [00:05<00:12,  1.43s/it][A
 38%|███▊      | 5/13 [00:07<00:11,  1.43s/it][A
 46%|████▌     | 6/13 [00:08<00:09,  1.42s/it][A
 54%|█████▍    | 7/13 [00:09<00:08,  1.42s/it][A
 62%|██████▏   | 8/13 [00:11<00:07,  1.41s/it][A
 69%|██████▉   | 9/13 [00:12<00:05,  1.41s/it][A
 77%|███████▋  | 10/13 [00:14<00:04,  1.41s/it][A
 85%|████████▍ | 11/13 [00:15<00:02,  1.41s/it][A
 92%|█████████▏| 12/13 [00:17<00:01,  1.42s/it][A
100%|██████████| 13/13 [00:17<00:00,  1.36s/it][A


In [19]:
test_output1 = trainer.test_output

### Run with Pytorch Lightning Trainer

In [20]:
parser = argparse.ArgumentParser(prog='Torch trainer function',conflict_handler='resolve')

parser.add_argument('--train_data', type=str, default='../data/raw/IMDB Dataset.csv', required=False,
                    help='train data')
parser.add_argument('--val_data', type=str, default='', required=False,
                    help='validation data')
parser.add_argument('--test_data', type=str, default=None, required=False,
                    help='test data')

parser.add_argument('--transformer_model_pretrained_path', type=str, default='roberta-base', required=False,
                    help='transformer model pretrained path or huggingface model name')
parser.add_argument('--transformer_config_path', type=str, default='roberta-base', required=False,
                    help='transformer config file path or huggingface model name')
parser.add_argument('--transformer_tokenizer_path', type=str, default='roberta-base', required=False,
                    help='transformer tokenizer file path or huggingface model name')
parser.add_argument('--bpe_vocab_path', type=str, default='', required=False,
                    help='bytepairencoding vocab file path')
parser.add_argument('--bpe_merges_path', type=str, default='', required=False,
                    help='bytepairencoding merges file path')
parser.add_argument('--berttweettokenizer_path', type=str, default='', required=False,
                    help='BERTweet tokenizer path')

parser.add_argument('--max_text_len', type=int, default=100, required=False,
                    help='maximum length of text')
parser.add_argument('--epochs', type=int, default=5, required=False,
                    help='number of epochs')
parser.add_argument('--lr', type=float, default=.00003, required=False,
                    help='learning rate')
parser.add_argument('--loss_function', type=str, default='bcelogit', required=False,
                    help='loss function')
parser.add_argument('--metric', type=str, default='f1', required=False,
                    help='scorer metric')

parser.add_argument('--use_lightning_trainer', type=bool, default=True, required=False,
                    help='if lightning trainer needs to be used')
parser.add_argument('--use_torch_trainer', type=bool, default=False, required=False,
                    help='if custom torch trainer needs to be used')
parser.add_argument('--use_apex', type=bool, default=False, required=False,
                    help='if apex needs to be used')
parser.add_argument('--use_gpu', type=bool, default=False, required=False,
                    help='GPU mode')
parser.add_argument('--use_TPU', type=bool, default=False, required=False,
                    help='TPU mode')
parser.add_argument('--num_gpus', type=int, default=0, required=False,
                    help='Number of GPUs')
parser.add_argument('--num_tpus', type=int, default=0, required=False,
                    help='Number of TPUs')

parser.add_argument('--train_batch_size', type=int, default=16, required=False,
                    help='train batch size')
parser.add_argument('--eval_batch_size', type=int, default=16, required=False,
                    help='eval batch size')

parser.add_argument('--model_save_path', type=str, default='../models/sentiment_classification/', required=False,
                    help='seed')

parser.add_argument('--wandb_logging', type=bool, default=False, required=False,
                    help='wandb logging needed')

parser.add_argument('--seed', type=int, default=42, required=False,
                    help='seed')

args, _ = parser.parse_known_args()

print ("Wandb Logging: {}, GPU: {}, Pytorch Lightning: {}, TPU: {}, Apex: {}".format(\
            _has_wandb and args.wandb_logging, _torch_gpu_available,\
            _torch_lightning_available and args.use_lightning_trainer, _torch_tpu_available, _has_apex))

Wandb Logging: False, GPU: False, Pytorch Lightning: True, TPU: False, Apex: False


In [22]:
if args.use_torch_trainer:
    device = torch.device("cuda" if _torch_gpu_available and args.use_gpu else "cpu")

    if _torch_tpu_available and args.use_TPU:
        device=xm.xla_device()

    print ("Device: {}".format(device))
    
    if args.use_TPU and _torch_tpu_available and args.num_tpus > 1:
        train_data_loader = torch_xla.distributed.parallel_loader.ParallelLoader(train_data_loader, [device])
        train_data_loader = train_data_loader.per_device_loader(device)


    trainer = BasicTrainer(model, train_data_loader, val_data_loader, device, args.transformer_model_pretrained_path, \
                               final_activation=final_activation, \
                               test_data_loader=val_data_loader)

    param_optimizer = list(trainer.model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(train_data_loader) * args.epochs)

    if _torch_tpu_available and args.use_TPU:
        optimizer = AdamW(optimizer_parameters, lr=args.lr*xm.xrt_world_size())
    else:
        optimizer = AdamW(optimizer_parameters, lr=args.lr)

    if args.use_apex and _has_apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")


    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    
    loss = losses.get_loss(args.loss_function)
    scorer = scorers.SKMetric(args.metric, convert=convert_output, reshape=reshape) 
    
    def _mp_fn(rank, flags, trainer, epochs, lr, metric, loss_function, optimizer, scheduler, model_save_path, num_gpus, num_tpus,  \
                max_grad_norm, early_stopping_rounds, snapshot_ensemble, is_amp, use_wandb, seed):
        torch.set_default_tensor_type('torch.FloatTensor')
        a = trainer.train(epochs, lr, metric, loss_function, optimizer, scheduler, model_save_path, num_gpus, num_tpus,  \
                max_grad_norm, early_stopping_rounds, snapshot_ensemble, is_amp, use_wandb, seed)

    FLAGS = {}
    if _torch_tpu_available and args.use_TPU:
        xmp.spawn(_mp_fn, args=(FLAGS, trainer, args.epochs, args.lr, scorer, loss, optimizer, scheduler, args.model_save_path, args.num_gpus, args.num_tpus, \
                 1, 3, False, args.use_apex, False, args.seed), nprocs=8, start_method='fork')
    else:
        use_wandb = _has_wandb and args.wandb_logging
        trainer.train(args.epochs, args.lr, scorer, loss, optimizer, scheduler, args.model_save_path, args.num_gpus, args.num_tpus,  \
                max_grad_norm=1, early_stopping_rounds=3, snapshot_ensemble=False, is_amp=args.use_apex, use_wandb=use_wandb, seed=args.seed)

elif args.use_lightning_trainer and _torch_lightning_available:
    from pytorch_lightning import Trainer, seed_everything
    seed_everything(args.seed)
    
    loss = losses.get_loss(args.loss_function)
    scorer = scorers.PLMetric(args.metric, convert=convert_output, reshape=reshape)
    
    log_args = {'description': args.transformer_model_pretrained_path, 'loss': loss.__class__.__name__, 'epochs': args.epochs, 'learning_rate': args.lr}

    if _has_wandb and not _torch_tpu_available and args.wandb_logging:
        wandb.init(project="Project",config=log_args)
        wandb_logger = WandbLogger()

    checkpoint_callback = ModelCheckpoint(
                filepath=args.model_save_path,
                save_top_k=1,
                verbose=True,
                monitor='val_loss',
                mode='min'
                )
    earlystop = EarlyStopping(
                monitor='val_loss',
                patience=3,
               verbose=False,
               mode='min'
               )

    if args.use_gpu and _torch_gpu_available:
        print ("using GPU")
        if args.wandb_logging:
            if _has_apex:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, logger=wandb_logger, precision=16, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, logger=wandb_logger, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            if _has_apex:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, precision=16, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    elif args.use_TPU and _torch_tpu_available:
        print ("using TPU")
        if _has_apex:
            trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    else:
        print ("using CPU")
        if args.wandb_logging:
            if _has_apex:
                trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            if _has_apex:
                trainer = Trainer(max_epochs=args.epochs, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(max_epochs=args.epochs, checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    num_train_steps = int(len(train_data_loader) * args.epochs)

    pltrainer = PLTrainer(num_train_steps, model, scorer, loss, args.lr, \
                          final_activation=final_activation, seed=42)

    #try:
    #    print ("Loaded model from previous checkpoint")
    #    pltrainer = PLTrainer.load_from_checkpoint(args.model_save_path)
    #except:
    #    pass

    trainer.fit(pltrainer, train_data_loader, val_data_loader) 

GPU available: False, used: False
I0806 15:52:24.844974 4539198912 distributed.py:29] GPU available: False, used: False
TPU available: False, using: 0 TPU cores
I0806 15:52:24.846642 4539198912 distributed.py:29] TPU available: False, using: 0 TPU cores


using CPU
[LOG] Total number of parameters to learn 124646401



  | Name   | Type               | Params
----------------------------------------------
0 | model  | TransformerWithCLS | 124 M 
1 | metric | PLMetric           | 0     
I0806 15:52:25.195801 4539198912 lightning.py:1495] 
  | Name   | Type               | Params
----------------------------------------------
0 | model  | TransformerWithCLS | 124 M 
1 | metric | PLMetric           | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

val loss = 0.404 val metric = 0.856 




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00000: val_loss reached 0.46321 (best 0.46321), saving model to ../models/sentiment_classification/epoch=0.ckpt as top 1
I0806 15:57:33.435837 4539198912 model_checkpoint.py:346] 
Epoch 00000: val_loss reached 0.46321 (best 0.46321), saving model to ../models/sentiment_classification/epoch=0.ckpt as top 1


val loss = 0.463 val metric = 0.859 




Train loss = 0.145 Train metric = 0.948


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00001: val_loss reached 0.45796 (best 0.45796), saving model to ../models/sentiment_classification/epoch=1.ckpt as top 1
I0806 16:02:44.166603 4539198912 model_checkpoint.py:346] 
Epoch 00001: val_loss reached 0.45796 (best 0.45796), saving model to ../models/sentiment_classification/epoch=1.ckpt as top 1


val loss = 0.458 val metric = 0.837 
Train loss = 0.072 Train metric = 0.975


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00002: val_loss  was not in top 1
I0806 16:07:46.498960 4539198912 model_checkpoint.py:314] 
Epoch 00002: val_loss  was not in top 1


val loss = 0.539 val metric = 0.778 
Train loss = 0.108 Train metric = 0.961


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00003: val_loss  was not in top 1
I0806 16:12:26.587539 4539198912 model_checkpoint.py:314] 
Epoch 00003: val_loss  was not in top 1


val loss = 0.675 val metric = 0.836 
Train loss = 0.036 Train metric = 0.987


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00004: val_loss  was not in top 1
I0806 16:17:00.354878 4539198912 model_checkpoint.py:314] 
Epoch 00004: val_loss  was not in top 1


val loss = 0.551 val metric = 0.847 
Train loss = 0.054 Train metric = 0.98



In [23]:
from tqdm import tqdm

test_output2 = []

for val_batch in tqdm(val_data_loader):
    out = torch.sigmoid(pltrainer(val_batch)).detach().cpu().numpy()
    test_output2.extend(out[:,0].tolist())
    
#test_output2 = np.concatenate(test_output2)


  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:02<00:27,  2.33s/it][A
 15%|█▌        | 2/13 [00:04<00:23,  2.17s/it][A
 23%|██▎       | 3/13 [00:05<00:20,  2.08s/it][A
 31%|███       | 4/13 [00:08<00:18,  2.07s/it][A
 38%|███▊      | 5/13 [00:10<00:16,  2.06s/it][A
 46%|████▌     | 6/13 [00:12<00:14,  2.03s/it][A
 54%|█████▍    | 7/13 [00:13<00:11,  1.97s/it][A
 62%|██████▏   | 8/13 [00:15<00:09,  1.92s/it][A
 69%|██████▉   | 9/13 [00:17<00:07,  1.91s/it][A
 77%|███████▋  | 10/13 [00:19<00:05,  1.88s/it][A
 85%|████████▍ | 11/13 [00:21<00:03,  1.86s/it][A
 92%|█████████▏| 12/13 [00:23<00:01,  1.85s/it][A
100%|██████████| 13/13 [00:23<00:00,  1.84s/it][A


In [24]:
test_output1 = np.array(test_output1)[:,0]
test_output2 = np.array(test_output2)
np.corrcoef(test_output1,test_output2)

array([[1.        , 0.90560145],
       [0.90560145, 1.        ]])