In [1]:
from __future__ import absolute_import

import sys
import os

try:
    from dotenv import find_dotenv, load_dotenv
except:
    pass

import argparse

try:
    sys.path.append(os.path.join(os.path.dirname(__file__), '../src'))
except:
    sys.path.append(os.path.join(os.getcwd(), '../src'))
    
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torchcontrib.optim import SWA
from torch.optim import Adam, SGD 
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau, CyclicLR, \
                                     CosineAnnealingWarmRestarts

from consNLP.data import load_data, data_utils, fetch_dataset
from consNLP.models import transformer_models, activations, layers, losses, scorers
from consNLP.visualization import visualize
from consNLP.trainer.trainer import BasicTrainer, PLTrainer, test_pl_trainer, QATrainer, PLTrainerQA
from consNLP.trainer.trainer_utils import set_seed, _has_apex, _torch_lightning_available, _has_wandb, _torch_gpu_available, _num_gpus, _torch_tpu_available
from consNLP.preprocessing.custom_tokenizer import BERTweetTokenizer

if _has_apex:
    #from torch.cuda import amp
    from apex import amp

if _torch_tpu_available:
    import torch_xla
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.xla_multiprocessing as xmp

if _has_wandb:
    import wandb
    try:
        load_dotenv(find_dotenv())
        wandb.login(key=os.environ['WANDB_API_KEY'])
    except:
        _has_wandb = False

if _torch_lightning_available:
    import pytorch_lightning as pl
    from pytorch_lightning import Trainer, seed_everything
    from pytorch_lightning.loggers import WandbLogger
    from pytorch_lightning.metrics.metric import NumpyMetric
    from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, Callback

import tokenizers
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, AutoConfig

I0806 20:33:07.706027 4680543680 file_utils.py:41] PyTorch version 1.5.0 available.
I0806 20:33:15.409231 4680543680 file_utils.py:57] TensorFlow version 2.2.0-rc3 available.
I0806 20:33:17.444828 4680543680 modeling.py:230] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
wandb: Appending key for api.wandb.ai to your netrc file: /Users/victor/.netrc
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
I0806 20:33:19.592143 4680543680 textcleaner.py:37] 'pattern' package not found; tag filters are not available for English
W0806 20:33:19.841995 4680543680 deprecation.py:323] From /Users/victor/anaconda3/lib/python3.7/site-packages/tensorflow/python/compat/v2_compat.py:96: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term
wandb: Appending

In [2]:
load_dotenv(find_dotenv())

True

In [3]:
fetch_dataset(project_dir='../',download_from_url=True, \
              data_url='https://nlp.stanford.edu/projects/snli/snli_1.0.zip')

I0806 20:02:45.316612 4476943808 fetch_dataset.py:16] making final data set from raw data
I0806 20:02:45.317600 4476943808 fetch_dataset.py:21] project directory ../
I0806 20:02:45.318350 4476943808 fetch_dataset.py:30] output path ../data/raw
I0806 20:02:45.319162 4476943808 fetch_dataset.py:77] downloading snli_1.0.zip
I0806 20:03:40.604937 4476943808 fetch_dataset.py:95] download complete


In [26]:
parser = argparse.ArgumentParser(prog='Torch trainer function',conflict_handler='resolve')

parser.add_argument('--train_data', type=str, default='../data/raw/snli_1.0/snli_1.0_train.jsonl', required=False,
                    help='train data')
parser.add_argument('--val_data', type=str, default='', required=False,
                    help='validation data')
parser.add_argument('--test_data', type=str, default=None, required=False,
                    help='test data')

parser.add_argument('--task_type', type=str, default='multiclass_sequence_classification', required=False,
                    help='type of task')

parser.add_argument('--transformer_model_pretrained_path', type=str, default='textattack/roberta-base-MNLI', required=False,
                    help='transformer model pretrained path or huggingface model name')
parser.add_argument('--transformer_config_path', type=str, default='textattack/roberta-base-MNLI', required=False,
                    help='transformer config file path or huggingface model name')
parser.add_argument('--transformer_tokenizer_path', type=str, default='textattack/roberta-base-MNLI', required=False,
                    help='transformer tokenizer file path or huggingface model name')
parser.add_argument('--bpe_vocab_path', type=str, default='', required=False,
                    help='bytepairencoding vocab file path')
parser.add_argument('--bpe_merges_path', type=str, default='', required=False,
                    help='bytepairencoding merges file path')
parser.add_argument('--berttweettokenizer_path', type=str, default='', required=False,
                    help='BERTweet tokenizer path')

parser.add_argument('--max_text_len', type=int, default=128, required=False,
                    help='maximum length of text')
parser.add_argument('--epochs', type=int, default=2, required=False,
                    help='number of epochs')
parser.add_argument('--lr', type=float, default=.00003, required=False,
                    help='learning rate')
parser.add_argument('--loss_function', type=str, default='ce', required=False,
                    help='loss function')
parser.add_argument('--metric', type=str, default='f1_macro', required=False,
                    help='scorer metric')

parser.add_argument('--use_lightning_trainer', type=bool, default=False, required=False,
                    help='if lightning trainer needs to be used')
parser.add_argument('--use_torch_trainer', type=bool, default=True, required=False,
                    help='if custom torch trainer needs to be used')
parser.add_argument('--use_apex', type=bool, default=False, required=False,
                    help='if apex needs to be used')
parser.add_argument('--use_gpu', type=bool, default=False, required=False,
                    help='GPU mode')
parser.add_argument('--use_TPU', type=bool, default=False, required=False,
                    help='TPU mode')
parser.add_argument('--num_gpus', type=int, default=0, required=False,
                    help='Number of GPUs')
parser.add_argument('--num_tpus', type=int, default=0, required=False,
                    help='Number of TPUs')

parser.add_argument('--train_batch_size', type=int, default=16, required=False,
                    help='train batch size')
parser.add_argument('--eval_batch_size', type=int, default=16, required=False,
                    help='eval batch size')

parser.add_argument('--model_save_path', type=str, default='../models/nli/', required=False,
                    help='seed')

parser.add_argument('--wandb_logging', type=bool, default=False, required=False,
                    help='wandb logging needed')

parser.add_argument('--seed', type=int, default=42, required=False,
                    help='seed')

args, _ = parser.parse_known_args()

print ("Wandb Logging: {}, GPU: {}, Pytorch Lightning: {}, TPU: {}, Apex: {}".format(\
            _has_wandb and args.wandb_logging, _torch_gpu_available,\
            _torch_lightning_available and args.use_lightning_trainer, _torch_tpu_available, _has_apex))

Wandb Logging: False, GPU: False, Pytorch Lightning: False, TPU: False, Apex: False


In [4]:
reshape = False
final_activation = None
convert_output = 'max'

In [5]:
import io
import json

get_transitions = lambda parse: ['reduce' if t == ')' else 'shift' for t in parse if t != '(']
examples = []
with io.open(args.train_data, encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        line = json.loads(line)
        examples.append({
            'premise': line['sentence1'],
            'hypothesis': line['sentence2'],
            'label': line['gold_label'],
            'premise_transitions': get_transitions(line['sentence1_binary_parse']),
            'hypothesis_transitions': get_transitions(line['sentence2_binary_parse'])
        })

In [6]:
df = pd.DataFrame.from_dict(examples)
print (df.shape)
df = df.iloc[:1000]

(550152, 5)


In [7]:
df.head(5)

Unnamed: 0,premise,hypothesis,label,premise_transitions,hypothesis_transitions
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi..."
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",contradiction,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi..."
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",entailment,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi..."
3,Children smiling and waving at camera,They are smiling at their parents,neutral,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi..."
4,Children smiling and waving at camera,There are children present,entailment,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi..."


In [8]:
model_save_dir = args.model_save_path
try:
    os.makedirs(model_save_dir)
except OSError:
    pass

In [9]:
df.label, label2idx = data_utils.convert_categorical_label_to_int(df.label, \
                                                             save_path=os.path.join(model_save_dir,'label2idx.pkl'))

In [10]:
label2idx

{'entailment': 0, 'neutral': 1, 'contradiction': 2, '-': 3}

In [11]:
from sklearn.model_selection import KFold

kf = KFold(5)

for train_index, val_index in kf.split(df.premise, df.label):
    break
    
train_df = df.iloc[train_index].reset_index(drop=True)
val_df = df.iloc[val_index].reset_index(drop=True)

In [12]:
train_df.shape, val_df.shape

((800, 5), (200, 5))

In [27]:
if args.berttweettokenizer_path:
    tokenizer = BERTweetTokenizer(args.berttweettokenizer_path)
else:
    tokenizer = AutoTokenizer.from_pretrained(args.transformer_model_pretrained_path)

if not args.berttweettokenizer_path:
    try:
        bpetokenizer = tokenizers.ByteLevelBPETokenizer(args.bpe_vocab_path, \
                                        args.bpe_merges_path)
    except:
        bpetokenizer = None 
else:
    bpetokenizer = None

I0806 20:39:54.232226 4680543680 configuration_utils.py:283] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/textattack/roberta-base-MNLI/config.json from cache at /Users/victor/.cache/torch/transformers/a0e9a64482bfa531a2b455bb5c56da303d4bc5dfcb9d0204a326d1dac03ee18e.5ab50e081c86cc5d316d4bd6224f34c782f19b06be3485cd6e269b5a46d0554d
I0806 20:39:54.233466 4680543680 configuration_utils.py:319] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": "mnli",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is

In [28]:
train_dataset = data_utils.TransformerDatasetForMNLI(train_df.premise, train_df.hypothesis, \
                                tokenizer=tokenizer, MAX_LEN=args.max_text_len, target_label=train_df.label)
val_dataset = data_utils.TransformerDatasetForMNLI(val_df.premise, val_df.hypothesis, \
                                tokenizer=tokenizer, MAX_LEN=args.max_text_len, target_label=val_df.label)

In [30]:
class TransformerModel(nn.Module):
    def __init__(self, base_model, dropout=.3, n_out=1):
        super(TransformerModel, self).__init__()

        self.base_model = base_model
        self.drop = nn.Dropout(dropout)
        self.out = nn.Linear(base_model.config.hidden_size, n_out)
        
    def forward(self, ids, mask, token_type_ids):
        o2 = self.base_model(ids)
        o2 = o2[1]
        bo = self.drop(o2)
        logits = self.out(bo)
        
        return logits

In [31]:
basemodel = AutoModel.from_pretrained(args.transformer_model_pretrained_path)
model = TransformerModel(basemodel, n_out=len(label2idx))

I0806 20:40:27.774886 4680543680 configuration_utils.py:283] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/textattack/roberta-base-MNLI/config.json from cache at /Users/victor/.cache/torch/transformers/a0e9a64482bfa531a2b455bb5c56da303d4bc5dfcb9d0204a326d1dac03ee18e.5ab50e081c86cc5d316d4bd6224f34c782f19b06be3485cd6e269b5a46d0554d
I0806 20:40:27.775756 4680543680 configuration_utils.py:319] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": "mnli",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is

In [32]:
model

TransformerModel(
  (base_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=

In [33]:
if _torch_tpu_available and args.use_TPU:
    train_sampler = torch.utils.data.distributed.DistributedSampler(
      train_dataset,
      num_replicas=xm.xrt_world_size(),
      rank=xm.get_ordinal(),
      shuffle=True
    )

    val_sampler = torch.utils.data.distributed.DistributedSampler(
      val_dataset,
      num_replicas=xm.xrt_world_size(),
      rank=xm.get_ordinal(),
      shuffle=False
    )

if _torch_tpu_available and args.use_TPU:
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size, sampler=train_sampler,
        drop_last=True,num_workers=2)

    val_data_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.eval_batch_size, sampler=val_sampler,
        drop_last=False,num_workers=1)
else:
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size)

    val_data_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.eval_batch_size)

In [34]:
for d in train_data_loader:
    break

logit = model(d['ids'],d['mask'],d['token_type_ids'])
print (logit.shape)

torch.Size([16, 4])


In [35]:
loss = losses.get_loss(args.loss_function)
print (loss(logit,d['targets']))

tensor(1.3504, grad_fn=<NllLossBackward>)


In [36]:
metric = scorers.SKMetric(args.metric, convert=convert_output, reshape=reshape)
print (metric(d['targets'].detach().cpu().numpy(),logit.detach().cpu().numpy()))

0.2785714285714286


### Run with Pytorch Trainer

In [39]:
if args.use_torch_trainer:
    device = torch.device("cuda" if _torch_gpu_available and args.use_gpu else "cpu")

    if _torch_tpu_available and args.use_TPU:
        device=xm.xla_device()

    print ("Device: {}".format(device))
    
    if args.use_TPU and _torch_tpu_available and args.num_tpus > 1:
        train_data_loader = torch_xla.distributed.parallel_loader.ParallelLoader(train_data_loader, [device])
        train_data_loader = train_data_loader.per_device_loader(device)


    trainer = BasicTrainer(model, train_data_loader, val_data_loader, device, args.transformer_model_pretrained_path, \
                               final_activation=final_activation, \
                               test_data_loader=val_data_loader)

    param_optimizer = list(trainer.model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(train_data_loader) * args.epochs)

    if _torch_tpu_available and args.use_TPU:
        optimizer = AdamW(optimizer_parameters, lr=args.lr*xm.xrt_world_size())
    else:
        optimizer = AdamW(optimizer_parameters, lr=args.lr)

    if args.use_apex and _has_apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    
    loss = losses.get_loss(args.loss_function)
    scorer = scorers.SKMetric(args.metric, convert=convert_output, reshape=reshape) 
    
    def _mp_fn(rank, flags, trainer, epochs, lr, metric, loss_function, optimizer, scheduler, model_save_path, num_gpus, num_tpus,  \
                max_grad_norm, early_stopping_rounds, snapshot_ensemble, is_amp, use_wandb, seed):
        torch.set_default_tensor_type('torch.FloatTensor')
        a = trainer.train(epochs, lr, metric, loss_function, optimizer, scheduler, model_save_path, num_gpus, num_tpus,  \
                max_grad_norm, early_stopping_rounds, snapshot_ensemble, is_amp, use_wandb, seed)

    FLAGS = {}
    if _torch_tpu_available and args.use_TPU:
        xmp.spawn(_mp_fn, args=(FLAGS, trainer, args.epochs, args.lr, scorer, loss, optimizer, scheduler, args.model_save_path, args.num_gpus, args.num_tpus, \
                 1, 3, False, args.use_apex, False, args.seed), nprocs=8, start_method='fork')
    else:
        use_wandb = _has_wandb and args.wandb_logging
        trainer.train(args.epochs, args.lr, scorer, loss, optimizer, scheduler, args.model_save_path, args.num_gpus, args.num_tpus,  \
                max_grad_norm=1, early_stopping_rounds=3, snapshot_ensemble=False, is_amp=args.use_apex, use_wandb=use_wandb, seed=args.seed)

elif args.use_lightning_trainer and _torch_lightning_available:
    from pytorch_lightning import Trainer, seed_everything
    seed_everything(args.seed)
    
    loss = losses.get_loss(args.loss_function)
    scorer = scorers.PLMetric(args.metric, convert=convert_output, reshape=reshape)
    
    log_args = {'description': args.transformer_model_pretrained_path, 'loss': loss.__class__.__name__, 'epochs': args.epochs, 'learning_rate': args.lr}

    if _has_wandb and not _torch_tpu_available and args.wandb_logging:
        wandb.init(project="Project",config=log_args)
        wandb_logger = WandbLogger()

    checkpoint_callback = ModelCheckpoint(
                filepath=args.model_save_path,
                save_top_k=1,
                verbose=True,
                monitor='val_loss',
                mode='min'
                )
    earlystop = EarlyStopping(
                monitor='val_loss',
                patience=3,
               verbose=False,
               mode='min'
               )

    if args.use_gpu and _torch_gpu_available:
        print ("using GPU")
        if args.wandb_logging:
            if _has_apex:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, logger=wandb_logger, precision=16, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, logger=wandb_logger, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            if _has_apex:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, precision=16, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    elif args.use_TPU and _torch_tpu_available:
        print ("using TPU")
        if _has_apex:
            trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    else:
        print ("using CPU")
        if args.wandb_logging:
            if _has_apex:
                trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            if _has_apex:
                trainer = Trainer(max_epochs=args.epochs, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(max_epochs=args.epochs, checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    num_train_steps = int(len(train_data_loader) * args.epochs)

    pltrainer = PLTrainer(num_train_steps, model, scorer, loss, args.lr, \
                          final_activation=final_activation, seed=42)

    #try:
    #    print ("Loaded model from previous checkpoint")
    #    pltrainer = PLTrainer.load_from_checkpoint(args.model_save_path)
    #except:
    #    pass

    trainer.fit(pltrainer, train_data_loader, val_data_loader) 


  0%|          | 0/50 [00:00<?, ?it/s][A

Device: cpu
[LOG] Total number of parameters to learn 124648708


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)

Current training Loss 1.4:   0%|          | 0/50 [00:10<?, ?it/s][A
Current training Loss 1.4:   2%|▏         | 1/50 [00:10<08:14, 10.09s/it][A
Current training Loss 1.345:   2%|▏         | 1/50 [00:20<08:14, 10.09s/it][A
Current training Loss 1.345:   4%|▍         | 2/50 [00:20<08:05, 10.11s/it][A
Current training Loss 1.299:   4%|▍         | 2/50 [00:31<08:05, 10.11s/it][A
Current training Loss 1.299:   6%|▌         | 3/50 [00:31<08:08, 10.40s/it][A
Current training Loss 1.14:   6%|▌         | 3/50 [00:39<08:08, 10.40s/it] [A
Current training Loss 1.14:   8%|▊         | 4/50 [00:39<07:25,  9.69s/it][A
Current training Loss 1.203:   8%|▊         | 4/50 [00:49<07:25,  9.69s/it][A
Current training Loss 1.203:  10%|█         | 5/50 [00:49<07:18,  9.74s/it][A
Current training Loss 1.245:  10%|█         | 5/50 [00:58<07:18,  9.74s/it][A
Current training

  0%|          | 0/50 [00:00<?, ?it/s][A

Running evaluation on whole training data



Current eval Loss 0.62:   0%|          | 0/50 [00:01<?, ?it/s][A
Current eval Loss 0.62:   2%|▏         | 1/50 [00:01<01:17,  1.57s/it][A
Current eval Loss 0.447:   2%|▏         | 1/50 [00:03<01:17,  1.57s/it][A
Current eval Loss 0.447:   4%|▍         | 2/50 [00:03<01:14,  1.56s/it][A
Current eval Loss 0.761:   4%|▍         | 2/50 [00:04<01:14,  1.56s/it][A
Current eval Loss 0.761:   6%|▌         | 3/50 [00:04<01:13,  1.56s/it][A
Current eval Loss 0.811:   6%|▌         | 3/50 [00:06<01:13,  1.56s/it][A
Current eval Loss 0.811:   8%|▊         | 4/50 [00:06<01:11,  1.55s/it][A
Current eval Loss 0.542:   8%|▊         | 4/50 [00:07<01:11,  1.55s/it][A
Current eval Loss 0.542:  10%|█         | 5/50 [00:07<01:09,  1.54s/it][A
Current eval Loss 0.752:  10%|█         | 5/50 [00:09<01:09,  1.54s/it][A
Current eval Loss 0.752:  12%|█▏        | 6/50 [00:09<01:07,  1.54s/it][A
Current eval Loss 0.494:  12%|█▏        | 6/50 [00:10<01:07,  1.54s/it][A
Current eval Loss 0.494:  14%|█▍  

Running evaluation on validation data



Current eval Loss 0.595:   0%|          | 0/13 [00:01<?, ?it/s][A
Current eval Loss 0.595:   8%|▊         | 1/13 [00:01<00:18,  1.57s/it][A
Current eval Loss 0.227:   8%|▊         | 1/13 [00:03<00:18,  1.57s/it][A
Current eval Loss 0.227:  15%|█▌        | 2/13 [00:03<00:17,  1.56s/it][A
Current eval Loss 0.471:  15%|█▌        | 2/13 [00:04<00:17,  1.56s/it][A
Current eval Loss 0.471:  23%|██▎       | 3/13 [00:04<00:15,  1.55s/it][A
Current eval Loss 0.358:  23%|██▎       | 3/13 [00:06<00:15,  1.55s/it][A
Current eval Loss 0.358:  31%|███       | 4/13 [00:06<00:13,  1.55s/it][A
Current eval Loss 0.228:  31%|███       | 4/13 [00:07<00:13,  1.55s/it][A
Current eval Loss 0.228:  38%|███▊      | 5/13 [00:07<00:12,  1.54s/it][A
Current eval Loss 0.275:  38%|███▊      | 5/13 [00:09<00:12,  1.54s/it][A
Current eval Loss 0.275:  46%|████▌     | 6/13 [00:09<00:10,  1.54s/it][A
Current eval Loss 0.671:  46%|████▌     | 6/13 [00:10<00:10,  1.54s/it][A
Current eval Loss 0.671:  54%|██

Train loss = 0.362 Train metric = 0.662 Val loss = 0.479 Val metric = 0.626



  0%|          | 0/50 [00:00<?, ?it/s][A
Current training Loss 0.649:   0%|          | 0/50 [00:06<?, ?it/s][A
Current training Loss 0.649:   2%|▏         | 1/50 [00:06<05:13,  6.41s/it][A
Current training Loss 0.536:   2%|▏         | 1/50 [00:12<05:13,  6.41s/it][A
Current training Loss 0.536:   4%|▍         | 2/50 [00:12<05:06,  6.38s/it][A
Current training Loss 0.695:   4%|▍         | 2/50 [00:19<05:06,  6.38s/it][A
Current training Loss 0.695:   6%|▌         | 3/50 [00:19<04:59,  6.36s/it][A
Current training Loss 0.719:   6%|▌         | 3/50 [00:26<04:59,  6.36s/it][A
Current training Loss 0.719:   8%|▊         | 4/50 [00:26<05:07,  6.68s/it][A
Current training Loss 0.459:   8%|▊         | 4/50 [00:32<05:07,  6.68s/it][A
Current training Loss 0.459:  10%|█         | 5/50 [00:32<04:57,  6.60s/it][A
Current training Loss 0.848:  10%|█         | 5/50 [00:39<04:57,  6.60s/it][A
Current training Loss 0.848:  12%|█▏        | 6/50 [00:39<04:57,  6.76s/it][A
Current training 

Running evaluation on whole training data



Current eval Loss 0.33:   0%|          | 0/50 [00:01<?, ?it/s][A
Current eval Loss 0.33:   2%|▏         | 1/50 [00:01<01:20,  1.65s/it][A
Current eval Loss 0.366:   2%|▏         | 1/50 [00:03<01:20,  1.65s/it][A
Current eval Loss 0.366:   4%|▍         | 2/50 [00:03<01:18,  1.64s/it][A
Current eval Loss 0.521:   4%|▍         | 2/50 [00:04<01:18,  1.64s/it][A
Current eval Loss 0.521:   6%|▌         | 3/50 [00:04<01:16,  1.62s/it][A
Current eval Loss 0.413:   6%|▌         | 3/50 [00:06<01:16,  1.62s/it][A
Current eval Loss 0.413:   8%|▊         | 4/50 [00:06<01:13,  1.61s/it][A
Current eval Loss 0.509:   8%|▊         | 4/50 [00:08<01:13,  1.61s/it][A
Current eval Loss 0.509:  10%|█         | 5/50 [00:08<01:12,  1.60s/it][A
Current eval Loss 0.439:  10%|█         | 5/50 [00:09<01:12,  1.60s/it][A
Current eval Loss 0.439:  12%|█▏        | 6/50 [00:09<01:10,  1.59s/it][A
Current eval Loss 0.318:  12%|█▏        | 6/50 [00:11<01:10,  1.59s/it][A
Current eval Loss 0.318:  14%|█▍  

Running evaluation on validation data



Current eval Loss 0.584:   0%|          | 0/13 [00:01<?, ?it/s][A
Current eval Loss 0.584:   8%|▊         | 1/13 [00:01<00:18,  1.57s/it][A
Current eval Loss 0.092:   8%|▊         | 1/13 [00:03<00:18,  1.57s/it][A
Current eval Loss 0.092:  15%|█▌        | 2/13 [00:03<00:17,  1.60s/it][A
Current eval Loss 0.536:  15%|█▌        | 2/13 [00:04<00:17,  1.60s/it][A
Current eval Loss 0.536:  23%|██▎       | 3/13 [00:04<00:16,  1.60s/it][A
Current eval Loss 0.419:  23%|██▎       | 3/13 [00:06<00:16,  1.60s/it][A
Current eval Loss 0.419:  31%|███       | 4/13 [00:06<00:14,  1.64s/it][A
Current eval Loss 0.136:  31%|███       | 4/13 [00:08<00:14,  1.64s/it][A
Current eval Loss 0.136:  38%|███▊      | 5/13 [00:08<00:13,  1.63s/it][A
Current eval Loss 0.34:  38%|███▊      | 5/13 [00:09<00:13,  1.63s/it] [A
Current eval Loss 0.34:  46%|████▌     | 6/13 [00:09<00:11,  1.61s/it][A
Current eval Loss 0.612:  46%|████▌     | 6/13 [00:11<00:11,  1.61s/it][A
Current eval Loss 0.612:  54%|███

Train loss = 0.236 Train metric = 0.697 Val loss = 0.473 Val metric = 0.63



  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:01<00:19,  1.65s/it][A
 15%|█▌        | 2/13 [00:03<00:18,  1.66s/it][A
 23%|██▎       | 3/13 [00:05<00:16,  1.69s/it][A
 31%|███       | 4/13 [00:07<00:15,  1.78s/it][A
 38%|███▊      | 5/13 [00:08<00:14,  1.77s/it][A
 46%|████▌     | 6/13 [00:10<00:12,  1.81s/it][A
 54%|█████▍    | 7/13 [00:12<00:10,  1.79s/it][A
 62%|██████▏   | 8/13 [00:14<00:08,  1.76s/it][A
 69%|██████▉   | 9/13 [00:15<00:06,  1.72s/it][A
 77%|███████▋  | 10/13 [00:17<00:05,  1.71s/it][A
 85%|████████▍ | 11/13 [00:19<00:03,  1.69s/it][A
 92%|█████████▏| 12/13 [00:20<00:01,  1.67s/it][A
100%|██████████| 13/13 [00:21<00:00,  1.66s/it][A


In [40]:
test_output1 = trainer.test_output

### Run with Pytorch Lightning Trainer

In [41]:
parser = argparse.ArgumentParser(prog='Torch trainer function',conflict_handler='resolve')

parser.add_argument('--train_data', type=str, default='../data/raw/snli_1.0/snli_1.0_train.jsonl', required=False,
                    help='train data')
parser.add_argument('--val_data', type=str, default='', required=False,
                    help='validation data')
parser.add_argument('--test_data', type=str, default=None, required=False,
                    help='test data')

parser.add_argument('--task_type', type=str, default='multiclass_sequence_classification', required=False,
                    help='type of task')

parser.add_argument('--transformer_model_pretrained_path', type=str, default='textattack/roberta-base-MNLI', required=False,
                    help='transformer model pretrained path or huggingface model name')
parser.add_argument('--transformer_config_path', type=str, default='textattack/roberta-base-MNLI', required=False,
                    help='transformer config file path or huggingface model name')
parser.add_argument('--transformer_tokenizer_path', type=str, default='textattack/roberta-base-MNLI', required=False,
                    help='transformer tokenizer file path or huggingface model name')
parser.add_argument('--bpe_vocab_path', type=str, default='', required=False,
                    help='bytepairencoding vocab file path')
parser.add_argument('--bpe_merges_path', type=str, default='', required=False,
                    help='bytepairencoding merges file path')
parser.add_argument('--berttweettokenizer_path', type=str, default='', required=False,
                    help='BERTweet tokenizer path')

parser.add_argument('--max_text_len', type=int, default=128, required=False,
                    help='maximum length of text')
parser.add_argument('--epochs', type=int, default=2, required=False,
                    help='number of epochs')
parser.add_argument('--lr', type=float, default=.00003, required=False,
                    help='learning rate')
parser.add_argument('--loss_function', type=str, default='ce', required=False,
                    help='loss function')
parser.add_argument('--metric', type=str, default='f1_macro', required=False,
                    help='scorer metric')

parser.add_argument('--use_lightning_trainer', type=bool, default=True, required=False,
                    help='if lightning trainer needs to be used')
parser.add_argument('--use_torch_trainer', type=bool, default=False, required=False,
                    help='if custom torch trainer needs to be used')
parser.add_argument('--use_apex', type=bool, default=False, required=False,
                    help='if apex needs to be used')
parser.add_argument('--use_gpu', type=bool, default=False, required=False,
                    help='GPU mode')
parser.add_argument('--use_TPU', type=bool, default=False, required=False,
                    help='TPU mode')
parser.add_argument('--num_gpus', type=int, default=0, required=False,
                    help='Number of GPUs')
parser.add_argument('--num_tpus', type=int, default=0, required=False,
                    help='Number of TPUs')

parser.add_argument('--train_batch_size', type=int, default=16, required=False,
                    help='train batch size')
parser.add_argument('--eval_batch_size', type=int, default=16, required=False,
                    help='eval batch size')

parser.add_argument('--model_save_path', type=str, default='../models/nli/', required=False,
                    help='seed')

parser.add_argument('--wandb_logging', type=bool, default=False, required=False,
                    help='wandb logging needed')

parser.add_argument('--seed', type=int, default=42, required=False,
                    help='seed')

args, _ = parser.parse_known_args()

print ("Wandb Logging: {}, GPU: {}, Pytorch Lightning: {}, TPU: {}, Apex: {}".format(\
            _has_wandb and args.wandb_logging, _torch_gpu_available,\
            _torch_lightning_available and args.use_lightning_trainer, _torch_tpu_available, _has_apex))

Wandb Logging: False, GPU: False, Pytorch Lightning: True, TPU: False, Apex: False


In [42]:
if args.use_torch_trainer:
    device = torch.device("cuda" if _torch_gpu_available and args.use_gpu else "cpu")

    if _torch_tpu_available and args.use_TPU:
        device=xm.xla_device()

    print ("Device: {}".format(device))
    
    if args.use_TPU and _torch_tpu_available and args.num_tpus > 1:
        train_data_loader = torch_xla.distributed.parallel_loader.ParallelLoader(train_data_loader, [device])
        train_data_loader = train_data_loader.per_device_loader(device)


    trainer = BasicTrainer(model, train_data_loader, val_data_loader, device, args.transformer_model_pretrained_path, \
                               final_activation=final_activation, \
                               test_data_loader=val_data_loader)

    param_optimizer = list(trainer.model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(train_data_loader) * args.epochs)

    if _torch_tpu_available and args.use_TPU:
        optimizer = AdamW(optimizer_parameters, lr=args.lr*xm.xrt_world_size())
    else:
        optimizer = AdamW(optimizer_parameters, lr=args.lr)

    if args.use_apex and _has_apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    
    loss = losses.get_loss(args.loss_function)
    scorer = scorers.SKMetric(args.metric, convert=convert_output, reshape=reshape) 
    
    def _mp_fn(rank, flags, trainer, epochs, lr, metric, loss_function, optimizer, scheduler, model_save_path, num_gpus, num_tpus,  \
                max_grad_norm, early_stopping_rounds, snapshot_ensemble, is_amp, use_wandb, seed):
        torch.set_default_tensor_type('torch.FloatTensor')
        a = trainer.train(epochs, lr, metric, loss_function, optimizer, scheduler, model_save_path, num_gpus, num_tpus,  \
                max_grad_norm, early_stopping_rounds, snapshot_ensemble, is_amp, use_wandb, seed)

    FLAGS = {}
    if _torch_tpu_available and args.use_TPU:
        xmp.spawn(_mp_fn, args=(FLAGS, trainer, args.epochs, args.lr, scorer, loss, optimizer, scheduler, args.model_save_path, args.num_gpus, args.num_tpus, \
                 1, 3, False, args.use_apex, False, args.seed), nprocs=8, start_method='fork')
    else:
        use_wandb = _has_wandb and args.wandb_logging
        trainer.train(args.epochs, args.lr, scorer, loss, optimizer, scheduler, args.model_save_path, args.num_gpus, args.num_tpus,  \
                max_grad_norm=1, early_stopping_rounds=3, snapshot_ensemble=False, is_amp=args.use_apex, use_wandb=use_wandb, seed=args.seed)

elif args.use_lightning_trainer and _torch_lightning_available:
    from pytorch_lightning import Trainer, seed_everything
    seed_everything(args.seed)
    
    loss = losses.get_loss(args.loss_function)
    scorer = scorers.PLMetric(args.metric, convert=convert_output, reshape=reshape)
    
    log_args = {'description': args.transformer_model_pretrained_path, 'loss': loss.__class__.__name__, 'epochs': args.epochs, 'learning_rate': args.lr}

    if _has_wandb and not _torch_tpu_available and args.wandb_logging:
        wandb.init(project="Project",config=log_args)
        wandb_logger = WandbLogger()

    checkpoint_callback = ModelCheckpoint(
                filepath=args.model_save_path,
                save_top_k=1,
                verbose=True,
                monitor='val_loss',
                mode='min'
                )
    earlystop = EarlyStopping(
                monitor='val_loss',
                patience=3,
               verbose=False,
               mode='min'
               )

    if args.use_gpu and _torch_gpu_available:
        print ("using GPU")
        if args.wandb_logging:
            if _has_apex:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, logger=wandb_logger, precision=16, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, logger=wandb_logger, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            if _has_apex:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, precision=16, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    elif args.use_TPU and _torch_tpu_available:
        print ("using TPU")
        if _has_apex:
            trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    else:
        print ("using CPU")
        if args.wandb_logging:
            if _has_apex:
                trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
        else:
            if _has_apex:
                trainer = Trainer(max_epochs=args.epochs, precision=16, \
                        checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(max_epochs=args.epochs, checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

    num_train_steps = int(len(train_data_loader) * args.epochs)

    pltrainer = PLTrainer(num_train_steps, model, scorer, loss, args.lr, \
                          final_activation=final_activation, seed=42)

    #try:
    #    print ("Loaded model from previous checkpoint")
    #    pltrainer = PLTrainer.load_from_checkpoint(args.model_save_path)
    #except:
    #    pass

    trainer.fit(pltrainer, train_data_loader, val_data_loader) 

GPU available: False, used: False
I0806 20:58:47.524595 4680543680 distributed.py:29] GPU available: False, used: False
TPU available: False, using: 0 TPU cores
I0806 20:58:47.533794 4680543680 distributed.py:29] TPU available: False, using: 0 TPU cores


using CPU
[LOG] Total number of parameters to learn 124648708



  | Name   | Type             | Params
--------------------------------------------
0 | model  | TransformerModel | 124 M 
1 | metric | PLMetric         | 0     
I0806 20:58:48.175644 4680543680 lightning.py:1495] 
  | Name   | Type             | Params
--------------------------------------------
0 | model  | TransformerModel | 124 M 
1 | metric | PLMetric         | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

val loss = 0.338 val metric = 0.939 




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00000: val_loss reached 0.53188 (best 0.53188), saving model to ../models/nli/epoch=0.ckpt as top 1
I0806 21:04:50.426769 4680543680 model_checkpoint.py:346] 
Epoch 00000: val_loss reached 0.53188 (best 0.53188), saving model to ../models/nli/epoch=0.ckpt as top 1


val loss = 0.532 val metric = 0.785 




Train loss = 0.413 Train metric = 0.841


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00001: val_loss  was not in top 1
I0806 21:10:30.835139 4680543680 model_checkpoint.py:314] 
Epoch 00001: val_loss  was not in top 1


val loss = 0.645 val metric = 0.792 
Train loss = 0.203 Train metric = 0.922



In [43]:
from tqdm import tqdm

test_output2 = []

for val_batch in tqdm(val_data_loader):
    out = pltrainer(val_batch).detach().cpu().numpy()
    test_output2.extend(out.tolist())
    
#test_output2 = np.concatenate(test_output2)


  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:03<00:44,  3.67s/it][A
 15%|█▌        | 2/13 [00:06<00:36,  3.30s/it][A
 23%|██▎       | 3/13 [00:08<00:31,  3.11s/it][A
 31%|███       | 4/13 [00:11<00:26,  2.91s/it][A
 38%|███▊      | 5/13 [00:13<00:22,  2.75s/it][A
 46%|████▌     | 6/13 [00:15<00:18,  2.62s/it][A
 54%|█████▍    | 7/13 [00:18<00:15,  2.55s/it][A
 62%|██████▏   | 8/13 [00:20<00:12,  2.48s/it][A
 69%|██████▉   | 9/13 [00:22<00:09,  2.44s/it][A
 77%|███████▋  | 10/13 [00:25<00:07,  2.41s/it][A
 85%|████████▍ | 11/13 [00:27<00:04,  2.41s/it][A
 92%|█████████▏| 12/13 [00:30<00:02,  2.39s/it][A
100%|██████████| 13/13 [00:31<00:00,  2.40s/it][A


In [44]:
test_output1 = np.array(test_output1).argmax(-1)
test_output2 = np.array(test_output2).argmax(-1)

In [45]:
idx2label = {i:w for (w,i) in label2idx.items()}
idx2label

{0: 'entailment', 1: 'neutral', 2: 'contradiction', 3: '-'}

In [48]:
val_df['prediction1'] = [idx2label[i] for i in test_output1]
val_df['prediction2'] = [idx2label[i] for i in test_output2]
val_df['label'] = [idx2label[val_df.label.iloc[i]] for i in range(val_df.shape[0])]
val_df.head(10)

Unnamed: 0,premise,hypothesis,label,premise_transitions,hypothesis_transitions,prediction1,prediction2
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi...",neutral,neutral
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",contradiction,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi...",contradiction,contradiction
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",entailment,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi...",entailment,entailment
3,Children smiling and waving at camera,They are smiling at their parents,neutral,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi...",neutral,neutral
4,Children smiling and waving at camera,There are children present,entailment,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi...",entailment,entailment
5,Children smiling and waving at camera,The kids are frowning,contradiction,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi...",contradiction,contradiction
6,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,contradiction,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi...",contradiction,contradiction
7,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,entailment,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi...",entailment,entailment
8,A boy is jumping on skateboard in the middle o...,The boy is wearing safety equipment.,neutral,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi...",neutral,neutral
9,An older man sits with his orange juice at a s...,An older man drinks his juice as he waits for ...,neutral,"[shift, shift, shift, shift, shift, shift, shi...","[shift, shift, shift, shift, shift, shift, shi...",neutral,neutral


In [49]:
from sklearn.metrics import classification_report

report = classification_report(val_df.label, val_df.prediction1)
print (report)

               precision    recall  f1-score   support

            -       0.00      0.00      0.00         1
contradiction       0.88      0.91      0.89        65
   entailment       0.81      0.91      0.86        67
      neutral       0.83      0.72      0.77        67

     accuracy                           0.84       200
    macro avg       0.63      0.63      0.63       200
 weighted avg       0.84      0.84      0.84       200



  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
report = classification_report(val_df.label, val_df.prediction2)
print (report)

               precision    recall  f1-score   support

            -       0.00      0.00      0.00         1
contradiction       0.78      0.91      0.84        65
   entailment       0.83      0.79      0.81        67
      neutral       0.77      0.69      0.72        67

     accuracy                           0.79       200
    macro avg       0.59      0.60      0.59       200
 weighted avg       0.79      0.79      0.79       200

