<a href="https://colab.research.google.com/github/swilsonmfc/nlp/blob/master/RoBERTa_XLNet_ELECTRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-Training : Model Improvements for BERT
* RoBERTa, ELECTRA and XLNet
* https://ai.googleblog.com/2020/03/more-efficient-nlp-model-pre-training.html

# Install

In [1]:
!pip install transformers
!pip install datasets



# Setup

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from transformers import pipeline

from datasets import load_dataset

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

# Data

In [3]:
emotion = load_dataset('emotion')

Using custom data configuration default
Reusing dataset emotion (/root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
emotion

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

# Parameters

In [5]:
LABELS = 6 
LABEL_NAMES = ['Sad', 'Joy', 'Love', 'Anger', 'Fear', 'Surprise']

# Results

In [6]:
def metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [7]:
results_df = pd.DataFrame(columns=['Accuracy', 'F1', 'Precision', 'Recall'])

# BERT
* Bi-directional Encoders Representations from Transformers
* https://arxiv.org/abs/1810.04805

## Approach

### Masked Language Modeling
* Bidirectional approach (can't use causal / autoregressive)
* Masked Language Model
  * Randomly mask 15% of tokens
  * 80% replace with [MASK]
  * 10% use original word
  * 10% replace word randomly
* Process
  * Mask tokens in pre-training
  * Duplicate data 10 times masking differently each time


In [8]:
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Tokenize

In [9]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)
emotion_enc = emotion.map(tokenize, batched=True, batch_size=None)

Loading cached processed dataset at /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-6a615c61d7fc36c4.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-2a190f2f4565286c.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
emotion_enc

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

## Model

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=LABELS)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
emotion_enc['train'].features

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None),
 'text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

## Train

In [13]:
BATCH_SIZE = 64
logging_steps = len(emotion_enc['train']) // BATCH_SIZE
training_args = TrainingArguments(output_dir="results-bert",
                                  num_train_epochs=10,
                                  learning_rate=1e-5,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  disable_tqdm=False)

In [14]:
trainer = Trainer(model=model, 
                  args=training_args,
                  compute_metrics=metrics,
                  train_dataset=emotion_enc['train'],
                  eval_dataset=emotion_enc['validation'])
trainer.train();

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16000
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2500


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.514073,0.838,0.826175,0.845074,0.838
2,0.738600,0.230451,0.9225,0.922803,0.925956,0.9225
3,0.738600,0.184923,0.931,0.931063,0.932535,0.931
4,0.158800,0.161651,0.9305,0.930125,0.931609,0.9305
5,0.158800,0.169025,0.934,0.934095,0.93503,0.934
6,0.103000,0.177586,0.9325,0.932268,0.933726,0.9325
7,0.103000,0.183072,0.929,0.928559,0.929199,0.929
8,0.082300,0.177397,0.9315,0.931676,0.93235,0.9315
9,0.082300,0.182376,0.933,0.933028,0.933209,0.933
10,0.067800,0.183335,0.9305,0.930553,0.930709,0.9305


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to results-bert/checkpoint-250
Configuration saved in results-bert/checkpoint-250/config.json
Model weights saved in results-bert/checkpoint-250/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to results-bert/checkpoint-500
Configuration saved in results-bert/checkpoint-500/config.json
Model weights saved in results-bert

## Evaluate

In [15]:
preds = trainer.predict(emotion_enc['test'])
results_df.loc['BERT-FineTune'] = [preds.metrics['test_accuracy'],
                                   preds.metrics['test_f1'],
                                   preds.metrics['test_precision'],
                                   preds.metrics['test_recall']]
preds.metrics

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2000
  Batch size = 64


{'test_accuracy': 0.921,
 'test_f1': 0.9210334514551484,
 'test_loss': 0.17918971180915833,
 'test_precision': 0.9221429844823874,
 'test_recall': 0.921,
 'test_runtime': 4.5536,
 'test_samples_per_second': 439.213,
 'test_steps_per_second': 7.027}

In [16]:
y_true  = np.array(emotion_enc['test']['label'])
y_pred = np.argmax(preds.predictions, axis=1)
print(classification_report(y_pred, y_true, target_names=LABEL_NAMES)) 

              precision    recall  f1-score   support

         Sad       0.96      0.97      0.96       576
         Joy       0.95      0.94      0.94       706
        Love       0.77      0.85      0.81       144
       Anger       0.89      0.95      0.92       258
        Fear       0.92      0.85      0.88       243
    Surprise       0.79      0.71      0.75        73

    accuracy                           0.92      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.92      0.92      0.92      2000



# RoBERTa
* A Robustly Optimized BERT Pre-Training Approach
* https://arxiv.org/abs/1907.11692


## Improvements

### Static vs Dynamic Masking
* BERT
  * Performed static masking in pre-processing
  * Duplicated the data 10 times, changing the mask in each avoiding it from being duplicated in each epoch
  * Trained for 40 epochs - each example seen 4 times
* RoBERTa
  * Uses dynamic masking (as we feed the sequence to the model)
  * Becomes more important as pre-training for more steps and on larger data sizes
  * Performs at least as good or not better than static

### Encoding / Tokenizing
* BERT
  * BPE (Byte-Pair Encoding) 30K 
  * Unicode at the base subword
* RoBERTa 
  * BPE (Byte-Pair Encoding) 50K <-- GPT
  * Uses bytes as base subword

### Next Sentence Prediction
* Tests
  * Segment Pair with NSP Loss
  * Sentence Pair with NSP Loss
  * Full Sentences without NSP Loss
  * Document Sentences without NSP Loss
* RoBERTa removes the NSP task
  * Improves SQuAD, MNLI, SST, RACE
  * Although Doc performs best, decide on Full for efficiency

### Batch Sizes
* BERT trained batch size 256 for 1MM steps
* RoBERTa increases batch sizes to 2K
  * Equivalent of 125 steps
* Large batches
  * Improve perplexity of MLM & downstream accuracy
  * Easier to parallelize

### More Training Data
* BERT used 16GB (BookCorpus and English Wikipedia)
* RoBERTa adds CC-News, OpenWebText & Stories increasing to 160 GB
* Paper looks at the effect separately

In [17]:
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 1,
  "use_cach

## Tokenize

In [18]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)
emotion_enc = emotion.map(tokenize, batched=True, batch_size=None)

Loading cached processed dataset at /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-6b7a7b471ba45fcf.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-0f8e5a90c57f959f.arrow


In [19]:
emotion_enc

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

## Model

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=LABELS)

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attenti

## Train

In [21]:
BATCH_SIZE = 64
logging_steps = len(emotion_enc['train']) // BATCH_SIZE
training_args = TrainingArguments(output_dir="results-roberta",
                                  num_train_epochs=10,
                                  learning_rate=1e-5,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  disable_tqdm=False)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [22]:
trainer = Trainer(model=model, 
                  args=training_args,
                  compute_metrics=metrics,
                  train_dataset=emotion_enc['train'],
                  eval_dataset=emotion_enc['validation'])
trainer.train();

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16000
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2500


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.378601,0.877,0.874774,0.876456,0.877
2,0.646100,0.224999,0.9205,0.921233,0.924811,0.9205
3,0.646100,0.19567,0.9235,0.924328,0.927642,0.9235
4,0.185100,0.164863,0.935,0.934901,0.936331,0.935
5,0.185100,0.153614,0.9335,0.933773,0.935163,0.9335
6,0.125500,0.159684,0.932,0.932678,0.934744,0.932
7,0.125500,0.139564,0.9405,0.940321,0.940314,0.9405
8,0.100500,0.141773,0.9405,0.94091,0.942376,0.9405
9,0.100500,0.142601,0.938,0.938398,0.939283,0.938
10,0.085500,0.144116,0.9385,0.938847,0.940079,0.9385


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to results-roberta/checkpoint-250
Configuration saved in results-roberta/checkpoint-250/config.json
Model weights saved in results-roberta/checkpoint-250/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to results-roberta/checkpoint-500
Configuration saved in results-roberta/checkpoint-500/config.json
Model we

## Evaluate

In [23]:
preds = trainer.predict(emotion_enc['test'])
results_df.loc['RoBERTa-FineTune'] = [preds.metrics['test_accuracy'],
                                   preds.metrics['test_f1'],
                                   preds.metrics['test_precision'],
                                   preds.metrics['test_recall']]
preds.metrics

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2000
  Batch size = 64


{'test_accuracy': 0.926,
 'test_f1': 0.9266934710705544,
 'test_loss': 0.17034657299518585,
 'test_precision': 0.9299004202665879,
 'test_recall': 0.926,
 'test_runtime': 4.2648,
 'test_samples_per_second': 468.958,
 'test_steps_per_second': 7.503}

In [24]:
y_true  = np.array(emotion_enc['test']['label'])
y_pred = np.argmax(preds.predictions, axis=1)
print(classification_report(y_pred, y_true, target_names=LABEL_NAMES)) 

              precision    recall  f1-score   support

         Sad       0.96      0.96      0.96       584
         Joy       0.92      0.98      0.95       651
        Love       0.96      0.78      0.86       195
       Anger       0.91      0.93      0.92       270
        Fear       0.91      0.85      0.88       240
    Surprise       0.71      0.78      0.75        60

    accuracy                           0.93      2000
   macro avg       0.90      0.88      0.89      2000
weighted avg       0.93      0.93      0.93      2000



# XLNet
* Generalized Autoregressive Pretraining for Lanugage Understanding
* https://arxiv.org/abs/1906.08237

## Improvements

### Combining AR with MLM
* XLNet captures dependencies between words that BERT cannot
* Paper Example
  * New York is a city --> [MASK] [MASK] is a city
  * BERT could find Los York is a city
  * BERT : log p(New | is a city) + log p(York | is a city)
  * XLNet: log p(New | is a city) + log p(York | New, is a city)
* Consider all possible orderings
  * Randomly shuffle owrdering of words
  * See many variants, and maintain auto-regressive approach

### Borrowing from Transformer-XL
* Relative positional encoding
* Memory blocks to move from step to step
* Helps with longer content

![](https://miro.medium.com/max/1400/1*RGdAU7tXXKqckbDjhoIWmQ.png)

### Outperformed BERT
* SOTA on 18 of 20 tasks

In [33]:
model_name = 'xlnet-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Could not locate the tokenizer configuration file, will try to use the model config instead.
https://huggingface.co/xlnet-base-cased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpywa5ama_


Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

storing https://huggingface.co/xlnet-base-cased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/06bdb0f5882dbb833618c81c3b4c996a0c79422fa2c95ffea3827f92fc2dba6b.da982e2e596ec73828dbae86525a1870e513bd63aae5a2dc773ccc840ac5c346
creating metadata file for /root/.cache/huggingface/transformers/06bdb0f5882dbb833618c81c3b4c996a0c79422fa2c95ffea3827f92fc2dba6b.da982e2e596ec73828dbae86525a1870e513bd63aae5a2dc773ccc840ac5c346
loading configuration file https://huggingface.co/xlnet-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/06bdb0f5882dbb833618c81c3b4c996a0c79422fa2c95ffea3827f92fc2dba6b.da982e2e596ec73828dbae86525a1870e513bd63aae5a2dc773ccc840ac5c346
Model config XLNetConfig {
  "_name_or_path": "xlnet-base-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_to

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

storing https://huggingface.co/xlnet-base-cased/resolve/main/spiece.model in cache at /root/.cache/huggingface/transformers/df73bc9f8d13bf2ea4dab95624895e45a550a0f0a825e41fc25440bf367ee3c8.d93497120e3a865e2970f26abdf7bf375896f97fde8b874b70909592a6c785c9
creating metadata file for /root/.cache/huggingface/transformers/df73bc9f8d13bf2ea4dab95624895e45a550a0f0a825e41fc25440bf367ee3c8.d93497120e3a865e2970f26abdf7bf375896f97fde8b874b70909592a6c785c9
https://huggingface.co/xlnet-base-cased/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpkzb2skcu


Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

storing https://huggingface.co/xlnet-base-cased/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/46f47734f3dcaef7e236b9a3e887f27814e18836a8db7e6a49148000058a1a54.2a683f915238b4f560dab0c724066cf0a7de9a851e96b0fb3a1e7f0881552f53
creating metadata file for /root/.cache/huggingface/transformers/46f47734f3dcaef7e236b9a3e887f27814e18836a8db7e6a49148000058a1a54.2a683f915238b4f560dab0c724066cf0a7de9a851e96b0fb3a1e7f0881552f53
loading file https://huggingface.co/xlnet-base-cased/resolve/main/spiece.model from cache at /root/.cache/huggingface/transformers/df73bc9f8d13bf2ea4dab95624895e45a550a0f0a825e41fc25440bf367ee3c8.d93497120e3a865e2970f26abdf7bf375896f97fde8b874b70909592a6c785c9
loading file https://huggingface.co/xlnet-base-cased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/46f47734f3dcaef7e236b9a3e887f27814e18836a8db7e6a49148000058a1a54.2a683f915238b4f560dab0c724066cf0a7de9a851e96b0fb3a1e7f0881552f53
loading file https://hugg

## Tokenize

In [34]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)
emotion_enc = emotion.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [35]:
emotion_enc

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

## Model

In [36]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=LABELS)

loading configuration file https://huggingface.co/xlnet-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/06bdb0f5882dbb833618c81c3b4c996a0c79422fa2c95ffea3827f92fc2dba6b.da982e2e596ec73828dbae86525a1870e513bd63aae5a2dc773ccc840ac5c346
Model config XLNetConfig {
  "_name_or_path": "xlnet-base-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_laye

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

storing https://huggingface.co/xlnet-base-cased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/9461853998373b0b2f8ef8011a13b62a2c5f540b2c535ef3ea46ed8a062b16a9.3e214f11a50e9e03eb47535b58522fc3cc11ac67c120a9450f6276de151af987
creating metadata file for /root/.cache/huggingface/transformers/9461853998373b0b2f8ef8011a13b62a2c5f540b2c535ef3ea46ed8a062b16a9.3e214f11a50e9e03eb47535b58522fc3cc11ac67c120a9450f6276de151af987
loading weights file https://huggingface.co/xlnet-base-cased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9461853998373b0b2f8ef8011a13b62a2c5f540b2c535ef3ea46ed8a062b16a9.3e214f11a50e9e03eb47535b58522fc3cc11ac67c120a9450f6276de151af987
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on 

## Train

In [37]:
BATCH_SIZE = 64
logging_steps = len(emotion_enc['train']) // BATCH_SIZE
training_args = TrainingArguments(output_dir="results-xlnet",
                                  num_train_epochs=10,
                                  learning_rate=1e-5,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  disable_tqdm=False)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [38]:
trainer = Trainer(model=model, 
                  args=training_args,
                  compute_metrics=metrics,
                  train_dataset=emotion_enc['train'],
                  eval_dataset=emotion_enc['validation'])
trainer.train();

The following columns in the training set  don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLNetForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16000
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2500


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.602427,0.777,0.75793,0.776453,0.777
2,0.807100,0.325453,0.8825,0.883109,0.890113,0.8825
3,0.807100,0.22942,0.9045,0.904842,0.906441,0.9045
4,0.238300,0.194307,0.924,0.923306,0.924084,0.924
5,0.238300,0.195522,0.9205,0.921556,0.924902,0.9205
6,0.147700,0.17939,0.932,0.932494,0.934272,0.932
7,0.147700,0.173508,0.928,0.927911,0.928287,0.928
8,0.111500,0.177625,0.927,0.927647,0.929252,0.927
9,0.111500,0.179483,0.929,0.928998,0.929292,0.929
10,0.094400,0.17272,0.931,0.931375,0.932419,0.931


The following columns in the evaluation set  don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLNetForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to results-xlnet/checkpoint-250
Configuration saved in results-xlnet/checkpoint-250/config.json
Model weights saved in results-xlnet/checkpoint-250/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLNetForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to results-xlnet/checkpoint-500
Configuration saved in results-xlnet/checkpoint-500/config.json
Model weights saved in res

## Evaluate

In [39]:
preds = trainer.predict(emotion_enc['test'])
results_df.loc['XLNet-FineTune'] = [preds.metrics['test_accuracy'],
                                   preds.metrics['test_f1'],
                                   preds.metrics['test_precision'],
                                   preds.metrics['test_recall']]
preds.metrics

The following columns in the test set  don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLNetForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2000
  Batch size = 64


{'test_accuracy': 0.921,
 'test_f1': 0.9220821506638656,
 'test_loss': 0.19725088775157928,
 'test_precision': 0.9257849597454793,
 'test_recall': 0.921,
 'test_runtime': 6.5745,
 'test_samples_per_second': 304.207,
 'test_steps_per_second': 4.867}

In [40]:
y_true  = np.array(emotion_enc['test']['label'])
y_pred = np.argmax(preds.predictions, axis=1)
print(classification_report(y_pred, y_true, target_names=LABEL_NAMES)) 

              precision    recall  f1-score   support

         Sad       0.96      0.96      0.96       585
         Joy       0.93      0.97      0.95       669
        Love       0.89      0.82      0.85       173
       Anger       0.91      0.90      0.91       277
        Fear       0.82      0.91      0.86       202
    Surprise       0.94      0.66      0.78        94

    accuracy                           0.92      2000
   macro avg       0.91      0.87      0.88      2000
weighted avg       0.92      0.92      0.92      2000



# ELECTRA
* Pre-Training Text Encoders as Discriminators Rather than Generators
* https://arxiv.org/abs/2003.10555




## Improvements

### Corrupt Token Detection
* Rather than use masked language modeling
* Corrupt tokens in the input
* Solves the mismatch between [MASK] in training and not in test data

### Generator - Discriminator
* Discriminator and Generator equates to twice the parameters and size / compute
  * Best results obtained using a small MLM to corrupt tokens
  * Finds a generator 1/4 to 1/2 of the size performs well
* Outperforms using a Language Model

![](https://1.bp.blogspot.com/-sHybc03nJRo/XmfLongdVYI/AAAAAAAAFbI/a0t5w_zOZ-UtxYaoQlVkmTRsyFJyFddtQCLcBGAsYHQ/s640/image1.png)

### Not Quite a GAN
* When generator produces the correct token, it's considered real not a corrupted token
* No noise vector is added to the generator


### Faster Convergence
* ELECTRA Small - 1 GPU in 4 Days
* ELECTRA Small outperforms GPT & BERT Small
* Outperforms RoBERTa and XLNet with fewer parameters and 1/4 the compute

![](https://1.bp.blogspot.com/-H9nbLel4KvY/XmfMV_bD_7I/AAAAAAAAFbQ/c2aHgsCeoEs6M6Hm18EE07HdwKY-2x9nwCLcBGAsYHQ/s640/image2.png)

## Tokenize

In [25]:
model_name = 'google/electra-base-discriminator'
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file https://huggingface.co/google/electra-base-discriminator/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/7d1569a4df2372d67341bda716bce4e3edf3e3ffadb97251bc4b6b35d459f624.57c13443a51769ce892714c93bb3ee3952bad66d7d9662d9de382b808377c3f8
Model config ElectraConfig {
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_ve

In [26]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)
emotion_enc = emotion.map(tokenize, batched=True, batch_size=None)

Loading cached processed dataset at /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-b5da8b86996d8144.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-17dc7671cf79fba3.arrow


## Model

In [27]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=LABELS)

loading configuration file https://huggingface.co/google/electra-base-discriminator/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/7d1569a4df2372d67341bda716bce4e3edf3e3ffadb97251bc4b6b35d459f624.57c13443a51769ce892714c93bb3ee3952bad66d7d9662d9de382b808377c3f8
Model config ElectraConfig {
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type

## Train

In [28]:
BATCH_SIZE = 64
logging_steps = len(emotion_enc['train']) // BATCH_SIZE
training_args = TrainingArguments(output_dir="results-electra",
                                  num_train_epochs=10,
                                  learning_rate=1e-5,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  disable_tqdm=False)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [29]:
trainer = Trainer(model=model, 
                  args=training_args,
                  compute_metrics=metrics,
                  train_dataset=emotion_enc['train'],
                  eval_dataset=emotion_enc['validation'])
trainer.train();

The following columns in the training set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text. If text are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16000
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2500


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.985671,0.6055,0.48238,0.448588,0.6055
2,1.012700,0.426771,0.887,0.880329,0.890379,0.887
3,1.012700,0.248832,0.9225,0.922753,0.925354,0.9225
4,0.288000,0.210988,0.934,0.933812,0.934059,0.934
5,0.288000,0.19251,0.924,0.924834,0.926927,0.924
6,0.161800,0.169732,0.9365,0.936972,0.938768,0.9365
7,0.161800,0.168509,0.934,0.934027,0.934469,0.934
8,0.118500,0.168234,0.933,0.933337,0.934363,0.933
9,0.118500,0.165572,0.934,0.934603,0.935916,0.934
10,0.100400,0.164854,0.937,0.937463,0.938554,0.937


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text. If text are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to results-electra/checkpoint-250
Configuration saved in results-electra/checkpoint-250/config.json
Model weights saved in results-electra/checkpoint-250/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text. If text are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to results-electra/checkpoint-500
Configuration saved

## Evaluate

In [30]:
preds = trainer.predict(emotion_enc['test'])
results_df.loc['ELECTRA-FineTune'] = [preds.metrics['test_accuracy'],
                                   preds.metrics['test_f1'],
                                   preds.metrics['test_precision'],
                                   preds.metrics['test_recall']]
preds.metrics

The following columns in the test set  don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: text. If text are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2000
  Batch size = 64


{'test_accuracy': 0.9275,
 'test_f1': 0.9280471468049704,
 'test_loss': 0.17102466523647308,
 'test_precision': 0.9299815941349044,
 'test_recall': 0.9275,
 'test_runtime': 4.5415,
 'test_samples_per_second': 440.38,
 'test_steps_per_second': 7.046}

In [31]:
y_true  = np.array(emotion_enc['test']['label'])
y_pred = np.argmax(preds.predictions, axis=1)
print(classification_report(y_pred, y_true, target_names=LABEL_NAMES)) 

              precision    recall  f1-score   support

         Sad       0.96      0.96      0.96       582
         Joy       0.93      0.97      0.95       663
        Love       0.91      0.79      0.84       183
       Anger       0.92      0.93      0.92       272
        Fear       0.92      0.86      0.89       240
    Surprise       0.71      0.78      0.75        60

    accuracy                           0.93      2000
   macro avg       0.89      0.88      0.89      2000
weighted avg       0.93      0.93      0.93      2000



# Classification

In [53]:
LABEL_NAMES

['Sad', 'Joy', 'Love', 'Anger', 'Fear', 'Surprise']

In [69]:
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0, return_all_scores=True)

In [70]:
classifier('I was really okay with the outcome')

[[{'label': 'LABEL_0', 'score': 0.00023302437330130488},
  {'label': 'LABEL_1', 'score': 0.9991430044174194},
  {'label': 'LABEL_2', 'score': 0.00012993838754482567},
  {'label': 'LABEL_3', 'score': 0.0002640315506141633},
  {'label': 'LABEL_4', 'score': 0.00015110323147382587},
  {'label': 'LABEL_5', 'score': 7.889809785410762e-05}]]

In [71]:
classifier('I ran down a dark alley and was')

[[{'label': 'LABEL_0', 'score': 0.006133202463388443},
  {'label': 'LABEL_1', 'score': 0.004056353121995926},
  {'label': 'LABEL_2', 'score': 0.00025297998217865825},
  {'label': 'LABEL_3', 'score': 0.05238369479775429},
  {'label': 'LABEL_4', 'score': 0.9321501851081848},
  {'label': 'LABEL_5', 'score': 0.005023508798331022}]]

# Fine-Tuning Tips

# Comparison

In [43]:
results_df.sort_values('F1', ascending=False)

Unnamed: 0,Accuracy,F1,Precision,Recall
ELECTRA-FineTune,0.9275,0.928047,0.929982,0.9275
RoBERTa-FineTune,0.926,0.926693,0.9299,0.926
XLNet-FineTune,0.921,0.922082,0.925785,0.921
BERT-FineTune,0.921,0.921033,0.922143,0.921
