In [1]:
!pip install transformers



In [2]:
!pip install datasets



In [3]:
from datasets import load_dataset

# Load the "emotion" dataset; the result is a DatasetDict holding the
# train / validation / test splits, each with 'label' and 'text' columns.
dataset = load_dataset("emotion")

Using custom data configuration default
Reusing dataset emotion (/root/.cache/huggingface/datasets/emotion/default/0.0.0/6e4212efe64fd33728549b8f0435c73081391d543b596a05936857df98acb681)


In [4]:
# Show the splits with their columns and row counts. The notebook displays a
# bare trailing expression by itself, so wrapping it in print() is unnecessary.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 2000
    })
})


In [5]:
# Peek at one training example: a dict with an integer 'label' and its raw 'text'.
dataset['train'][100]

{'label': 2,
 'text': 'i wont let me child cry it out because i feel that loving her and lily when she was little was going to be opportunities that only lasted for those short few months'}

In [6]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Name of the pretrained checkpoint; the same name is passed again when the
# model is loaded, so tokenizer and model come from the same source.
checkpoint = "bert-base-uncased"
# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
def tokenize_function(example):
    """Tokenize the 'text' field of `example` with the notebook's `tokenizer`.

    The function is applied through `dataset.map(..., batched=True)`. Inputs
    that are too long for the tokenizer are truncated (`truncation=True`).

    Returns:
        The tokenizer's output for `example['text']`.
    """
    texts = example['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [8]:
# Apply tokenize_function to every split in batches; the tokenizer outputs are
# stored as additional columns next to the original 'label' and 'text'.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/emotion/default/0.0.0/6e4212efe64fd33728549b8f0435c73081391d543b596a05936857df98acb681/cache-c2071efaefea9b32.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/emotion/default/0.0.0/6e4212efe64fd33728549b8f0435c73081391d543b596a05936857df98acb681/cache-9b9f49179f9efae0.arrow


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [9]:
# Inspect the columns of each split after tokenization.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 2000
    })
})

In [10]:
# Distinct label ids that occur in the training split.
set(tokenized_dataset['train']['label'])

{0, 1, 2, 3, 4, 5}

In [11]:
# Batch collator built around the tokenizer; it pads the examples when they
# are assembled into a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
# Display the collator's configuration.
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None)

In [13]:
from transformers import TrainingArguments

# Training configuration with library defaults; "test-trainer" is the
# directory into which checkpoints are written during training.
training_args = TrainingArguments("test-trainer")

In [14]:
from transformers import AutoModelForSequenceClassification

# Derive the label set from the dataset instead of hard-coding the class
# count, and store the id <-> name mapping in the model config so that
# predictions can be reported by name.
label_feature = dataset["train"].features["label"]
id2label = dict(enumerate(label_feature.names))
label2id = {name: idx for idx, name in id2label.items()}

# The classification head is not part of the pretrained checkpoint and is
# newly initialised, so the model has to be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=label_feature.num_classes,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [15]:
from transformers import Trainer

# Assemble the Trainer from the model and its training arguments, the
# tokenized train / validation splits, the padding collator and the tokenizer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

In [16]:
# Fine-tune the model; the returned TrainOutput reports the final step count,
# the average training loss and runtime metrics.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 16000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6000


Step,Training Loss
500,0.7516
1000,0.3577
1500,0.2634
2000,0.2509
2500,0.182
3000,0.1568
3500,0.1391
4000,0.1388
4500,0.0886
5000,0.1126


Saving model checkpoint to test-trainer/checkpoint-500
Configuration saved in test-trainer/checkpoint-500/config.json
Model weights saved in test-trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-1000
Configuration saved in test-trainer/checkpoint-1000/config.json
Model weights saved in test-trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-1500
Configuration saved in test-trainer/checkpoint-1500/config.json
Model weights saved in test-trainer/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1500/tokenizer_config.json
Special tokens file saved

TrainOutput(global_step=6000, training_loss=0.22143900680541992, metrics={'train_runtime': 659.3443, 'train_samples_per_second': 72.8, 'train_steps_per_second': 9.1, 'total_flos': 1302049199981952.0, 'train_loss': 0.22143900680541992, 'epoch': 3.0})

In [17]:
# Run the fine-tuned model over the test split; the result bundles the
# per-class scores, the reference label ids and summary metrics.
predictions = trainer.predict(tokenized_dataset["test"])

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 2000
  Batch size = 8


In [21]:
# Inspect the PredictionOutput (predictions, label_ids, metrics).
predictions

PredictionOutput(predictions=array([[ 8.454301  , -1.8288338 , -1.9460562 , -1.1483477 , -1.6323161 ,
        -1.7061889 ],
       [ 8.478015  , -1.8272443 , -1.7600707 , -1.1059024 , -1.8097972 ,
        -1.8396385 ],
       [ 8.411227  , -1.6476626 , -1.5176547 , -1.4721982 , -1.9037353 ,
        -1.8326322 ],
       ...,
       [-1.7230492 ,  8.401841  , -0.6115043 , -1.8850343 , -2.71189   ,
        -1.2977178 ],
       [-1.8113626 ,  8.315013  , -0.97638655, -1.839709  , -1.8033446 ,
        -1.6238836 ],
       [-1.0787348 , -2.1998155 , -2.1257672 , -2.0551157 ,  5.087704  ,
         4.2144423 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 1, 1, 4]), metrics={'test_loss': 0.19932958483695984, 'test_runtime': 6.0181, 'test_samples_per_second': 332.333, 'test_steps_per_second': 41.542})

In [24]:
import numpy as np
# Predicted class id per test example: the index of the highest score in each
# row of the prediction array.
preds = predictions.predictions.argmax(axis=-1)
preds

array([0, 0, 0, ..., 1, 1, 4])

In [31]:
# Reference label ids of the test split as a NumPy array, so they can be
# compared element-wise with `preds`.
labels = np.array(tokenized_dataset['test']['label'])

In [32]:
# Display the reference labels.
labels

array([0, 0, 0, ..., 1, 1, 4])

In [36]:
# Accuracy: fraction of test examples whose predicted class id equals the
# reference label.
np.mean(preds == labels)

0.9345

In [38]:
from datasets import list_metrics

# List the names of the metrics that `datasets` can load.
list_metrics()

['accuracy',
 'bertscore',
 'bleu',
 'bleurt',
 'cer',
 'comet',
 'coval',
 'cuad',
 'f1',
 'gleu',
 'glue',
 'indic_glue',
 'matthews_correlation',
 'meteor',
 'pearsonr',
 'precision',
 'recall',
 'rouge',
 'sacrebleu',
 'sari',
 'seqeval',
 'spearmanr',
 'squad',
 'squad_v2',
 'super_glue',
 'wer',
 'xnli']

In [39]:
from datasets import load_metric

# Load the 'accuracy' metric object.
# NOTE(review): newer releases of `datasets` may have moved metrics into the
# separate `evaluate` package — confirm against the installed version.
metric = load_metric('accuracy')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1362.0, style=ProgressStyle(description…




In [40]:
# Compute accuracy with the library metric, using the label ids returned by
# trainer.predict as references.
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.9345}

In [71]:
# Classify a single ad-hoc sentence with the fine-tuned model.
p = trainer.predict([tokenizer("Wow what a surprise")])
# Take the argmax over the class axis, then select the only example.
# Without `axis`, np.argmax flattens the (examples x classes) array and returns
# a flat index, which is a class id only while exactly one example is passed.
np.argmax(p.predictions, axis=-1)[0]

***** Running Prediction *****
  Num examples = 1
  Batch size = 8


5

In [65]:
# Look up the schema of the raw dataset; the ClassLabel names give the meaning
# of each integer label id.
dataset['train'].features

{'label': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], names_file=None, id=None),
 'text': Value(dtype='string', id=None)}