In [None]:
# Mount Google Drive inside the Colab VM; the fine-tuned models are written to
# and read back from Drive further down in this notebook.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics  # explicit: `import sklearn` alone does not guarantee the submodule is loaded
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast, TFAutoModelForSequenceClassification
from transformers import create_optimizer

## 1. Load Data

In [None]:
# Load the SST-2 configuration of the GLUE benchmark as a DatasetDict.
data_sst2 = load_dataset("glue", "sst2")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Show the available splits with their columns and row counts.
data_sst2

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
# Peek at the first training example (sentence, label, idx).
data_sst2['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [None]:
# Inspect the column types; `label` is a ClassLabel (0 = negative, 1 = positive).
data_sst2['train'].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

## 1. Train on SST2 and Test on SST2

In [None]:
# Tokenizer-agnostic helper; used below with "distilbert-base-uncased" and "distilroberta-base".
def preprocess(data, tokenizer):
  """Tokenize the 'sentence' entry of `data` with truncation enabled.

  Args:
    data: mapping that holds the text under the 'sentence' key (a list of
      strings when invoked through `Dataset.map(..., batched=True)`).
    tokenizer: callable tokenizer that accepts a `truncation` keyword.

  Returns:
    Whatever `tokenizer` returns for the given sentence(s).
  """
  sentences = data['sentence']
  encoded = tokenizer(sentences, truncation=True)
  return encoded

### 1.1 DistilBERT (Model v1)

We can use the `map` method of our dataset object to apply the `preprocess` function above to every example in every split.

Note that we pass `batched=True` so that the texts are encoded in batches. This takes full advantage of the fast tokenizer loaded in the next cell, which uses multi-threading to process the texts of a batch concurrently.

In [None]:
# Checkpoint fine-tuned in this section (model v1).
base_model = "distilbert-base-uncased"
tokenizer_v1 = AutoTokenizer.from_pretrained(base_model)

# Tokenize every split in batches, handing the tokenizer to `preprocess`
# through fn_kwargs.
dataset_enc = data_sst2.map(
    preprocess,
    fn_kwargs=dict(tokenizer=tokenizer_v1),
    batched=True,
)

# Display the schema after tokenization (input_ids / attention_mask were added).
dataset_enc["train"].features



  0%|          | 0/1 [00:00<?, ?ba/s]



{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [None]:
# Peek at the first encoded training example, including its token ids and attention mask.
dataset_enc["train"][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0,
 'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Confirm the label column is still a two-class ClassLabel after tokenization.
dataset_enc["train"].features["label"]

ClassLabel(names=['negative', 'positive'], id=None)

Convert the datasets to `tf.data.Dataset` objects so that Keras can consume them.

In [None]:
# Load the pretrained checkpoint with a 2-class sequence-classification head.
model_v1 = TFAutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2)

# Training pipeline: shuffled, batches of 16.
tf_train_dataset = model_v1.prepare_tf_dataset(
    dataset_enc["train"],
    shuffle=True,
    batch_size=16,
    tokenizer=tokenizer_v1
)

# Validation pipeline: order preserved so predictions line up with the
# labels in dataset_enc["validation"].
tf_validation_dataset = model_v1.prepare_tf_dataset(
    dataset_enc["validation"],
    shuffle=False,
    batch_size=16,
    tokenizer=tokenizer_v1,
)

# NOTE: no tf_test_dataset is built because the labels of the test split are
# hidden, so it cannot be used for evaluation.
# (This note used to be a bare string literal, which made the cell echo the
# commented-out code as its output.)
# tf_train_dataset and tf_validation_dataset can be passed to model.fit / model.predict.

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_layer_norm', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_19', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

' We cant use test, since labels are hidden\ntf_test_dataset = model_v1.prepare_tf_dataset(\n    dataset_enc["test"],\n    shuffle=False,\n    batch_size=16,\n    tokenizer=tokenizer_v1,\n)'

In [None]:
# Training hyper-parameters for model v1.
batch_size, num_epochs = 16, 3

# Number of optimizer steps over the whole run, used to size the LR schedule.
batches_per_epoch = len(dataset_enc["train"]) // batch_size
total_train_steps = int(num_epochs * batches_per_epoch)

# Build the optimizer together with its learning-rate schedule (no warm-up).
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
)

# No loss is passed: the model falls back to its internal loss computation.
model_v1.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# Where the per-epoch weight checkpoints are written.
checkpoint_path = "training_bert_sst2/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that stores the model weights (weights only, not the full model)
# and logs each save.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
)

# Fine-tune model v1 on the SST-2 training pipeline.
model_v1.fit(tf_train_dataset, callbacks=[cp_callback], epochs=num_epochs)

Epoch 1/3
Epoch 1: saving model to training_bert_sst2/cp.ckpt
Epoch 2/3
Epoch 2: saving model to training_bert_sst2/cp.ckpt
Epoch 3/3
Epoch 3: saving model to training_bert_sst2/cp.ckpt


<keras.callbacks.History at 0x7f33b60c6110>

In [None]:
# Evaluate model performace
pred = model_v1.predict(tf_validation_dataset)
pred_labels = np.argmax(pred.logits, axis=-1)
acc = sklearn.metrics.accuracy_score(pred_labels, dataset_enc["validation"]['label'])
print(f"Model accuracy is {acc}")

Model accuracy is 0.9151376146788991


In [None]:
# save model
model_v1.save_pretrained('drive/MyDrive/FIRE/OOD/train_on_sst2/bert')

In [None]:
# load model
new_model = TFAutoModelForSequenceClassification.from_pretrained('drive/MyDrive/FIRE/OOD/train_on_sst2/bert')
#new_model.load_weights('drive/MyDrive/FIRE/OOD/train_on_sst2/bert')
pred = new_model.predict(tf_validation_dataset)
pred_labels = np.argmax(pred.logits, axis=-1)
acc = sklearn.metrics.accuracy_score(pred_labels, dataset_enc["validation"]['label'])
print(f"Model accuracy is {acc}")

Some layers from the model checkpoint at drive/MyDrive/FIRE/OOD/train_on_sst2/bert were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at drive/MyDrive/FIRE/OOD/train_on_sst2/bert and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model accuracy is 0.9151376146788991


### 1.2 DistilRoBERTa (Model v2)

In [None]:
# Checkpoint fine-tuned in this section (model v2).
base_model = "distilroberta-base"
tokenizer_v2 = AutoTokenizer.from_pretrained(base_model)

# Re-tokenize the raw SST-2 splits with the RoBERTa tokenizer; this replaces
# the DistilBERT encoding previously held in `dataset_enc`.
dataset_enc = data_sst2.map(
    preprocess,
    fn_kwargs=dict(tokenizer=tokenizer_v2),
    batched=True,
)

# Display the schema after tokenization.
dataset_enc["train"].features



  0%|          | 0/1 [00:00<?, ?ba/s]



{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [None]:
# Peek at the first training example as encoded by the RoBERTa tokenizer.
dataset_enc["train"][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0,
 'input_ids': [0, 37265, 92, 3556, 2485, 31, 5, 20536, 2833, 1437, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Confirm the label column is still a two-class ClassLabel after tokenization.
dataset_enc["train"].features["label"]

ClassLabel(names=['negative', 'positive'], id=None)

In [None]:
# Load the pretrained checkpoint with a 2-class sequence-classification head.
model_v2 = TFAutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2)

# Training pipeline: shuffled, batches of 16.
tf_train_dataset = model_v2.prepare_tf_dataset(
    dataset_enc["train"], batch_size=16, shuffle=True, tokenizer=tokenizer_v2
)

# Validation pipeline: order preserved so predictions line up with the labels.
tf_validation_dataset = model_v2.prepare_tf_dataset(
    dataset_enc["validation"], batch_size=16, shuffle=False, tokenizer=tokenizer_v2
)
# tf_train_dataset and tf_validation_dataset can now be used in model.fit / model.predict.

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Training hyper-parameters for model v2.
# The tf.data training pipeline for this model was built with batch_size=16.
# This cell previously assumed 32, which roughly halved total_train_steps and
# made the decaying learning-rate schedule reach its final value about
# half-way through training.
# NOTE(review): if a batch size of 32 was intended, change it in the
# prepare_tf_dataset call instead; the step count below adapts automatically.
batch_size = 16
num_epochs = 3

# Take the number of steps per epoch from the dataset that is actually passed
# to fit(), so the schedule can never disagree with the real pipeline.
batches_per_epoch = len(tf_train_dataset)
total_train_steps = int(batches_per_epoch * num_epochs)

# Build the optimizer together with its learning-rate schedule (no warm-up).
optimizer, schedule = create_optimizer(
    init_lr=1e-5, num_warmup_steps=0, num_train_steps=total_train_steps
)

# No loss is passed: the model falls back to its internal loss computation.
model_v2.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# Fine-tune model v2 on the SST-2 training pipeline.
model_v2.fit(tf_train_dataset, epochs=num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f84f732c050>

In [None]:
# Evaluate model performace
pred = model_v2.predict(tf_validation_dataset)
pred_labels = np.argmax(pred.logits, axis=-1)
acc = sklearn.metrics.accuracy_score(pred_labels, dataset_enc["validation"]['label'])
print(f"Model accuracy is {acc}")

Model accuracy is 0.9094036697247706


In [None]:
# save model
model_v2.save_pretrained('drive/MyDrive/FIRE/OOD/train_on_sst2/roberta')

In [None]:
# load model
new_model = TFAutoModelForSequenceClassification.from_pretrained('drive/MyDrive/FIRE/OOD/train_on_sst2/roberta')
pred = new_model.predict(tf_validation_dataset)
pred_labels = np.argmax(pred.logits, axis=-1)
acc = sklearn.metrics.accuracy_score(pred_labels, dataset_enc["validation"]['label'])
print(f"Model accuracy is {acc}")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at drive/MyDrive/FIRE/OOD/train_on_sst2/roberta.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Model accuracy is 0.9094036697247706


## Inference Examples

In [None]:
# Two example sentences that are classified with both fine-tuned models below.
sentences = [
    'uneasy mishmash of styles and genres .',
    'director rob marshall went out gunning to make a great one .'
]

In [None]:
# Encode the sentences once per model. Each model needs its own tokenizer's
# output because the two tokenizers map the same text to different token ids.
tokenized_v1 = tokenizer_v1(sentences, return_tensors="np", padding="longest")
tokenized_v2 = tokenizer_v2(sentences, return_tensors="np", padding="longest")

outputs_v1 = model_v1(tokenized_v1).logits
# Fixed: model_v2 was previously fed tokenized_v1 (DistilBERT token ids), so
# its predictions were computed on ids from the wrong vocabulary.
outputs_v2 = model_v2(tokenized_v2).logits

# Predicted class per sentence: 0 = negative, 1 = positive.
classifications_v1 = np.argmax(outputs_v1, axis=1)
print("Prediction with finetuned BERT: ")
print(classifications_v1)

classifications_v2 = np.argmax(outputs_v2, axis=1)
print("Prediction with finetuned RoBERTa: ")
print(classifications_v2)

Prediction with finetuned BERT: 
[0 1]
Prediction with finetuned RoBERTa: 
[1 0]
