In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
model_path = "/gdrive/MyDrive/PPNCKH/saved/SST2/DistilBERT"

# Data downloader


In [3]:
!pip install tokenizers -q
!pip install transformers -q
!pip install datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# Random
from transformers import set_seed
set_seed(42)

In [5]:
from datasets import load_dataset
from datasets import DatasetDict

train_dataset = load_dataset("sst2", split = "train")
test_dataset = load_dataset("sst2", split = "validation")

Downloading builder script:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.06k [00:00<?, ?B/s]

Downloading and preparing dataset sst2/default to /root/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset sst2 downloaded and prepared to /root/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5. Subsequent calls will reuse this data.




Split the train into train and dev

In [6]:
train, dev = train_dataset.train_test_split(
    test_size = .3,
    seed = 42,
    stratify_by_column = "label"
).values()

In [7]:
train

Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 47144
})

In [8]:
dev

Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 20205
})

In [9]:
test_dataset

Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 872
})

In [10]:
dataset = DatasetDict({
    "train" : train,
    "dev" : dev,
    "test" : test_dataset
})

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 47144
    })
    dev: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 20205
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
})

In [12]:
num_labels = len(dataset["train"].features["label"].names)

In [13]:
num_labels

2

# Preprocessing

In [14]:
from transformers import AutoTokenizer

In [15]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased", use_fast = True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [16]:
tokenizer(dataset["train"][0]["sentence"])

{'input_ids': [101, 1893, 1183, 1105, 4888, 1183, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [17]:
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation = True)

In [18]:
tokenized_dataset = dataset.map(preprocess_function, batched = True)

Map:   0%|          | 0/47144 [00:00<?, ? examples/s]

Map:   0%|          | 0/20205 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Rename the label column to fit the concept

In [19]:
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [20]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 47144
    })
    dev: Dataset({
        features: ['idx', 'sentence', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 20205
    })
    test: Dataset({
        features: ['idx', 'sentence', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
})

# Finetuning DistilBERT for text classification

In [21]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [22]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-cased", num_labels = num_labels)

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bia

In [23]:
batch_size = 16
learning_rate = 2e-5
weight_decay = .01
num_train_epochs = 10

In [24]:
args = TrainingArguments(
    "/saved",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = learning_rate,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_train_epochs,
    weight_decay = weight_decay,
    load_best_model_at_end = True,
    metric_for_best_model = "accuracy",
)

In [25]:
from datasets import load_metric
import numpy as np

In [26]:
metric = load_metric("glue", "mnli")

  metric = load_metric("glue", "mnli")


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [27]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis = 1)

    return metric.compute(predictions = predictions, references = labels)

In [28]:
trainer = Trainer(
    model,
    args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["dev"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

In [29]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2207,0.186214,0.93754
2,0.1528,0.187133,0.944667


KeyboardInterrupt: ignored

In [30]:
trainer.save_model(model_path)

# Evaluation

In [31]:
import torch

In [32]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [33]:
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = num_labels).to(device)

In [34]:
trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2207,0.186214,0.93754
2,0.1528,0.187133,0.944667
2,0.1528,0.198887,0.944568


{'eval_loss': 0.19888655841350555, 'eval_accuracy': 0.9445681761940113}

In [35]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])

predictions = np.argmax(predictions, axis = 1)

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2207,0.186214,0.93754
2,0.1528,0.187133,0.944667
2,0.1528,0.198887,0.944568


In [36]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

In [37]:
print(classification_report(predictions, labels))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       407
           1       0.93      0.89      0.91       465

    accuracy                           0.90       872
   macro avg       0.90      0.91      0.90       872
weighted avg       0.91      0.90      0.90       872



In [38]:
print(f"Accuracy = {accuracy_score(predictions, labels):.6f}, \
        Precision = {precision_score(predictions, labels, average = 'macro'):.6f}, \
        Recall = {recall_score(predictions, labels, average = 'macro'):.6f}, \
        F1 = {f1_score(predictions, labels, average = 'macro'):.6f}")

Accuracy = 0.904817,         Precision = 0.904342,         Recall = 0.906002,         F1 = 0.904645


In [39]:
print(confusion_matrix(predictions, labels))

[[376  31]
 [ 52 413]]


In [40]:
def print_clarification(idx):
    for sent, pred, label in zip(tokenized_dataset["test"][0:idx]["sentence"], predictions[0:idx], labels[0:idx]):
        print(sent)
        print(f"Predicted: {pred} - Label: {label}")

In [41]:
print_clarification(10)

it 's a charming and often affecting journey . 
Predicted: 1 - Label: 1
unflinchingly bleak and desperate 
Predicted: 0 - Label: 0
allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . 
Predicted: 1 - Label: 1
the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . 
Predicted: 1 - Label: 1
it 's slow -- very , very slow . 
Predicted: 0 - Label: 0
although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women . 
Predicted: 1 - Label: 1
a sometimes tedious film . 
Predicted: 0 - Label: 0
or doing last year 's taxes with your ex-wife . 
Predicted: 0 - Label: 0
you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance . 
Predicted: 1 - Label: 1
in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank from quirky to jerky to utter tur

In [42]:
for i in range(len(predictions)):
    if predictions[i] != labels[i]:
        print(tokenized_dataset["test"][i]["sentence"])
        print(f"Predicted: {predictions[i]} - Label: {labels[i]}")

we root for ( clara and paul ) , even like them , though perhaps it 's an emotion closer to pity . 
Predicted: 0 - Label: 1
the script kicks in , and mr. hartley 's distended pace and foot-dragging rhythms follow . 
Predicted: 1 - Label: 0
though it 's become almost redundant to say so , major kudos go to leigh for actually casting people who look working-class . 
Predicted: 0 - Label: 1
you wo n't like roger , but you will quickly recognize him . 
Predicted: 1 - Label: 0
this riveting world war ii moral suspense story deals with the shadow side of american culture : racial prejudice in its ugly and diverse forms . 
Predicted: 1 - Label: 0
it 's one of those baseball pictures where the hero is stoic , the wife is patient , the kids are as cute as all get-out and the odds against success are long enough to intimidate , but short enough to make a dream seem possible . 
Predicted: 0 - Label: 1
sam mendes has become valedictorian at the school for soft landings and easy ways out . 
Predict

Inference time measuring

In [43]:
def predict(text, model):
    tokenized = tokenizer(text, return_tensors = "pt").to(device)
    output = model(**tokenized)[0].detach().cpu().numpy()
    
    prediction = np.argmax(output, axis = 1)[0]

    return prediction

In [44]:
predict("How far is it from Denver to Aspen ?", model)

0

In [45]:
from time import perf_counter

In [46]:
from tqdm import tqdm_notebook

In [47]:
predicted_timelapse = []

for text in tqdm_notebook(tokenized_dataset["test"]["sentence"]):
    start_time = perf_counter()
    predict(text = text, model = model)
    end_time = perf_counter()

    predicted_timelapse.append(end_time - start_time)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text in tqdm_notebook(tokenized_dataset["test"]["sentence"]):


  0%|          | 0/872 [00:00<?, ?it/s]

In [48]:
timelapse_mean = np.mean(predicted_timelapse) * 1000
timelapse_std = np.std(predicted_timelapse) * 1000

In [49]:
print(f"Mean: {timelapse_mean:.8f}, stdev: {timelapse_std:.8f} ms")

Mean: 9.51957944, stdev: 6.76188823 ms


Count parameters

In [50]:
from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params+=params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params

In [51]:
count_parameters(model)

+---------------------------------------------------------+------------+
|                         Modules                         | Parameters |
+---------------------------------------------------------+------------+
|       distilbert.embeddings.word_embeddings.weight      |  22268928  |
|     distilbert.embeddings.position_embeddings.weight    |   393216   |
|          distilbert.embeddings.LayerNorm.weight         |    768     |
|           distilbert.embeddings.LayerNorm.bias          |    768     |
|  distilbert.transformer.layer.0.attention.q_lin.weight  |   589824   |
|   distilbert.transformer.layer.0.attention.q_lin.bias   |    768     |
|  distilbert.transformer.layer.0.attention.k_lin.weight  |   589824   |
|   distilbert.transformer.layer.0.attention.k_lin.bias   |    768     |
|  distilbert.transformer.layer.0.attention.v_lin.weight  |   589824   |
|   distilbert.transformer.layer.0.attention.v_lin.bias   |    768     |
| distilbert.transformer.layer.0.attention.out_lin.

65783042