# Data downloader


In [1]:
!pip install tokenizers -q
!pip install transformers -q
!pip install datasets -q

[K     |████████████████████████████████| 7.6 MB 23.5 MB/s 
[K     |████████████████████████████████| 5.8 MB 10.0 MB/s 
[K     |████████████████████████████████| 182 kB 24.4 MB/s 
[K     |████████████████████████████████| 451 kB 34.2 MB/s 
[K     |████████████████████████████████| 132 kB 73.7 MB/s 
[K     |████████████████████████████████| 212 kB 58.2 MB/s 
[K     |████████████████████████████████| 127 kB 76.5 MB/s 
[?25h

In [2]:
from datasets import load_dataset
from datasets import DatasetDict

train_dataset = load_dataset("trec", split = "train")
test_dataset = load_dataset("trec", split = "test")

Downloading builder script:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading and preparing dataset trec/default to /root/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5452 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset trec downloaded and prepared to /root/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2. Subsequent calls will reuse this data.




Split the train into train and dev

In [3]:
train, dev = train_dataset.train_test_split(
    test_size = .2,
    seed = 42,
    stratify_by_column = "coarse_label"
).values()

In [4]:
train

Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 4361
})

In [5]:
dev

Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 1091
})

In [6]:
test_dataset

Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 500
})

In [7]:
dataset = DatasetDict({
    "train" : train,
    "dev" : dev,
    "test" : test_dataset
})

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'coarse_label', 'fine_label'],
        num_rows: 4361
    })
    dev: Dataset({
        features: ['text', 'coarse_label', 'fine_label'],
        num_rows: 1091
    })
    test: Dataset({
        features: ['text', 'coarse_label', 'fine_label'],
        num_rows: 500
    })
})

In [9]:
num_labels = len(dataset["train"].features["coarse_label"].names)

In [10]:
num_labels

6

# Preprocessing

In [11]:
from transformers import AutoTokenizer

In [12]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased", use_fast = True)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [13]:
tokenizer(dataset["train"][0]["text"])

{'input_ids': [101, 10208, 1103, 2369, 1115, 1110, 16448, 1118, 170, 4354, 1761, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation = True)

In [15]:
tokenized_dataset = dataset.map(preprocess_function, batched = True, remove_columns = ["fine_label"])

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Rename the label column to fit the concept

In [16]:
tokenized_dataset = tokenized_dataset.rename_column("coarse_label", "labels")

In [17]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 4361
    })
    dev: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1091
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})

# Finetuning BERT for text classification

In [18]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [19]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-cased", num_labels = num_labels)

Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier

In [20]:
batch_size = 16
learning_rate = 2e-5
weight_decay = .01
num_train_epochs = 5

In [21]:
args = TrainingArguments(
    "/saved",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = learning_rate,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_train_epochs,
    weight_decay = weight_decay,
    load_best_model_at_end = True,
    metric_for_best_model = "accuracy",
    # push_to_hub = True
)

In [22]:
from datasets import load_metric
import numpy as np

In [23]:
metric = load_metric("glue", "mnli")

  metric = load_metric("glue", "mnli")


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [24]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis = 1)

    return metric.compute(predictions = predictions, references = labels)

In [25]:
trainer = Trainer(
    model,
    args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["dev"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

In [26]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.30832,0.901008
2,0.442200,0.245364,0.937672
3,0.442200,0.262752,0.941338
4,0.067400,0.273795,0.942255
5,0.067400,0.284069,0.942255


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1091
  Batch size = 16
Saving model checkpoint to /saved/checkpoint-1365
Configuration saved in /saved/checkpoint-1365/config.json
Model weights saved in /saved/checkpoint-1365/pytorch_model.bin
tokenizer config file saved in /saved/checkpoint-1365/tokenizer_config.json
Special tokens file saved in /saved/checkpoint-1365/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /saved/checkpoint-1092 (score: 0.9422548120989918).


TrainOutput(global_step=1365, training_loss=0.19402435638092377, metrics={'train_runtime': 102.3049, 'train_samples_per_second': 213.137, 'train_steps_per_second': 13.342, 'total_flos': 134604498757212.0, 'train_loss': 0.19402435638092377, 'epoch': 5.0})

# Evaluation

In [27]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1091
  Batch size = 16


{'eval_loss': 0.2737949788570404,
 'eval_accuracy': 0.9422548120989918,
 'eval_runtime': 1.0376,
 'eval_samples_per_second': 1051.428,
 'eval_steps_per_second': 66.497,
 'epoch': 5.0}

In [28]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])

predictions = np.argmax(predictions, axis = 1)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 500
  Batch size = 16


In [29]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

In [30]:
print(classification_report(predictions, labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.89      0.98      0.93        86
           2       1.00      0.97      0.99       142
           3       0.98      0.97      0.98        66
           4       0.99      0.96      0.98        83
           5       0.99      0.98      0.99       114

    accuracy                           0.97       500
   macro avg       0.98      0.98      0.98       500
weighted avg       0.98      0.97      0.97       500



In [31]:
print(f"Accuracy = {accuracy_score(predictions, labels):.6f}, \
        Precision = {precision_score(predictions, labels, average = 'macro'):.6f}, \
        Recall = {recall_score(predictions, labels, average = 'macro'):.6f}, \
        F1 = {f1_score(predictions, labels, average = 'macro'):.6f}")

Accuracy = 0.974000,         Precision = 0.976173,         Recall = 0.977431,         F1 = 0.976423


In [32]:
print(confusion_matrix(predictions, labels))

[[  9   0   0   0   0   0]
 [  0  84   0   1   1   0]
 [  0   3 138   0   0   1]
 [  0   2   0  64   0   0]
 [  0   3   0   0  80   0]
 [  0   2   0   0   0 112]]


In [33]:
def print_clarification(idx):
    for sent, pred, label in zip(tokenized_dataset["test"][0:idx]["text"], predictions[0:idx], labels[0:idx]):
        print(sent)
        print(f"Predicted: {pred} - Label: {label}")

In [34]:
print_clarification(10)

How far is it from Denver to Aspen ?
Predicted: 5 - Label: 5
What county is Modesto , California in ?
Predicted: 4 - Label: 4
Who was Galileo ?
Predicted: 3 - Label: 3
What is an atom ?
Predicted: 2 - Label: 2
When did Hawaii become a state ?
Predicted: 5 - Label: 5
How tall is the Sears Building ?
Predicted: 5 - Label: 5
George Bush purchased a small interest in which baseball team ?
Predicted: 3 - Label: 3
What is Australia 's national flower ?
Predicted: 1 - Label: 1
Why does the moon turn orange ?
Predicted: 2 - Label: 2
What is autism ?
Predicted: 2 - Label: 2


In [35]:
for i in range(len(predictions)):
    if predictions[i] != labels[i]:
        print(tokenized_dataset["test"][i]["text"])
        print(f"Predicted: {predictions[i]} - Label: {labels[i]}")

What imaginary line is halfway between the North and South Poles ?
Predicted: 1 - Label: 4
What is the speed hummingbirds fly ?
Predicted: 2 - Label: 5
The U.S. Department of Treasury first issued paper currency for the U.S. during which war ?
Predicted: 3 - Label: 1
What is the longest major league baseball-winning streak ?
Predicted: 5 - Label: 1
What is the major fault line near Kentucky ?
Predicted: 4 - Label: 1
What is the sales tax in Minnesota ?
Predicted: 5 - Label: 1
What are the spots on dominoes called ?
Predicted: 4 - Label: 1
What is the electrical output in Madrid , Spain ?
Predicted: 2 - Label: 1
What are the two houses of the Legislative branch ?
Predicted: 3 - Label: 1
What is the criterion for being legally blind ?
Predicted: 2 - Label: 1
What is foot and mouth disease ?
Predicted: 2 - Label: 1
What monastery was raided by Vikings in the late eighth century ?
Predicted: 4 - Label: 1
What did Jesse Jackson organize ?
Predicted: 1 - Label: 3
