# Data downloader


In [1]:
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q
!pip install datasets -q

[K     |████████████████████████████████| 7.6 MB 7.9 MB/s 
[K     |████████████████████████████████| 5.8 MB 7.3 MB/s 
[K     |████████████████████████████████| 182 kB 59.3 MB/s 
[K     |████████████████████████████████| 43 kB 1.4 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 451 kB 7.3 MB/s 
[K     |████████████████████████████████| 132 kB 75.8 MB/s 
[K     |████████████████████████████████| 212 kB 73.8 MB/s 
[K     |████████████████████████████████| 127 kB 77.7 MB/s 
[?25h

In [2]:
from datasets import load_dataset

dataset = load_dataset("trec")

Downloading builder script:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading and preparing dataset trec/default to /root/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5452 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset trec downloaded and prepared to /root/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
dataset["train"][0]

{'text': 'How did serfdom develop in and then leave Russia ?',
 'coarse_label': 2,
 'fine_label': 26}

In [4]:
num_labels = len(dataset["train"].features["coarse_label"].names)

In [5]:
num_labels

6

# Preprocessing

In [6]:
from transformers import AutoTokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased", use_fast = True)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [8]:
tokenizer(dataset["train"][0]["text"])

{'input_ids': [101, 1731, 1225, 14516, 11931, 9277, 3689, 1107, 1105, 1173, 1817, 2733, 136, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation = True)

In [10]:
tokenized_dataset = dataset.map(preprocess_function, batched = True, remove_columns = ["fine_label"])

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Rename the label column to fit the concept

In [11]:
tokenized_dataset = tokenized_dataset.rename_column("coarse_label", "labels")

In [12]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 5452
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})

# Finetuning BERT for text classification

In [13]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [14]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-cased", num_labels = num_labels)

Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.w

In [15]:
batch_size = 16
learning_rate = 2e-5
weight_decay = .01
num_train_epochs = 5

In [16]:
args = TrainingArguments(
    "/saved",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = learning_rate,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_train_epochs,
    weight_decay = weight_decay,
    load_best_model_at_end = True,
    metric_for_best_model = "accuracy",
    # push_to_hub = True
)

In [17]:
from datasets import load_metric
import numpy as np

In [18]:
metric = load_metric("glue", "mnli")

  metric = load_metric("glue", "mnli")


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [19]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis = 1)

    return metric.compute(predictions = predictions, references = labels)

In [20]:
trainer = Trainer(
    model,
    args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["test"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.243678,0.93
2,0.440500,0.130365,0.968
3,0.101800,0.13526,0.972
4,0.101800,0.179389,0.966
5,0.036900,0.162615,0.97


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
Saving model checkpoint to /saved/checkpoint-1364
Configuration saved in /saved/checkpoint-1364/config.json
Model weights saved in /saved/checkpoint-1364/pytorch_model.bin
tokenizer config file saved in /saved/checkpoint-1364/tokenizer_config.json
Special tokens file saved in /saved/checkpoint-1364/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size =

TrainOutput(global_step=1705, training_loss=0.1739215985071624, metrics={'train_runtime': 120.0479, 'train_samples_per_second': 227.076, 'train_steps_per_second': 14.203, 'total_flos': 169213299099120.0, 'train_loss': 0.1739215985071624, 'epoch': 5.0})

# Evaluation

In [22]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 16


{'eval_loss': 0.13525953888893127,
 'eval_accuracy': 0.972,
 'eval_runtime': 0.4549,
 'eval_samples_per_second': 1099.181,
 'eval_steps_per_second': 70.348,
 'epoch': 5.0}

In [23]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])

predictions = np.argmax(predictions, axis = 1)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 500
  Batch size = 16


In [24]:
from sklearn.metrics import classification_report, confusion_matrix

In [25]:
print(classification_report(predictions, labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.90      0.96      0.93        89
           2       1.00      0.97      0.99       142
           3       0.98      0.98      0.98        65
           4       0.96      0.96      0.96        81
           5       0.99      0.98      0.99       114

    accuracy                           0.97       500
   macro avg       0.97      0.98      0.97       500
weighted avg       0.97      0.97      0.97       500



In [26]:
print(confusion_matrix(predictions, labels))

[[  9   0   0   0   0   0]
 [  0  85   0   1   3   0]
 [  0   3 138   0   0   1]
 [  0   1   0  64   0   0]
 [  0   3   0   0  78   0]
 [  0   2   0   0   0 112]]


In [27]:
def print_clarification(idx):
    for sent, pred, label in zip(tokenized_dataset["test"][0:idx]["text"], predictions[0:idx], labels[0:idx]):
        print(sent)
        print(f"Predicted: {pred} - Label: {label}")

In [28]:
print_clarification(10)

How far is it from Denver to Aspen ?
Predicted: 5 - Label: 5
What county is Modesto , California in ?
Predicted: 4 - Label: 4
Who was Galileo ?
Predicted: 3 - Label: 3
What is an atom ?
Predicted: 2 - Label: 2
When did Hawaii become a state ?
Predicted: 5 - Label: 5
How tall is the Sears Building ?
Predicted: 5 - Label: 5
George Bush purchased a small interest in which baseball team ?
Predicted: 3 - Label: 3
What is Australia 's national flower ?
Predicted: 1 - Label: 1
Why does the moon turn orange ?
Predicted: 2 - Label: 2
What is autism ?
Predicted: 2 - Label: 2
