Goals is to reach 92-93% accuracy by following this tutorial:
https://huggingface.co/docs/transformers/en/tasks/sequence_classification

In [3]:
! pip install datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [4]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

In [5]:
imdb = load_dataset("imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [8]:
tokenized_imdb = imdb.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [11]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [12]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
training_args = TrainingArguments(
    output_dir="text_classification_imdb_tut",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2256,0.201079,0.92112


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2256,0.201079,0.92112
2,0.1473,0.231463,0.93224


TrainOutput(global_step=3126, training_loss=0.2050294442887651, metrics={'train_runtime': 3325.0457, 'train_samples_per_second': 15.037, 'train_steps_per_second': 0.94, 'total_flos': 6556904415524352.0, 'train_loss': 0.2050294442887651, 'epoch': 2.0})

In [19]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
trainer.save_model('/content/drive/My Drive/ML_Experiments/trained_models/distillBERT-fine-tuned-IMDB')


In [29]:
# prompt: make predictions on custom inputs using the trained model

from transformers import pipeline
import json

classifier = pipeline("sentiment-analysis",
                      model='/content/drive/My Drive/ML_Experiments/trained_models/distillBERT-fine-tuned-IMDB',
                      device='cuda')

reviews = [
    "This movie was the worst movie I have ever seen. The acting was terrible and the plot was boring.",
    "I loved the movie. The actors were great and the plot was entertaining.",
    "The movie was okay. The acting was good but the plot was boring.",
    "The movie was okay. The acting was horrible but the plot was interesting.",
    "well... meh",
    "need to tell all my friends how hilariously bad it is",
    "4 out of 5",
    "The movie was awful. I had a great time discussing how hilarious all the decisions were with my friends. We had a blast!",
    "Sometimes you've got to think what is going on in the heads of writers. This was quite a departure from orthodox movies. But surprisingly we sat through all of it.",
    "I've been waiting for a movie like this for months. This is a brilliant film with a fantastic cast. The music was perfect and the story was just amazing."
]

results = classifier(reviews)
for i in range(len(reviews)):
  results[i]['review'] = reviews[i]
print (json.dumps(results, indent=4))


[
    {
        "label": "NEGATIVE",
        "score": 0.9926199316978455,
        "review": "This movie was the worst movie I have ever seen. The acting was terrible and the plot was boring."
    },
    {
        "label": "POSITIVE",
        "score": 0.9914090037345886,
        "review": "I loved the movie. The actors were great and the plot was entertaining."
    },
    {
        "label": "NEGATIVE",
        "score": 0.9740141034126282,
        "review": "The movie was okay. The acting was good but the plot was boring."
    },
    {
        "label": "NEGATIVE",
        "score": 0.8696280717849731,
        "review": "The movie was okay. The acting was horrible but the plot was interesting."
    },
    {
        "label": "NEGATIVE",
        "score": 0.6881898045539856,
        "review": "well... meh"
    },
    {
        "label": "NEGATIVE",
        "score": 0.9738196134567261,
        "review": "need to tell all my friends how hilariously bad it is"
    },
    {
        "label": "POSIT

In [30]:
# prompt: show confusion matrix on test data and 10 samples from each cell of confusion matrix

from sklearn.metrics import confusion_matrix
import pandas as pd

# Get predictions for the test dataset
predictions = trainer.predict(tokenized_imdb["test"])

# Convert logits to predictions
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Get true labels
true_labels = predictions.label_ids

# Calculate confusion matrix
cm = confusion_matrix(true_labels, predicted_labels)

# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(cm, index = ["NEGATIVE", "POSITIVE"],
                  columns = ["NEGATIVE", "POSITIVE"])

print("Confusion Matrix:")
print(cm_df)

# Get 10 samples from each cell of the confusion matrix
def get_samples(true_label, predicted_label, n=10):
  indices = np.where((true_labels == true_label) & (predicted_labels == predicted_label))[0]
  samples = np.random.choice(indices, size=min(n, len(indices)), replace=False)
  return [tokenized_imdb["test"][i]["text"] for i in samples]

print("\nSamples from Confusion Matrix:")
for true_label in range(2):
  for predicted_label in range(2):
    print(f"\nTrue: {id2label[true_label]}, Predicted: {id2label[predicted_label]}")
    samples = get_samples(true_label, predicted_label)
    for sample in samples:
      print(sample)


Confusion Matrix:
          NEGATIVE  POSITIVE
NEGATIVE     11950       550
POSITIVE      1422     11078

Samples from Confusion Matrix:

True: NEGATIVE, Predicted: NEGATIVE


TypeError: Wrong key type: '10880' of type '<class 'numpy.int64'>'. Expected one of int, slice, range, str or Iterable.

In [38]:
# Get 10 samples from each cell of the confusion matrix
def get_samples(true_label, predicted_label, n=10):
  indices = np.where((true_labels == true_label) & (predicted_labels == predicted_label))[0]
  samples = np.random.choice(indices, size=min(n, len(indices)), replace=False)
  return [(predictions.predictions[int(i)], tokenized_imdb["test"][int(i)]["text"]) for i in samples]

print("\nSamples from Confusion Matrix:")
for true_label in range(2):
  for predicted_label in range(2):
    print(f"\nTrue: {id2label[true_label]}, Predicted: {id2label[predicted_label]}")
    samples = get_samples(true_label, predicted_label)
    for sample in samples:
      print(sample)


Samples from Confusion Matrix:

True: NEGATIVE, Predicted: NEGATIVE
(array([ 2.5648267, -2.4171383], dtype=float32), "Im sorry to myself, you know why. I feel pained from the viewing of this movie. I went to the theater with some friends to see this movie, and still did not give it the satisfaction of watching it in entirety ( i left with about 20 minutes left... hoping to god it might make me at least comfortable for a moment. ) most movies now, even this bad ones... when i watch them, there may be a small part in the movie where I feel some joy at times because of maybe a quirky joke or a good line... this movie on the other hand made me feel uncomfortable and mad at myself the whole time, especially since i wasted money on it. It was poorly written, poorly directed, poorly shot, and definitely poorly acted...<br /><br />please, for the good of humanity, do not see this movie, even if your some guy who wants to say he has seen like every movie ever... just don't...")
(array([ 1.9039

In [37]:
predictions.predictions[3]

array([ 2.2412684, -2.035056 ], dtype=float32)