# DistilBERT Classifier as Feature Extractor

In this feature-based approach, we use the embeddings produced by a pretrained transformer as input features to train a random forest and a logistic regression model in scikit-learn:

![](figures/feature-extractor.jpeg)

In [1]:
# pip install transformers datasets

In [2]:
# conda install scikit-learn --yes

In [3]:
# Record the environment used for this run: `--conda` reports the active conda
# environment and `-p` lists the versions of the named packages.
%load_ext watermark
%watermark --conda -p torch,transformers,datasets,sklearn

torch       : 1.12.0
transformers: 4.9.1
datasets    : 2.6.1
sklearn     : 1.0.2

conda environment: base



In [4]:
import torch

# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


# 1 Loading the Dataset

The IMDB movie review dataset consists of 50k movie reviews, each with a sentiment label (0: negative, 1: positive).

## 1a) Load from `datasets` Hub

In [5]:
from datasets import list_datasets, load_dataset

In [6]:
# list_datasets()

In [7]:
# Load the IMDB dataset from the Hugging Face `datasets` hub (reused from the
# local cache when already present) and print the splits it contains.
imdb_data = load_dataset("imdb")
print(imdb_data)

Found cached dataset imdb (/home/raschka/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 573.54it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})





In [8]:
# Peek at one training example (index 99): a dict with the review `text` and
# its integer `label`.
imdb_data["train"][99]

{'text': "This film is terrible. You don't really need to read this review further. If you are planning on watching it, suffice to say - don't (unless you are studying how not to make a good movie).<br /><br />The acting is horrendous... serious amateur hour. Throughout the movie I thought that it was interesting that they found someone who speaks and looks like Michael Madsen, only to find out that it is actually him! A new low even for him!!<br /><br />The plot is terrible. People who claim that it is original or good have probably never seen a decent movie before. Even by the standard of Hollywood action flicks, this is a terrible movie.<br /><br />Don't watch it!!! Go for a jog instead - at least you won't feel like killing yourself.",
 'label': 0}

## 1b) Load from local directory

The IMDB movie review set can be downloaded from http://ai.stanford.edu/~amaas/data/sentiment/. After downloading the dataset, decompress the files.

A) If you are working with Linux or macOS, open a new terminal window, `cd` into the download directory, and execute

    tar -zxf aclImdb_v1.tar.gz

B) If you are working with Windows, download an archiver such as 7Zip to extract the files from the download archive.

C) Use the following code to download and unzip the dataset via Python

**Download the movie reviews**

In [9]:
import os
import sys
import tarfile
import time
import urllib.request

# Download URL of the compressed IMDB archive and the local file name it is
# saved under.
source = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
target = "aclImdb_v1.tar.gz"

# Any archive already present is deleted up front.
# NOTE(review): because of this, the `os.path.isfile` check in the download
# condition further down is never true when that line runs -- presumably the
# removal guards against a partially downloaded file; confirm before treating
# the archive as a cache.
if os.path.exists(target):
    os.remove(target)


def reporthook(count, block_size, total_size):
    """Progress callback for ``urllib.request.urlretrieve``.

    Writes a single, carriage-return-refreshed status line to stdout with the
    percentage completed, the amount transferred, the average speed, and the
    elapsed time.

    Parameters
    ----------
    count : int
        Number of blocks transferred so far. The call with ``count == 0``
        only starts the timer (stored in the module-level ``start_time``).
    block_size : int
        Size of one block in bytes.
    total_size : int
        Total size of the download in bytes. A value ``<= 0`` means the size
        is unknown; the percentage is then shown as ``?%``.
    """
    global start_time
    if count == 0:
        start_time = time.time()
        return

    duration = time.time() - start_time
    progress_size = int(count * block_size)

    if total_size > 0:
        # The last block is usually only partially filled, so
        # count * block_size overshoots the real size; clamp so the display
        # never exceeds 100%.
        progress_size = min(progress_size, total_size)
        percent_text = f"{int(progress_size * 100.0 / total_size)}%"
    else:
        # Unknown total size: avoid a division by zero / negative percentage.
        percent_text = "?%"

    # The clock may not have advanced between two calls; avoid dividing by 0.
    speed = progress_size / (1024.0**2 * duration) if duration > 0 else 0.0

    sys.stdout.write(
        f"\r{percent_text} | {progress_size / (1024.**2):.2f} MB "
        f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
    )
    sys.stdout.flush()


# Fetch the archive only when neither the extracted `aclImdb` directory nor
# the archive file is present; `reporthook` prints the download progress.
if not os.path.isdir("aclImdb") and not os.path.isfile("aclImdb_v1.tar.gz"):
    urllib.request.urlretrieve(source, target, reporthook)

In [10]:
# Unpack the archive unless the extracted `aclImdb` directory already exists.
if not os.path.isdir("aclImdb"):

    with tarfile.open(target, "r:gz") as tar:
        # The archive comes from the network, so use the "data" extraction
        # filter (rejects absolute paths, `..` traversal, special files, ...)
        # whenever this Python version provides it. Older interpreters without
        # extraction filters fall back to the plain call.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(filter="data")
        else:
            tar.extractall()

**Convert them to a pandas DataFrame and save them as CSV**

In [11]:
import os
import sys

import numpy as np
import pandas as pd
from packaging import version
from tqdm import tqdm

# change the `basepath` to the directory of the
# unzipped movie dataset

basepath = "aclImdb"

labels = {"pos": 1, "neg": 0}

# Collect one (text, label) record per review and build the DataFrame once at
# the end. Growing a DataFrame row by row with `pd.concat` / `DataFrame.append`
# is quadratic, and `pd.concat(..., ignore_index=False)` gave every row the
# index label 0, which breaks a later shuffle that reindexes by `df.index`.
# Building from a list yields a unique 0..N-1 index on every pandas version.
records = []

with tqdm(total=50000) as pbar:
    for split in ("test", "train"):
        for sentiment in ("pos", "neg"):
            path = os.path.join(basepath, split, sentiment)
            # Sorted so the row order is deterministic across file systems.
            for file in sorted(os.listdir(path)):
                with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                    records.append((infile.read(), labels[sentiment]))
                pbar.update()

df = pd.DataFrame(records, columns=["text", "label"])

100%|███████████████████████████████████████████| 50000/50000 [00:55<00:00, 900.54it/s]


In [12]:
import numpy as np

# Shuffle the rows reproducibly: seed NumPy's global RNG, draw a permutation
# of the index labels, and reorder the frame to follow it.
np.random.seed(0)
shuffled_labels = np.random.permutation(df.index)
df = df.reindex(shuffled_labels)

**Basic dataset analysis and sanity checks**

In [13]:
# Count how many reviews carry each label value
# (position 0: negative, position 1: positive).
print("Class distribution:")
np.bincount(df["label"].values)

Class distribution:


array([25000, 25000])

In [14]:
# Whitespace-delimited word count of every review, then the shortest, median,
# and longest review length.
reviews = df["text"]
text_len = reviews.apply(lambda review: len(review.split()))
(text_len.min(), text_len.median(), text_len.max())

(4, 173.0, 2470)

**Split data into training, validation, and test sets**

In [15]:
# Positional boundaries of the 35k / 5k / 10k train / validation / test split.
train_end, val_end = 35_000, 40_000

df_train = df.iloc[:train_end]
df_val = df.iloc[train_end:val_end]
df_test = df.iloc[val_end:]

# Persist each subset as a UTF-8 CSV file without the index column.
for subset, csv_name in (
    (df_train, "train.csv"),
    (df_val, "validation.csv"),
    (df_test, "test.csv"),
):
    subset.to_csv(csv_name, index=False, encoding="utf-8")

**Load the dataset via `load_dataset`**

In [16]:
# Map each split name to the CSV file that holds it.
csv_splits = {
    "train": "train.csv",
    "validation": "validation.csv",
    "test": "test.csv",
}

# Read the three CSV files into one DatasetDict and show its layout.
imdb_dataset = load_dataset("csv", data_files=csv_splits)
print(imdb_dataset)

Using custom data configuration default-0d0a861d0393656c


Downloading and preparing dataset csv/default to /home/raschka/.cache/huggingface/datasets/csv/default-0d0a861d0393656c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|██████████████████████████| 3/3 [00:00<00:00, 8799.24it/s]
Extracting data files: 100%|███████████████████████████| 3/3 [00:00<00:00, 1311.68it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
  self.handle.detach()
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
  self.handle.detach()
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
  self.handle.detach()
                                

Dataset csv downloaded and prepared to /home/raschka/.cache/huggingface/datasets/csv/default-0d0a861d0393656c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 753.24it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
})





# 2 Tokenization and Numericalization

In [17]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the `distilbert-base-uncased` checkpoint
# and report its maximum input length and vocabulary size.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

Tokenizer input max length: 512
Tokenizer vocabulary size: 30522


In [21]:
def tokenize_text(batch):
    """Tokenize the ``"text"`` entries of a batch.

    The texts are passed to the module-level ``tokenizer`` with truncation
    and padding enabled, and the tokenizer's output is returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [23]:
# Tokenize every split. With `batch_size=None`, `datasets` hands each split to
# `tokenize_text` as one single batch.
# NOTE(review): with `padding=True` this presumably pads each split to its own
# longest (truncated) sequence -- confirm if the splits must share one length.
imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

  0%|                                                            | 0/1 [00:10<?, ?ba/s]
  0%|                                                            | 0/1 [00:01<?, ?ba/s]
  0%|                                                            | 0/1 [00:02<?, ?ba/s]


In [28]:
# The following cells work on `imdb_tokenized`; drop the reference to the
# raw-text dataset so its memory can be reclaimed.
del imdb_dataset

# 3 Using DistilBERT as a Feature Extractor

In [29]:
from transformers import AutoModel

# Load the pretrained DistilBERT base model and move it to the selected
# device. The trailing `;` suppresses the notebook's repr of the model.
model = AutoModel.from_pretrained("distilbert-base-uncased")
model.to(device);

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
# Have the tokenized dataset return PyTorch tensors for these three columns.
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [31]:
# Smoke test: push the first three training examples through the model and
# inspect the shape of the final hidden states.
first_three = imdb_tokenized["train"][:3]
test_batch = {
    name: first_three[name].to(device)
    for name in ("attention_mask", "input_ids")
}

# No gradients are needed for feature extraction.
with torch.inference_mode():
    test_output = model(**test_batch)

test_output.last_hidden_state.shape

torch.Size([3, 512, 768])

In [32]:
# Keep only position 0 of every sequence (the [CLS] token, per the variable
# name), reducing (batch, seq_len, hidden) to (batch, hidden).
cls_token_output = test_output.last_hidden_state[:, 0]
cls_token_output.shape

torch.Size([3, 768])

In [35]:
def get_output_embeddings(batch):
    """Compute the position-0 hidden state for every example in a batch.

    All entries of ``batch`` except ``"label"`` are moved to ``device`` and
    passed to the module-level ``model`` as keyword arguments. The hidden
    state at sequence position 0 of the last layer is returned as a NumPy
    array under the key ``"features"``.
    """
    model_inputs = {}
    for name, values in batch.items():
        if name == "label":
            # The label is the prediction target, not a model input.
            continue
        model_inputs[name] = values.to(device)

    with torch.inference_mode():
        hidden_states = model(**model_inputs).last_hidden_state
        first_position = hidden_states[:, 0]

    return {"features": first_position.cpu().numpy()}

In [36]:
# Run the feature extractor over every split in mini-batches of 10 examples;
# the returned `features` entries are stored alongside the existing columns.
imdb_features = imdb_tokenized.map(get_output_embeddings, batched=True, batch_size=10)

100%|█████████████████████████████████████████████▉| 3499/3500 [05:35<00:00, 10.44ba/s]
100%|███████████████████████████████████████████████▉| 499/500 [00:48<00:00, 10.37ba/s]
100%|██████████████████████████████████████████████▉| 999/1000 [01:36<00:00, 10.38ba/s]


In [38]:
# Pull the extracted features (X) and labels (y) of each split out of the
# dataset as NumPy arrays for scikit-learn.
train_split = imdb_features["train"]
val_split = imdb_features["validation"]
test_split = imdb_features["test"]

X_train, y_train = np.array(train_split["features"]), np.array(train_split["label"])
X_val, y_val = np.array(val_split["features"]), np.array(val_split["label"])
X_test, y_test = np.array(test_split["features"]), np.array(test_split["label"])

# 4 Train Model on Embeddings (Extracted Features)

In [43]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that its bootstrap samples and per-split feature subsets
# -- and therefore the accuracies printed below -- are identical on every
# Restart & Run All.
rf = RandomForestClassifier(random_state=123)
rf.fit(X_train, y_train)

# `score` returns the mean accuracy on the given split.
print("Training accuracy", rf.score(X_train, y_train))
print("Validation accuracy", rf.score(X_val, y_val))
print("test accuracy", rf.score(X_test, y_test))

Training accuracy 1.0
Validation accuracy 0.7876
test accuracy 0.6623


In [46]:
from sklearn.linear_model import LogisticRegression

# Use a dedicated name for the logistic regression model so it does not
# silently overwrite the random forest stored in `rf`.
# `max_iter=1000` raises the solver's iteration limit.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# `score` returns the mean accuracy on the given split.
print("Training accuracy", lr.score(X_train, y_train))
print("Validation accuracy", lr.score(X_val, y_val))
print("test accuracy", lr.score(X_test, y_test))

Training accuracy 0.8931714285714286
Validation accuracy 0.866
test accuracy 0.8291
