In [1]:
!pip install transformers
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          AutoConfig,
                          TrainingArguments,
                          Trainer,
                          DataCollatorWithPadding)
import torch
from torch.utils.data import Dataset
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('bbc-text.csv')
y = df.pop('category')

In [4]:
# create dictionaries with id to label and label to id mappings which are going
# to be used with the classification model
id2label = dict(enumerate(y.unique()))
label2id = {v: k for (k,v) in id2label.items()}

In [5]:
# encode class labels
y = y.map(label2id)

In [6]:
# create stratified train, validation and test datasets
text_train, text_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)
text_val, text_test, y_val, y_test = train_test_split(
    text_test,
    y_test,
    test_size=0.5,
    stratify=y_test,
    random_state=42
)

In [7]:
# load tokenizer of pretrained model
# here we will use BERT, but we can use any model
model_name = 'bert-base-uncased'
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
class TextDataset(Dataset):
    def __init__(self, text, labels, tokenizer, config):
        self.tokenized_txt = tokenizer(
            text,
            max_length=config.max_length,
            truncation=True,
            padding=True
        )
        self.labels = labels

    def __len__(self):
      return len(self.labels)

    def __getitem__(self, idx):
      item = {
          key: torch.tensor(val[idx]) for key, val in self.tokenized_txt.items()
      }
      item['labels'] = torch.tensor(self.labels[idx])
      return item


In [9]:
ds_train = TextDataset(
    text_train['text'].to_list(),
    y_train.to_list(),
    tokenizer,
    config
)
ds_val = TextDataset(
    text_val['text'].to_list(),
    y_val.to_list(),
    tokenizer,
    config
)
ds_test = TextDataset(
    text_test['text'].to_list(),
    y_test.to_list(),
    tokenizer,
    config
)

In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1_result = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    balanced_acc = balanced_accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'balanced_accuracy': balanced_acc,
        'f1_score': f1_result
        }

In [11]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
# load model for classification
num_labels = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# create a dir to store model checkpoints
my_dir = 'model_checkpoints'
os.mkdir(my_dir)

In [14]:
# define training arguments
training_args = TrainingArguments(
    output_dir=my_dir,
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [16]:
# train the model
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Balanced Accuracy,F1 Score
1,No log,0.13166,0.981982,0.982132,0.981994
2,No log,0.125488,0.959459,0.957269,0.959809
3,No log,0.057312,0.986486,0.985,0.986339
4,No log,0.044187,0.981982,0.983235,0.98185
5,No log,0.05316,0.981982,0.983235,0.98185


TrainOutput(global_step=280, training_loss=0.19474390574863978, metrics={'train_runtime': 73.0779, 'train_samples_per_second': 121.788, 'train_steps_per_second': 3.832, 'total_flos': 91474666716000.0, 'train_loss': 0.19474390574863978, 'epoch': 5.0})

In [17]:
# evaluate model performance
trainer.evaluate(ds_test)

{'eval_loss': 0.15638288855552673,
 'eval_accuracy': 0.9596412556053812,
 'eval_balanced_accuracy': 0.959295410471881,
 'eval_f1_score': 0.9596794226742349,
 'eval_runtime': 0.3442,
 'eval_samples_per_second': 647.856,
 'eval_steps_per_second': 20.336,
 'epoch': 5.0}