Running multiclass text classification using BERT. I will switch from running in colab to a server as the project continues.

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
! pip install transformers

In [25]:
#BERT implementation based on https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb

import numpy as np
from random import shuffle
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

MODEL = 'bert-base-uncased'
BATCH_SIZE = 16
args = TrainingArguments(
    f"{MODEL}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='logs'
)


class Data(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item


def train_test_val(text):
    cat_map = {'Unbalanced_power_relations':0, 'Shallow_solution':1, 
               'Presupposition':2, 'Authority_voice':3, 'Metaphors':4,
               'Compassion':5, 'The_poorer_the_merrier':6}
    
    data = text.split('\n')[4:-1]
    shuffle(data)
    
    X = []
    y = []
    for line in data:
        columns = line.split('\t')
        X.append(columns[-3]) #using the 'span' of PCL to train
        y.append(cat_map[columns[-2]])

    X_train, X_test, X_val = np.split(X, [int(.6*len(X)), int(.8*len(X))])
    y_train, y_test, y_val = np.split(y, [int(.6*len(y)), int(.8*len(y))])

    return X_train.tolist(), y_train.tolist(), X_test.tolist(), \
           y_test.tolist(), X_val.tolist(), y_val.tolist()


pcl = open('/content/drive/My Drive/data/dontpatronizeme_categories.tsv').read() #TODO some sentences have <h>, not sure if mistake
X_train, y_train, X_test, y_test, X_val, y_val = train_test_val(pcl)

tokenizer = AutoTokenizer.from_pretrained(MODEL)
encoded_train = tokenizer(X_train)
encoded_test = tokenizer(X_test)
encoded_val = tokenizer(X_val)

model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=7)
trainer = Trainer(
    model,
    args,
    train_dataset=Data(encoded_train, y_train),
    eval_dataset=Data(encoded_test, y_test),
    tokenizer=tokenizer
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "

Epoch,Training Loss,Validation Loss
1,No log,1.296107
2,No log,1.164997
3,No log,1.149105
4,No log,1.171006
5,1.034100,1.171144


***** Running Evaluation *****
  Num examples = 558
  Batch size = 16
Saving model checkpoint to bert-base-uncased-finetuned/checkpoint-105
Configuration saved in bert-base-uncased-finetuned/checkpoint-105/config.json
Model weights saved in bert-base-uncased-finetuned/checkpoint-105/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned/checkpoint-105/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned/checkpoint-105/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 558
  Batch size = 16
Saving model checkpoint to bert-base-uncased-finetuned/checkpoint-210
Configuration saved in bert-base-uncased-finetuned/checkpoint-210/config.json
Model weights saved in bert-base-uncased-finetuned/checkpoint-210/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned/checkpoint-210/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned/checkpoint-210/special_tokens_map.json
***** Runnin

TrainOutput(global_step=525, training_loss=1.0118289130074638, metrics={'train_runtime': 222.5358, 'train_samples_per_second': 37.634, 'train_steps_per_second': 2.359, 'total_flos': 206504170861800.0, 'train_loss': 1.0118289130074638, 'epoch': 5.0})

In [26]:
from sklearn.metrics import precision_score

outputs = trainer.predict(Data(encoded_val, y_val))
y_pred = outputs.predictions.argmax(1)
print(outputs.metrics)
print(precision_score(y_val, y_pred, average=None, zero_division=1)) #last class has no samples represented as it's too small

***** Running Prediction *****
  Num examples = 559
  Batch size = 16


{'test_loss': 1.239670991897583, 'test_runtime': 3.9807, 'test_samples_per_second': 140.426, 'test_steps_per_second': 8.792}
[0.62184874 0.59459459 0.30612245 0.29166667 0.73529412 0.53107345
 1.        ]
