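Adapted from the Hugging Face Transformers multi-label classification tutorial (abhimishra91/transformers-tutorials), modified here for single-label, four-class text classification: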
https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [None]:
from torch import cuda

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the data files
# read below (under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv

# Raise the csv module's per-field size cap before reading the data files.
csv.field_size_limit(999999)

# Read both files with no header row, naming the two columns 'class' and 'text'.
# The training file is read first, then the test file.
train_df, test_df = (
    pd.read_csv(csv_path, header=None, names=['class', 'text'])
    for csv_path in (
        '/content/drive/MyDrive/fulltrain.csv',
        '/content/drive/MyDrive/balancedtest.csv',
    )
)

In [None]:
# Hyperparameters and tokenizer shared by the cells below.
MAX_LEN = 512  # token length every example is padded/truncated to by BERTDataset
TRAIN_BATCH_SIZE = 16  # batch size of the training DataLoader
VALID_BATCH_SIZE = 8  # batch size of the test-set DataLoader
EPOCHS = 1  # number of times train() is called
LEARNING_RATE = 5e-05  # learning rate handed to the Adam optimizer
# Loads the tokenizer files for the 'bert-base-uncased' checkpoint
# (downloaded from the Hugging Face Hub on first use).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for BERT classification.

    Args:
        texts: Sized, indexable collection of raw strings (e.g. a list or a
            pandas Series).
        labels: Sized, indexable collection of integer class ids, aligned
            position-by-position with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_token_len: Length every encoded example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.

    Each item is a dict of ``torch.long`` tensors with the keys ``'ids'``,
    ``'mask'``, ``'token_type_ids'`` and ``'labels'``.
    """

    def __init__(self, texts, labels, tokenizer, max_token_len=256):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(sequence, position):
        """Return the element at integer ``position``, ignoring any pandas index labels."""
        # A DataLoader hands out positions 0..len-1. Plain ``series[i]`` is a
        # *label* lookup, which breaks (KeyError or wrong row) as soon as the
        # Series index is not the default 0..n-1 range, e.g. after filtering
        # or shuffling the frame. ``.iloc`` is always positional.
        positional = getattr(sequence, 'iloc', None)
        return sequence[position] if positional is None else positional[position]

    def __getitem__(self, item_index):
        text = self._item_at(self.texts, item_index)
        label = self._item_at(self.labels, item_index)

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_token_len,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,  # Adds '[CLS]' and '[SEP]'
            return_attention_mask=True,
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoding['token_type_ids'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Inputs come from the 'text' column; targets come from the 'class' column,
# shifted down by one before being handed to the datasets.
X_train, X_test = train_df['text'], test_df['text']
y_train, y_test = train_df['class'] - 1, test_df['class'] - 1

# Wrap each split; every item is tokenized to MAX_LEN tokens when accessed.
training_set = BERTDataset(texts=X_train, labels=y_train, tokenizer=tokenizer, max_token_len=MAX_LEN)
testing_set = BERTDataset(texts=X_test, labels=y_test, tokenizer=tokenizer, max_token_len=MAX_LEN)

In [None]:
# Number of training examples per (shifted) class id.
y_train.value_counts()

class
2    17870
0    14047
3     9995
1     6942
Name: count, dtype: int64

In [None]:
# Number of test examples per (shifted) class id.
y_test.value_counts()

class
0    750
1    750
2    750
3    750
Name: count, dtype: int64

In [None]:
# Training batches are reshuffled each time the loader is iterated.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation gains nothing from shuffling; a fixed order keeps the collected
# predictions in the same order as the examples in testing_set.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class BERTClass(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output logits produced per example.
        dropout: Dropout probability applied to the pooled encoder output.
        pretrained_name: Checkpoint name or path passed to
            ``transformers.BertModel.from_pretrained``.

    The submodule names ``l1`` / ``l2`` / ``l3`` are intentionally unchanged so
    that ``state_dict`` files saved from this class keep loading.
    """

    def __init__(self, num_classes=4, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained(pretrained_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Size the head from the encoder's own config rather than hard-coding
        # 768, so checkpoints with a different hidden width also work.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_classes)

    def forward(self, ids, mask, token_type_ids):
        """Return unnormalized class logits for a batch of encoded inputs."""
        # With return_dict=False the encoder returns a tuple; the second
        # element is the one fed to the classification head.
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        return self.l3(self.l2(pooled))

# Build the classifier, then move its parameters to the selected device.
model = BERTClass()
# As the cell's last expression, this call also echoes the module structure.
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
# Multi-class objective: applied in train() to the model's raw logits and integer class ids.
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Optimize every parameter of the model (encoder and head) with one learning rate.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch, min_loss=0.001):
    """Run one pass over ``training_loader``, updating the notebook-level ``model``.

    Args:
        epoch: Epoch number; used only in the progress messages.
        min_loss: The pass stops as soon as a batch loss falls below this
            value (that batch is not used for an update). Pass ``None`` to
            always consume the whole loader. The default preserves the
            notebook's existing behavior; note that a single easy batch is
            enough to trigger the early exit.

    Relies on the notebook-level ``model``, ``training_loader``, ``loss_fn``,
    ``optimizer`` and ``device``.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        # The loss is computed on integer class ids, so move the labels as
        # long directly instead of casting to float and back.
        labels = data['labels'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, labels)

        # Read the scalar once; it is used for both the threshold and the log.
        loss_value = loss.item()
        if min_loss is not None and loss_value < min_loss:
            break
        if step % 10 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss_value}')

        # Clear gradients left over from the previous step, then update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Call train() once per epoch; train() prints the batch loss every 10 steps.
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  1.5796750783920288
Epoch: 0, Loss:  1.0998647212982178
Epoch: 0, Loss:  0.6066374778747559
Epoch: 0, Loss:  0.7555053234100342
Epoch: 0, Loss:  0.7150360345840454
Epoch: 0, Loss:  0.3286060690879822
Epoch: 0, Loss:  0.33396753668785095
Epoch: 0, Loss:  0.1957828253507614
Epoch: 0, Loss:  0.4285515546798706
Epoch: 0, Loss:  0.14623130857944489
Epoch: 0, Loss:  0.29962119460105896
Epoch: 0, Loss:  0.09654861688613892
Epoch: 0, Loss:  0.2466573417186737
Epoch: 0, Loss:  0.38201668858528137
Epoch: 0, Loss:  0.18343350291252136
Epoch: 0, Loss:  0.07868955284357071
Epoch: 0, Loss:  0.06012280657887459
Epoch: 0, Loss:  0.06402578949928284
Epoch: 0, Loss:  0.10873911529779434
Epoch: 0, Loss:  0.02949383109807968
Epoch: 0, Loss:  0.2703172266483307
Epoch: 0, Loss:  0.2155565321445465
Epoch: 0, Loss:  0.03169868513941765
Epoch: 0, Loss:  0.040695857256650925
Epoch: 0, Loss:  0.0166550874710083
Epoch: 0, Loss:  0.015261387452483177
Epoch: 0, Loss:  0.012182851321995258
Epoch: 0, 

In [None]:
# Persist only the model's state_dict (not the module object) to Google Drive.
torch.save(model.state_dict(), '/content/drive/MyDrive/model_weights.pth')

In [None]:
# Switch to evaluation mode before scoring the test set.
model.eval()

# Filled batch by batch; consumed by the reporting cell.
true_labels, predictions = [], []

# Inference only, so gradient tracking is turned off.
with torch.no_grad():
    for batch in testing_loader:
        input_ids = batch['ids'].to(device)
        attention_mask = batch['mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask, token_type_ids)
        # The predicted class is the position of the largest logit in each row.
        predicted_labels = torch.max(outputs, dim=1).indices

        true_labels.extend(labels.cpu().numpy())
        predictions.extend(predicted_labels.cpu().numpy())

In [None]:
from sklearn.metrics import classification_report

# Text summary of precision, recall, F1 and support for the collected predictions.
report_text = classification_report(y_true=true_labels, y_pred=predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.92      0.64      0.75       750
           1       0.69      0.72      0.71       750
           2       0.14      0.04      0.06       750
           3       0.50      0.98      0.66       750

    accuracy                           0.60      3000
   macro avg       0.56      0.60      0.55      3000
weighted avg       0.56      0.60      0.55      3000

