Adapted from Hugging Face transformer tutorial:
https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb

In [18]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import RobertaModel, RobertaTokenizer
from sklearn.utils.class_weight import compute_class_weight

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import csv
csv.field_size_limit(999999)
train_df = pd.read_csv('/content/drive/MyDrive/fulltrain.csv', header = None, names=['class','text'])
test_df = pd.read_csv('/content/drive/MyDrive/balancedtest.csv', header = None, names=['class','text'])

In [5]:
# Setting up variables
MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 5e-05
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [6]:
class BERTDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_token_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item_index):
        text = self.texts[item_index]
        label = self.labels[item_index]

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_token_len,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,  # Adds '[CLS]' and '[SEP]'
            return_attention_mask=True,
            return_token_type_ids=True
        )

        ids = encoding['input_ids']
        mask = encoding['attention_mask']
        token_type_ids = encoding["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [7]:
X_train = train_df['text']
y_train = train_df['class'] - 1
X_test = test_df['text']
y_test = test_df['class'] - 1
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

training_set = BERTDataset(X_train, y_train, tokenizer, MAX_LEN)
testing_set = BERTDataset(X_test, y_test, tokenizer, MAX_LEN)

In [None]:
y_train.value_counts()

class
2    17870
0    14047
3     9995
1     6942
Name: count, dtype: int64

In [None]:
y_test.value_counts()

class
0    750
1    750
2    750
3    750
Name: count, dtype: int64

In [8]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [9]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.RobertaModel.from_pretrained('roberta-base')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 4)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

In [10]:
loss_fn = torch.nn.CrossEntropyLoss()

In [11]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [13]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        labels = data['labels'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, labels.long())
        if _%100==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [14]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  1.3302911520004272
Epoch: 0, Loss:  0.3895200788974762
Epoch: 0, Loss:  0.14917173981666565
Epoch: 0, Loss:  0.05304345861077309
Epoch: 0, Loss:  0.06052478775382042
Epoch: 0, Loss:  0.02350599505007267
Epoch: 0, Loss:  0.015668168663978577
Epoch: 0, Loss:  0.006012363824993372
Epoch: 0, Loss:  0.0029221661388874054
Epoch: 0, Loss:  0.009619610384106636
Epoch: 0, Loss:  0.009174547158181667
Epoch: 0, Loss:  0.060941681265830994
Epoch: 0, Loss:  0.0009526272770017385
Epoch: 0, Loss:  0.040544770658016205
Epoch: 0, Loss:  0.0016735108802095056
Epoch: 0, Loss:  0.0015183871146291494
Epoch: 0, Loss:  0.05427601560950279
Epoch: 0, Loss:  0.0031080672051757574
Epoch: 0, Loss:  0.0018963445909321308
Epoch: 0, Loss:  0.0005505990120582283
Epoch: 0, Loss:  0.012696269899606705
Epoch: 0, Loss:  0.23194220662117004
Epoch: 0, Loss:  0.0568280927836895
Epoch: 0, Loss:  0.028120772913098335
Epoch: 0, Loss:  0.02200489304959774
Epoch: 0, Loss:  0.0052482872270047665
Epoch: 0, Loss:  

In [15]:
torch.save(model.state_dict(), '/content/drive/MyDrive/model_weights.pth')

In [16]:
model.eval()  # Set the model to evaluation mode

true_labels = []
predictions = []

with torch.no_grad():  # No need to track gradients for validation
    for batch in testing_loader:
        # Adjust these lines to match your DataLoader structure
        input_ids, attention_mask, token_type_ids, labels = batch['ids'], batch['mask'], batch['token_type_ids'], batch['labels']
        input_ids, attention_mask, token_type_ids, labels = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask, token_type_ids)
        _, predicted_labels = torch.max(outputs, dim=1)

        true_labels.extend(labels.cpu().numpy())
        predictions.extend(predicted_labels.cpu().numpy())

In [17]:
from sklearn.metrics import classification_report
print(classification_report(true_labels, predictions))

              precision    recall  f1-score   support

           0       0.94      0.78      0.85       750
           1       0.91      0.65      0.76       750
           2       0.14      0.07      0.09       750
           3       0.50      0.99      0.66       750

    accuracy                           0.62      3000
   macro avg       0.62      0.62      0.59      3000
weighted avg       0.62      0.62      0.59      3000

