In [1]:
# Pinned to the release this notebook was executed with so a fresh run resolves
# the same API; %pip installs into the environment of the running kernel.
%pip install -q transformers==4.34.0

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
import torch
import numpy as np
import pandas as pd
from torch import nn
from tqdm import tqdm
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, AdamW, BertTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error

# --- Run configuration -------------------------------------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint name used for both the tokenizer and the encoder.
bert_model_name = 'bert-base-uncased'
# Width of the classifier's output layer (one logit per class).
num_classes = 2
# Every text is padded / truncated to this many tokens.
max_length = 128
# Mini-batch size shared by the training and evaluation loaders.
batch_size = 16
# Number of full passes over the training split.
num_epochs = 10
# Initial learning rate handed to the optimizer.
learning_rate = 2e-5


class TextClassificationDataset(Dataset):
    """Map-style dataset that pairs raw texts with their labels.

    Tokenization is done lazily, one sample at a time, in ``__getitem__``.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``;
            its result must expose 'input_ids' and 'attention_mask' tensors.
        max_length: Length forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors in a dict.

        Returns:
            dict with keys 'input_ids' and 'attention_mask' (the tokenizer
            outputs flattened to 1-D) and 'label' (the label wrapped in a
            tensor).
        """
        text, label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output carries a leading batch axis; flatten() removes
        # it so the DataLoader can stack samples into a batch itself.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(label)
        return sample


class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and one linear classification layer.

    Args:
        bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits (one per class).
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        # Pre-trained encoder loaded from the given checkpoint.
        self.bert = BertModel.from_pretrained(bert_model_name)
        # Dropout (p = 0.1) between the encoder output and the linear head.
        self.dropout = nn.Dropout(0.1)
        # Linear head mapping the encoder's hidden size to num_classes logits.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores (logits) for a batch.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output is BERT's pooled sequence representation: the final
        # hidden state of the first ([CLS]) token after the model's pooling
        # layer (dense + tanh), not the raw last hidden state.
        pooled = encoder_out.pooler_output
        return self.fc(self.dropout(pooled))


def train(model, data_loader, optimizer, scheduler, device):
    """Run one pass of gradient updates over every batch in ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.
        optimizer: Optimizer that updates the model's parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        None. The model, optimizer and scheduler are updated in place.
    """
    # Put the model into training mode.
    model.train()
    for batch in data_loader:
        input_ids, attention_mask, targets = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        )
        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        # Cross-entropy between the logits and the integer class targets
        # (log-softmax followed by negative log-likelihood).
        batch_loss = nn.functional.cross_entropy(logits, targets)
        batch_loss.backward()
        # Apply the parameter update, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()


def evaluate(model, data_loader, device):
    """Predict a class for every sample in ``data_loader`` and score the result.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning one row of class logits per sample.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A 3-tuple ``(accuracy, report, mse)``: the results of
        ``accuracy_score``, ``classification_report`` and
        ``mean_squared_error``, each called with (actual labels, predictions).
    """
    # Put the model into evaluation mode.
    model.eval()
    y_true, y_pred = [], []
    # No gradients are needed for scoring, so skip building the autograd graph.
    with torch.no_grad():
        for batch in data_loader:
            targets = batch['label'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # Predicted class = column index of the largest logit in each row.
            batch_preds = logits.max(dim=1).indices
            y_pred += batch_preds.cpu().tolist()
            y_true += targets.cpu().tolist()
    return (
        accuracy_score(y_true, y_pred),
        classification_report(y_true, y_pred),
        mean_squared_error(y_true, y_pred),
    )


if __name__ == "__main__":
    # Seed PyTorch's global RNG so the classifier-head initialisation,
    # DataLoader shuffling and dropout are repeatable between runs; the
    # train/eval split below is already seeded through random_state.
    torch.manual_seed(42)

    # The CSV must provide a "text" column and a "label" column.
    df = pd.read_csv("cert_dataset.csv")
    cert_texts = list(df["text"])
    cert_labels = list(df["label"])

    # Hold out 40 % of the rows for evaluation (shuffled, fixed seed).
    train_texts, eval_texts, train_labels, eval_labels = train_test_split(
        cert_texts, cert_labels, test_size=0.4, shuffle=True, random_state=42)

    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    train_dataset = TextClassificationDataset(
        train_texts, train_labels, tokenizer, max_length)
    eval_dataset = TextClassificationDataset(
        eval_texts, eval_labels, tokenizer, max_length)
    # Only the training loader shuffles; evaluation keeps dataset order.
    train_dataloader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True)
    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)

    # `device` is the module-level device selected with the other settings.
    model = BERTClassifier(bert_model_name, num_classes).to(device)

    # Use PyTorch's AdamW: the AdamW class shipped with transformers is
    # deprecated in favour of it. eps and weight_decay are set to the
    # deprecated class's defaults so the optimisation setup stays comparable.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
    # The scheduler is stepped once per batch, so the total number of
    # scheduler steps is batches-per-epoch * epochs.
    total_steps = len(train_dataloader) * num_epochs
    # Linear decay of the learning rate to zero over total_steps, no warm-up.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report, mse = evaluate(model, eval_dataloader, device)
        print(f"Validation Accuracy: {accuracy}")
        print(f"mean_squared_error: {mse}")
        print(report)
    # Persist the weights of the model as it stands after the final epoch.
    torch.save(model.state_dict(), 'BERT_model_state.pt')


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Epoch 1/10
Validation Accuracy: 0.9818712231714941
mean_squared_error: 0.01812877682850594
              precision    recall  f1-score   support

           0       0.97      0.92      0.94       781
           1       0.98      0.99      0.99      4018

    accuracy                           0.98      4799
   macro avg       0.98      0.96      0.97      4799
weighted avg       0.98      0.98      0.98      4799

Epoch 2/10
Validation Accuracy: 0.9885392790164618
mean_squared_error: 0.011460720983538238
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       781
           1       0.99      1.00      0.99      4018

    accuracy                           0.99      4799
   macro avg       0.99      0.97      0.98      4799
weighted avg       0.99      0.99      0.99      4799

Epoch 3/10
Validation Accuracy: 0.9881225255261513
mean_squared_error: 0.011877474473848718
              precision    recall  f1-score   support

           0    