<a href="https://colab.research.google.com/github/xutian1113/pytorch_practice/blob/main/glue_mrpc_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install tqdm




### dataset Description
```
DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})
```

### dataset["train"][0]
```
{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}
```
sentence1 and sentence2. if the two sentences are semantically equivalent (i.e. they are paraphrases), the label is 1,
otherwise, the label is 0

### dataset statistics
- train: ({0: 1194, 1: 2474}, 0.3255179934569248)
- val: ({0: 129, 1: 279}, 0.3161764705882353)
- test: ({0: 578, 1: 1147}, 0.33507246376811595)


In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix

from tqdm import tqdm



In [3]:
def accuracy(preds, labels):
  _, predicted = torch.max(preds, 1)
  correct = (predicted == labels).sum().item()
  return correct / len(labels)

def f1_score(preds, labels):
  _, predicted = torch.max(preds, 1)
  tp = ((predicted == 1) & (labels == 1)).sum().item()
  fp = ((predicted == 1) & (labels == 0)).sum().item()
  fn = ((predicted == 0) & (labels == 1)).sum().item()
  precision = tp / (tp + fp) if tp + fp > 0 else 0
  recall = tp / (tp + fn) if tp + fn > 0 else 0
  f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
  return f1

def precision(preds, labels):
  _, predicted = torch.max(preds, 1)
  tp = ((predicted == 1) & (labels == 1)).sum().item()
  fp = ((predicted == 1) & (labels == 0)).sum().item()
  precision = tp / (tp + fp) if tp + fp > 0 else 0
  return precision


def auc_roc(preds, labels):
  # Convert predictions and labels to NumPy arrays
  _, predicted = torch.max(preds, 1)
  preds_np = predicted.detach().cpu().numpy()
  labels_np = labels.detach().cpu().numpy()

  # Calculate AUC-ROC using scikit-learn's roc_auc_score
  auc_score = roc_auc_score(labels_np, preds_np)

  return auc_score


def auc_pr(preds, labels):
  """
  Calculates the AUPRC score for binary classification.

  Args:
    preds: Predicted probabilities (tensor) for the positive class.
    labels: True labels (tensor) (0 or 1).

  Returns:
    AUPRC score (float).
  """
  # Convert predictions and labels to NumPy arrays
  _, predicted = torch.max(preds, 1)
  preds_np = predicted.detach().cpu().numpy()
  labels_np = labels.detach().cpu().numpy()

  # Calculate AUPRC using scikit-learn's average_precision_score
  auprc_score = average_precision_score(labels_np, preds_np)

  return auprc_score

def confusion_matrix_func(preds, labels, num_classes=2):
  """
  Calculates the confusion matrix.

  Args:
    preds: Predicted labels (tensor).
    labels: True labels (tensor).
    num_classes: Number of classes. Defaults to 2 for binary classification.

  Returns:
    Confusion matrix (NumPy array).
  """
  # Get predicted class indices
  _, predicted = torch.max(preds, 1)

  # Convert predictions and labels to NumPy arrays
  predicted_np = predicted.detach().cpu().numpy()
  labels_np = labels.detach().cpu().numpy()

  # Calculate confusion matrix using scikit-learn's confusion_matrix
  cm = confusion_matrix(labels_np, predicted_np, labels=range(num_classes))

  return cm

def recall(preds, labels):
  """
  Calculates the recall score for binary classification.

  Args:
    preds: Predicted labels (tensor).
    labels: True labels (tensor).

  Returns:
    Recall score (float).
  """
  # Get predicted class indices
  _, predicted = torch.max(preds, 1)

  # Calculate true positives (TP), false negatives (FN)
  tp = ((predicted == 1) & (labels == 1)).sum().item()
  fn = ((predicted == 0) & (labels == 1)).sum().item()

  # Calculate recall
  recall_score = tp / (tp + fn) if tp + fn > 0 else 0  # Avoid division by zero

  return recall_score



In [4]:
# 1. load the GLUE MRPC dataset
dataset = load_dataset('glue', 'mrpc')

# 2. load the tokenizer for bert-base-cased
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# converting raw text into numerical representations that can be processed by BERT model.
'''

1. splits input text into tokens
2. maps tokens to numerical IDs using a vocabulary
3. adds special tokens required by BERT
4. handles padding and truncation to ensure uniform input size

'''

# 3. define a tokenization function for sentence pair

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True, padding="max_length", max_length=128)

'''
input_ids: Tokenized numerical representation of both sentences.
token_type_ids: Differentiates sentence1 (0) from sentence2 (1).
attention_mask: Indicates which tokens are real (1) and padding (0).
'''
# 4. Tokenize the dataset (batched processing)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 5. Remove unnecessary columns and set the format to PyTorch tensors
# Remove columns that are not used by the model (like the original sentences and index)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets.set_format("torch")

# 6. Create DataLoaders for training and validation
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8)
val_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=8)


# 7. Set up the device, model, optimizer, and scheduler
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
model.to(device)

# optimizer = AdamW(model.parameters(), lr=2e-5)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
epochs = 10
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model.train()
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    total_loss = 0
    all_preds = []
    all_labels = []
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}"):
        # Move batch to the device
        batch = {key: value.to(device) for key, value in batch.items()}
        batch['labels'] = batch['label']
        del batch['label']
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        # predictions = torch.argmax(outputs.logits, dim=-1)
        all_preds.append(outputs.logits.detach())
        all_labels.append(batch['labels'].detach())

    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)

    train_acc = accuracy(all_preds, all_labels)
    train_auc = auc_roc(all_preds, all_labels)
    train_rec = recall(all_preds, all_labels)
    train_avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {train_avg_loss:.4f}, Accuracy: {train_acc:.4f}, AUC-ROC: {train_auc:.4f}, Recall: {train_rec:.4f}")

    # 9. Validation loop
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    val_preds = []
    val_labels = []
    for batch in val_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        batch['labels'] = batch['label']
        del batch['label']
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        val_preds.append(logits.detach())
        val_labels.append(batch['labels'].detach())

    val_preds = torch.cat(val_preds)
    val_labels = torch.cat(val_labels)
    val_acc = accuracy(val_preds, val_labels)
    val_auc = auc_roc(val_preds, val_labels)
    val_rec = recall(val_preds, val_labels)
    print(f"Validation - Accuracy: {val_acc:.4f}, AUC-ROC: {val_auc:.4f}, Recall: {val_rec:.4f}")


    model.train()  # Switch back to training mode for the next epoch


Epoch 1/10


Training Epoch 1: 100%|██████████| 459/459 [00:22<00:00, 20.39it/s]


Epoch 1/10 - Loss: 0.5199, Accuracy: 0.7481, AUC-ROC: 0.6720, Recall: 0.8901
Validation - Accuracy: 0.8578, AUC-ROC: 0.8127, Recall: 0.9355
Epoch 2/10


Training Epoch 2: 100%|██████████| 459/459 [00:22<00:00, 20.86it/s]


Epoch 2/10 - Loss: 0.2536, Accuracy: 0.9019, AUC-ROC: 0.8854, Recall: 0.9325
Validation - Accuracy: 0.8358, AUC-ROC: 0.7632, Recall: 0.9606
Epoch 3/10


Training Epoch 3: 100%|██████████| 459/459 [00:21<00:00, 20.87it/s]


Epoch 3/10 - Loss: 0.1024, Accuracy: 0.9684, AUC-ROC: 0.9633, Recall: 0.9778
Validation - Accuracy: 0.8554, AUC-ROC: 0.7901, Recall: 0.9677
Epoch 4/10


Training Epoch 4: 100%|██████████| 459/459 [00:21<00:00, 20.88it/s]


Epoch 4/10 - Loss: 0.0445, Accuracy: 0.9869, AUC-ROC: 0.9844, Recall: 0.9915
Validation - Accuracy: 0.8578, AUC-ROC: 0.8127, Recall: 0.9355
Epoch 5/10


Training Epoch 5: 100%|██████████| 459/459 [00:21<00:00, 20.88it/s]


Epoch 5/10 - Loss: 0.0269, Accuracy: 0.9918, AUC-ROC: 0.9918, Recall: 0.9919
Validation - Accuracy: 0.8554, AUC-ROC: 0.7963, Recall: 0.9570
Epoch 6/10


Training Epoch 6: 100%|██████████| 459/459 [00:21<00:00, 20.89it/s]


Epoch 6/10 - Loss: 0.0138, Accuracy: 0.9967, AUC-ROC: 0.9963, Recall: 0.9976
Validation - Accuracy: 0.8529, AUC-ROC: 0.8008, Recall: 0.9427
Epoch 7/10


Training Epoch 7: 100%|██████████| 459/459 [00:21<00:00, 20.88it/s]


Epoch 7/10 - Loss: 0.0095, Accuracy: 0.9970, AUC-ROC: 0.9967, Recall: 0.9976
Validation - Accuracy: 0.8529, AUC-ROC: 0.8133, Recall: 0.9211
Epoch 8/10


Training Epoch 8: 100%|██████████| 459/459 [00:22<00:00, 20.86it/s]


Epoch 8/10 - Loss: 0.0032, Accuracy: 0.9986, AUC-ROC: 0.9986, Recall: 0.9988
Validation - Accuracy: 0.8529, AUC-ROC: 0.8050, Recall: 0.9355
Epoch 9/10


Training Epoch 9: 100%|██████████| 459/459 [00:22<00:00, 20.85it/s]


Epoch 9/10 - Loss: 0.0034, Accuracy: 0.9989, AUC-ROC: 0.9990, Recall: 0.9988
Validation - Accuracy: 0.8554, AUC-ROC: 0.8026, Recall: 0.9462
Epoch 10/10


Training Epoch 10: 100%|██████████| 459/459 [00:21<00:00, 20.89it/s]


Epoch 10/10 - Loss: 0.0034, Accuracy: 0.9995, AUC-ROC: 0.9992, Recall: 1.0000
Validation - Accuracy: 0.8554, AUC-ROC: 0.8026, Recall: 0.9462


In [6]:
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=8)
model.eval()
test_preds = []
test_labels = []

for batch in test_dataloader:
    batch = {key: value.to(device) for key, value in batch.items()}
    batch['labels'] = batch['label']
    del batch['label']
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    test_preds.append(logits.detach())
    test_labels.append(batch['labels'].detach())

test_preds = torch.cat(test_preds)
test_labels = torch.cat(test_labels)

test_acc = accuracy(test_preds, test_labels)
test_auc = auc_roc(test_preds, test_labels)
test_rec = recall(test_preds, test_labels)
print(f"Test - Accuracy: {test_acc:.4f}, AUC-ROC: {test_auc:.4f}, Recall: {test_rec:.4f}")

Test - Accuracy: 0.8435, AUC-ROC: 0.8051, Recall: 0.9215
