In [1]:
# Install required libraries
!pip install datasets transformers evaluate
!apt-get install git-lfs


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [3

In [2]:
from datasets import load_dataset

# Load the "sst2" configuration of the "glue" dataset and show its splits,
# column names and row counts.
sst2 = load_dataset(path="glue", name="sst2")
print(sst2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


In [7]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Configuration, named once so the tokenizer and the model cannot drift onto
# different checkpoints and the seed is visible and easy to change.
SEED = 42
MODEL_NAME = "bert-base-uncased"
NUM_LABELS = 2  # size of the classification head

# Seed PyTorch and NumPy before the model is built, because the classification
# head is newly (randomly) initialized when the checkpoint is loaded.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Tokenizer that belongs to the chosen checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Pretrained BERT with a sequence-classification head of NUM_LABELS outputs.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)


# Prepare the datasets
def tokenize_function(examples):
    """Encode the ``"sentence"`` entry of ``examples`` with the notebook's tokenizer.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and ``max_length=128``, i.e. it is asked for fixed-length sequences of
    128 tokens.

    Args:
        examples: Mapping with a ``"sentence"`` entry holding the text to encode
            (a batch of sentences when used with ``map(..., batched=True)``).

    Returns:
        The tokenizer's output for those sentences.
    """
    sentences = examples["sentence"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(sentences, **encoding_options)

# Tokenize every SST2 split, then expose only the columns the model consumes
# (input ids, attention mask, label) as torch tensors.
tokenized_sst2 = {}
for split, raw_split in sst2.items():
    tokenized_sst2[split] = (
        raw_split
        .map(tokenize_function, batched=True)
        .with_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    )


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


In [8]:
# Define evaluation metric
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into classification metrics.

    The predicted class of each example is the arg-max over the last axis of
    the logits. Precision, recall and F1 are computed with
    ``average='binary'``.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)

    metrics = {'accuracy': accuracy_score(references, predicted_classes)}
    prf_and_support = precision_recall_fscore_support(
        references, predicted_classes, average='binary'
    )
    # The fourth returned value (support) is dropped: zip stops after the
    # three metric names.
    metrics.update(zip(('precision', 'recall', 'f1'), prf_and_support))
    return metrics

# Common training arguments for all finetuning experiments.
# The dict is unpacked as keyword arguments into `TrainingArguments`, so every
# key has to be a parameter name that class accepts.
base_training_args = {
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    'num_train_epochs': 3,
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'logging_dir': './logs/',
    'logging_steps': 100,
    # `eval_strategy` is the current parameter name; the older spelling
    # `evaluation_strategy` is deprecated and is rejected by newer
    # transformers releases, which the unpinned install can pull in.
    'eval_strategy': 'epoch',
    # Kept equal to the evaluation strategy: loading the best model at the end
    # needs a saved checkpoint for each evaluated point.
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'accuracy',
    'greater_is_better': True,
    'report_to': "none"
}

In [11]:
# Baseline run: fine-tune the model on the SST2 train split, using the
# validation split as the evaluation set.
baseline_args = dict(base_training_args)
baseline_training_args = TrainingArguments(
    output_dir='./results/baseline',
    **baseline_args,
)

trainer = Trainer(
    model=model,
    args=baseline_training_args,
    train_dataset=tokenized_sst2['train'],
    eval_dataset=tokenized_sst2['validation'],
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate the trained model on the validation split and show the metrics
results = trainer.evaluate()
print(results)




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1667,0.247271,0.926606,0.93578,0.918919,0.927273
2,0.1121,0.307257,0.919725,0.899573,0.948198,0.923246
3,0.0719,0.343281,0.933486,0.92511,0.945946,0.935412


{'eval_loss': 0.34328147768974304, 'eval_accuracy': 0.9334862385321101, 'eval_precision': 0.9251101321585903, 'eval_recall': 0.9459459459459459, 'eval_f1': 0.9354120267260579, 'eval_runtime': 1.5976, 'eval_samples_per_second': 545.827, 'eval_steps_per_second': 34.427, 'epoch': 3.0}
