In [None]:
!pip install datasets transformers evaluate optuna
!apt-get install git-lfs


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading datasets-3.5.0-py3-non

In [None]:
from datasets import load_dataset
# Fetch the IMDB dataset and print a summary of its splits.
imdb = load_dataset(path="imdb")
print(imdb)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [None]:
# Shuffle both labelled splits with a fixed seed so the row order (and hence
# the tuning split below) is the same on every run.
train_dataset = imdb['train'].shuffle(seed=42)
test_dataset = imdb['test'].shuffle(seed=42)

# For hyperparameter tuning: the first 30% of the shuffled training rows are
# held out for validation, the remaining 70% are used for training.
val_split_ratio = 0.3
num_train_rows = len(train_dataset)
val_size = int(num_train_rows * val_split_ratio)

# `Dataset.select` accepts a range directly, so the indices do not need to be
# materialised into a Python list first.
val_dataset_ht = train_dataset.select(range(val_size))
train_dataset_ht = train_dataset.select(range(val_size, num_train_rows))

print(f"Full train dataset size: {len(train_dataset)}")
print(f"Validation dataset size for hyperparameter tuning: {len(val_dataset_ht)}")
print(f"Train dataset size for hyperparameter tuning: {len(train_dataset_ht)}")

Full train dataset size: 25000
Validation dataset size for hyperparameter tuning: 7500
Train dataset size for hyperparameter tuning: 17500


In [None]:
import torch
import optuna
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import evaluate

# Seed PyTorch's global random number generator.
torch.manual_seed(seed=42)

# Load the tokenizer that matches the bert-base-uncased checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook's tokenizer.

    Each sequence is truncated to at most 512 tokens and padded up to exactly
    512 tokens, so every encoded example has the same length.
    """
    texts = examples["text"]
    # BERT has a max sequence length of 512
    return tokenizer(texts, max_length=512, truncation=True, padding="max_length")

# Tokenize the tuning splits in batches (train first, then validation).
train_dataset_ht, val_dataset_ht = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset_ht, val_dataset_ht)
)

# Expose only the columns the model consumes, returned as PyTorch tensors.
train_dataset_ht.set_format(columns=['input_ids', 'attention_mask', 'label'], type='torch')
val_dataset_ht.set_format(columns=['input_ids', 'attention_mask', 'label'], type='torch')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/17500 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [None]:
# Hyperparameter tuning with Optuna
def objective(trial: optuna.Trial):
    """Fine-tune BERT with one sampled hyperparameter set and return its score.

    A fresh ``bert-base-uncased`` classifier is created for every trial,
    trained on ``train_dataset_ht`` and evaluated on ``val_dataset_ht`` after
    each epoch.

    Args:
        trial: Optuna trial used to sample the batch size, learning rate,
            number of epochs and weight decay.

    Returns:
        The validation loss reported by ``trainer.evaluate()``. Because
        ``load_best_model_at_end=True`` with ``metric_for_best_model="eval_loss"``,
        this is the loss of the checkpoint with the lowest validation loss,
        which is not necessarily the checkpoint from the last epoch.
    """
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

    # Define the hyperparameters to tune
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_categorical('learning_rate', [5e-5, 3e-5, 2e-5])
    num_epochs = trial.suggest_categorical('num_epochs', [2, 3, 4])
    weight_decay = trial.suggest_categorical('weight_decay', [0.01, 0.1])

    training_args = TrainingArguments(
        output_dir=f"./results/trial_{trial.number}",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=weight_decay,
        logging_dir=f"./logs/trial_{trial.number}",
        logging_steps=100,
        # `eval_strategy` is the current name of the deprecated
        # `evaluation_strategy` argument.
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        # Log to TensorBoard only, matching the final training run. Leaving
        # `report_to` unset lets Trainer enable every installed integration;
        # with wandb present that blocks the trial on an interactive API-key
        # prompt.
        report_to="tensorboard",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_ht,
        eval_dataset=val_dataset_ht
    )

    trainer.train()

    # Evaluate the (best, reloaded) model on the validation dataset
    eval_result = trainer.evaluate()

    # Return validation loss
    return eval_result["eval_loss"]

# Run Optuna study
print("Starting hyperparameter tuning...")
study = optuna.create_study(
    study_name='hp-search-bert',
    direction='minimize',
    # Seed the sampler so the sequence of sampled hyperparameters is
    # repeatable across runs; the default sampler is unseeded.
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(func=objective, n_trials=5)

# Read the winning configuration once and coerce each value to the type the
# final TrainingArguments expects.
best_params = study.best_params
best_lr = float(best_params['learning_rate'])
best_batch_size = best_params['batch_size']
best_epoch = int(best_params['num_epochs'])
best_weight_decay = float(best_params['weight_decay'])

print(f"Best Learning Rate: {best_lr}")
print(f"Best Batch Size: {best_batch_size}")
print(f"Best Epochs: {best_epoch}")
print(f"Best Weight Decay: {best_weight_decay}")

[I 2025-04-08 03:30:02,716] A new study created in memory with name: hp-search-bert


Starting hyperparameter tuning...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msyedalir001[0m ([33msyedalir001-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.2224,0.209591
2,0.13,0.242053
3,0.049,0.297329
4,0.0306,0.335841


[I 2025-04-08 03:58:39,583] Trial 0 finished with value: 0.20959147810935974 and parameters: {'batch_size': 32, 'learning_rate': 5e-05, 'num_epochs': 4, 'weight_decay': 0.01}. Best is trial 0 with value: 0.20959147810935974.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2052,0.186614
2,0.1427,0.283526
3,0.0415,0.306951


[I 2025-04-08 04:18:37,453] Trial 1 finished with value: 0.18661415576934814 and parameters: {'batch_size': 16, 'learning_rate': 3e-05, 'num_epochs': 3, 'weight_decay': 0.1}. Best is trial 1 with value: 0.18661415576934814.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2106,0.192384
2,0.1393,0.247387


[I 2025-04-08 04:32:11,229] Trial 2 finished with value: 0.19238434731960297 and parameters: {'batch_size': 16, 'learning_rate': 3e-05, 'num_epochs': 2, 'weight_decay': 0.01}. Best is trial 1 with value: 0.18661415576934814.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2227,0.228407
2,0.1364,0.291581
3,0.0353,0.340204


[I 2025-04-08 04:52:08,297] Trial 3 finished with value: 0.22840693593025208 and parameters: {'batch_size': 16, 'learning_rate': 5e-05, 'num_epochs': 3, 'weight_decay': 0.01}. Best is trial 1 with value: 0.18661415576934814.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2031,0.209678
2,0.1285,0.242661
3,0.0691,0.280605
4,0.0292,0.322975


[I 2025-04-08 05:17:04,172] Trial 4 finished with value: 0.20967787504196167 and parameters: {'batch_size': 32, 'learning_rate': 3e-05, 'num_epochs': 4, 'weight_decay': 0.1}. Best is trial 1 with value: 0.18661415576934814.


Best Learning Rate: 3e-05
Best Batch Size: 16
Best Epochs: 3
Best Weight Decay: 0.1


In [None]:
# Load the four classification metrics reported for the final model.
accuracy_metric, f1_metric, recall_metric, precision_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "recall", "precision")
)

def compute_metrics(eval_pred):
    """Compute accuracy, F1, recall and precision from logits and labels.

    ``eval_pred`` is unpacked into ``(logits, labels)``; the predicted class is
    the argmax over the last axis of the logits. F1, recall and precision use
    binary averaging.

    Returns:
        A dict with the keys ``"accuracy"``, ``"f1"``, ``"recall"`` and
        ``"precision"``.
    """
    logits, labels = eval_pred
    predictions = torch.tensor(logits).argmax(dim=-1)

    scores = {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
    }
    # The remaining metrics share the same call shape and binary averaging.
    binary_metrics = (("f1", f1_metric), ("recall", recall_metric), ("precision", precision_metric))
    for key, metric in binary_metrics:
        scores[key] = metric.compute(predictions=predictions, references=labels, average='binary')[key]
    return scores

# Initialize the BERT model for the final training
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Process the full datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Format datasets for the model
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Set up training arguments with best hyperparameters
training_args = TrainingArguments(
    output_dir="./results/bert_final",
    learning_rate=best_lr,
    per_device_train_batch_size=best_batch_size,
    per_device_eval_batch_size=best_batch_size,
    num_train_epochs=best_epoch,
    weight_decay=best_weight_decay,
    logging_dir="./logs/bert_final",
    logging_steps=500,
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument.
    eval_strategy="epoch",
    save_strategy="epoch",
    # The only evaluation set in this run is the test set. Picking the "best"
    # checkpoint by test accuracy would select the model on the data used to
    # report its final score, biasing that score upwards. The model is
    # therefore kept as trained for the tuned number of epochs; the per-epoch
    # test metrics are logged for monitoring only.
    load_best_model_at_end=False,
    report_to="tensorboard"
)

# Initialize Trainer for final training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]



In [None]:
# Fine-tune on the full training split and show the training summary.
print("Starting final training...")
train_results = trainer.train()
print(train_results)

# Score the trained model on the trainer's evaluation dataset (the test split).
print("Evaluating on test set...")
eval_results = trainer.evaluate()
print(eval_results)

# Write the model and the tokenizer into the same output directory.
model_path = "./bert_imdb_sentiment"
trainer.save_model(output_dir=model_path)
tokenizer.save_pretrained(save_directory=model_path)
print(f"Model saved to {model_path}")

Starting final training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision
1,0.2248,0.22769,0.92476,0.92774,0.966,0.892395
2,0.1268,0.225532,0.93912,0.939866,0.95152,0.928493
3,0.054,0.274605,0.93964,0.939773,0.94184,0.937714


TrainOutput(global_step=4689, training_loss=0.14637515213881558, metrics={'train_runtime': 1906.6215, 'train_samples_per_second': 39.337, 'train_steps_per_second': 2.459, 'total_flos': 1.9733329152e+16, 'train_loss': 0.14637515213881558, 'epoch': 3.0})
Evaluating on test set...


{'eval_loss': 0.2746053636074066, 'eval_accuracy': 0.93964, 'eval_f1': 0.9397725004989024, 'eval_recall': 0.94184, 'eval_precision': 0.9377140581441656, 'eval_runtime': 152.1002, 'eval_samples_per_second': 164.365, 'eval_steps_per_second': 10.276, 'epoch': 3.0}
Model saved to ./bert_imdb_sentiment
