# Transformer (DistilBERT) Hyperparameter Tuning with Bayesian Optimization

In [1]:
import numpy as np
import pandas as pd
import time
import tensorflow as tf
import tf_keras
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, DistilBertConfig
from sklearn.model_selection import train_test_split

# Report the runtime environment so the recorded outputs can be traced to a
# specific TensorFlow build and GPU count.
print(f"TensorFlow: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPUs: {len(gpu_devices)}")

2025-12-13 14:37:47.250196: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-13 14:37:47.304977: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-13 14:37:56.191752: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


TensorFlow: 2.20.0
GPUs: 1


## Load Data

In [2]:
# --- Configuration -----------------------------------------------------------
MODEL_NAME = 'distilbert-base-uncased'
MAX_LEN = 250  # Fixed
TEST_SIZE = 0.2
SEED = 42  # single seed for the data split and the NumPy / TensorFlow global RNGs

# Seed the global RNGs before any model is built. This narrows run-to-run
# variation; GPU kernels may still introduce some non-determinism.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# --- Load and split ----------------------------------------------------------
print("Loading data...")
dataset = pd.read_csv('data/imdb_dataset.csv')

# Fail fast on schema problems instead of failing later during tokenization.
missing_columns = {'review', 'sentiment'} - set(dataset.columns)
if missing_columns:
    raise KeyError(f"data/imdb_dataset.csv is missing columns: {sorted(missing_columns)}")

# The label encoding below maps everything that is not exactly 'positive' to 0,
# so an unexpected label (typo, different casing, NaN) would silently become
# "negative". Reject such rows loudly instead.
unexpected_labels = set(dataset['sentiment'].unique()) - {'positive', 'negative'}
if unexpected_labels:
    raise ValueError(f"Unexpected sentiment labels: {sorted(map(str, unexpected_labels))}")

train_df, test_df = train_test_split(dataset, test_size=TEST_SIZE, random_state=SEED)

# Binary targets: 1 = positive review, 0 = negative review.
y_train = (train_df['sentiment'] == 'positive').astype(int).values
y_test = (test_df['sentiment'] == 'positive').astype(int).values

print(f"Train: {len(train_df)}, Test: {len(test_df)}")

Loading data...
Train: 40000, Test: 10000


## Tokenize Data

In [3]:
print(f"Tokenizing with MAX_LEN={MAX_LEN}...")
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Both splits are encoded with exactly the same settings, so the options are
# declared once and shared by the two tokenizer calls.
encode_options = dict(
    max_length=MAX_LEN,
    truncation=True,
    padding=True,
    return_tensors='tf',
)

X_train = tokenizer(train_df['review'].tolist(), **encode_options)
X_test = tokenizer(test_df['review'].tolist(), **encode_options)

print("Tokenization complete")

Tokenizing with MAX_LEN=250...


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
I0000 00:00:1765629564.966494   10703 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5518 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


Tokenization complete


## Keras Tuner Setup

In [4]:
# !pip install keras-tuner

import keras_tuner as kt
from keras_tuner import BayesianOptimization
from tf_keras.callbacks import EarlyStopping

print("Keras Tuner imported")

Keras Tuner imported


## Tunable Transformer Model Builder

Tuning: Learning Rate, Dropout, Attention Dropout, Weight Decay, Batch Size

In [5]:
def build_tunable_transformer(hp):
    """Build and compile a DistilBERT sequence classifier for one tuner trial.

    Registers five hyperparameters on ``hp`` (learning rate, dropout,
    attention dropout, weight decay, batch size), loads ``MODEL_NAME`` with
    the two dropout values written into its config, and compiles it with
    AdamW and a sparse categorical cross-entropy loss on logits.

    Args:
        hp: Keras Tuner ``HyperParameters`` object for the current trial.

    Returns:
        The compiled ``TFDistilBertForSequenceClassification`` model.

    Side effects:
        Stores the chosen batch size under ``hp.values['_batch_size']`` so the
        training loop can read it back from the same ``hp`` object.
    """
    # Search space, registered in a fixed order: optimizer step size
    # (log-sampled), two dropout rates, AdamW weight decay, batch size.
    lr = hp.Float('learning_rate', min_value=1e-5, max_value=5e-5,
                  sampling='log', default=2e-5)
    hidden_dropout = hp.Float('dropout', min_value=0.05, max_value=0.3,
                              step=0.05, default=0.1)
    attn_dropout = hp.Float('attention_dropout', min_value=0.05, max_value=0.3,
                            step=0.05, default=0.1)
    decay = hp.Float('weight_decay', min_value=0.001, max_value=0.02,
                     step=0.001, default=0.01)

    # Batch size is not consumed by the model itself; it is stashed on the
    # hyperparameter values for whoever calls fit().
    hp.values['_batch_size'] = hp.Choice('batch_size', values=[8, 16, 32], default=16)

    # Start from the pretrained config and override dropout and label count.
    bert_config = DistilBertConfig.from_pretrained(MODEL_NAME)
    bert_config.dropout = hidden_dropout
    bert_config.attention_dropout = attn_dropout
    bert_config.num_labels = 2  # two output classes

    classifier = TFDistilBertForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config, use_safetensors=False
    )

    # AdamW from tf_keras (Adam with weight decay).
    adamw = tf_keras.optimizers.AdamW(learning_rate=lr, weight_decay=decay)
    classifier.compile(
        optimizer=adamw,
        loss=tf_keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )
    return classifier

print("Model builder ready")

Model builder ready


## Configure Bayesian Optimization Tuner

In [6]:
# Custom tuner to handle variable batch size
class TransformerTuner(BayesianOptimization):
    """Bayesian-optimization tuner that passes each trial's batch size to fit().

    ``run_trial`` is overridden so the ``batch_size`` hyperparameter sampled
    for a trial is used when training that trial's model.
    """

    def run_trial(self, trial, *args, **kwargs):
        """Build, train and score the model for a single trial.

        Args:
            trial: Keras Tuner trial; its ``hyperparameters`` drive the build.
            *args: Unused; accepted so ``tuner.search`` can forward positionals.
            **kwargs: Optional training settings forwarded by ``tuner.search``:
                ``epochs`` (default 3), ``verbose`` (default 1), ``callbacks``
                (default: none) and ``validation_data`` (default: the
                notebook's tokenized test split ``X_test`` / ``y_test``).

        Returns:
            The highest ``val_accuracy`` reached during training.

        Raises:
            KeyError: if the hypermodel did not register ``batch_size``.
        """
        import copy  # local import keeps this cell self-contained

        hp = trial.hyperparameters

        # Build model
        model = self.hypermodel.build(hp)

        # Read the batch size through the public accessor. The hypermodel
        # registers 'batch_size' via hp.Choice, so this raises if it is
        # missing rather than silently training with a fallback value.
        batch_size = hp.get('batch_size')

        # Give every trial its own callback instances so state (best weights,
        # attached model) is not carried from one trial into the next.
        callbacks = copy.deepcopy(kwargs.get('callbacks') or [])

        validation_data = kwargs.get('validation_data')
        if validation_data is None:
            # NOTE(review): the default scores trials on the test split, so
            # hyperparameters are selected on the same data later used for the
            # final accuracy. Pass validation_data=(X_val, y_val) to
            # tuner.search() to select on a separate hold-out set.
            validation_data = (
                {'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']},
                y_test
            )

        # Train
        history = model.fit(
            {'input_ids': X_train['input_ids'], 'attention_mask': X_train['attention_mask']},
            y_train,
            batch_size=batch_size,
            epochs=kwargs.get('epochs', 3),
            validation_data=validation_data,
            verbose=kwargs.get('verbose', 1),
            callbacks=callbacks
        )

        # A single float is reported as the value of the tuner's objective.
        return max(history.history['val_accuracy'])

# TEST MODE: short smoke run with 2 trials (set max_trials to 30 for the full search).
tuner_options = dict(
    hypermodel=build_tunable_transformer,
    objective='val_accuracy',
    max_trials=2,                           # TEST: raise to 30 for the full search
    executions_per_trial=1,
    directory='tuner_results',
    project_name='transformer_tuning_test', # separate project name for test runs
    overwrite=True,                         # discard results of earlier test runs
    seed=42,
)
tuner = TransformerTuner(**tuner_options)

for status_line in (
    "TEST MODE: Tuner configured for 2 trials only",
    "Tuning: LEARNING_RATE, DROPOUT, ATTENTION_DROPOUT, WEIGHT_DECAY, BATCH_SIZE",
    "To run full search: Change max_trials=2 to max_trials=30",
):
    print(status_line)

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-

TEST MODE: Tuner configured for 2 trials only
Tuning: LEARNING_RATE, DROPOUT, ATTENTION_DROPOUT, WEIGHT_DECAY, BATCH_SIZE
To run full search: Change max_trials=2 to max_trials=30


## Early Stopping

In [7]:
# Stop training once validation accuracy has gone one epoch without improving,
# and roll the model back to the weights of its best epoch.
early_stopping_options = dict(
    monitor='val_accuracy',
    mode='max',
    patience=1,
    restore_best_weights=True,
    verbose=1,
)
early_stopping = EarlyStopping(**early_stopping_options)

print("Early stopping configured (patience=1)")

Early stopping configured (patience=1)


## Execute Hyperparameter Search

**TEST MODE:** Running 2 trials (~10-20 minutes)

For full search: Change max_trials to 30 (3-6 hours)

In [None]:
# Banner describing the run that is about to start.
rule = "=" * 80
for banner_line in (
    rule,
    "TEST MODE: TRANSFORMER HYPERPARAMETER SEARCH (2 trials)",
    f"Training set: {len(train_df)}, Test set: {len(test_df)}",
    "Max epochs per trial: 3",
    "Estimated time: 10-20 minutes",
    rule,
):
    print(banner_line)

# Time the whole search; the settings are forwarded to each trial's training.
search_options = dict(epochs=3, callbacks=[early_stopping], verbose=1)

search_start = time.time()
tuner.search(**search_options)
search_time = time.time() - search_start

print(f"\nSearch complete: {search_time/60:.1f} minutes")

Trial 1 Complete [00h 00m 20s]

Best val_accuracy So Far: None
Total elapsed time: 00h 00m 20s

Search: Running Trial #2

Value             |Best Value So Far |Hyperparameter
1.7617e-05        |2.7986e-05        |learning_rate
0.15              |0                 |dropout
0                 |0.1               |attention_dropout
0.005             |0.002             |weight_decay
8                 |32                |batch_size



Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_layer_norm', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use i

Epoch 1/3


2025-12-13 14:40:10.469070: I external/local_xla/xla/service/service.cc:163] XLA service 0x7cce51ebc430 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-12-13 14:40:10.469100: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Laptop GPU, Compute Capability 8.9
2025-12-13 14:40:10.486218: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-12-13 14:40:10.531811: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
I0000 00:00:1765629610.632305   10795 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


 641/5000 [==>...........................] - ETA: 9:11 - loss: 0.3840 - accuracy: 0.8264

## Best Hyperparameters

In [None]:
separator = "=" * 80
print(separator)
print("BEST HYPERPARAMETERS:")
print(separator)

# Highest-scoring hyperparameter set found by the search.
best_hps = tuner.get_best_hyperparameters(1)[0]

best_lr = best_hps.get('learning_rate')
best_dropout = best_hps.get('dropout')
best_attention_dropout = best_hps.get('attention_dropout')
best_weight_decay = best_hps.get('weight_decay')
best_batch_size = best_hps.get('batch_size')

print(f"Learning Rate: {best_lr:.6f}")
print(f"Dropout: {best_dropout:.3f}")
print(f"Attention Dropout: {best_attention_dropout:.3f}")
print(f"Weight Decay: {best_weight_decay:.4f}")
print(f"Batch Size: {best_batch_size}")

# Side-by-side with the search-space defaults (LR 2e-05, dropout 0.1, batch 16).
print("\nComparison with original:")
print(f"  LR: 2e-05 -> {best_lr:.6f}")
print(f"  Dropout: 0.1 -> {best_dropout:.3f}")
print(f"  Batch: 16 -> {best_batch_size}")

## Top 5 Configurations

In [None]:
print("TOP 5 CONFIGURATIONS:")

# Walk the oracle's best trials, best first, printing score and settings.
top_trials = tuner.oracle.get_best_trials(5)
for rank, top_trial in enumerate(top_trials, start=1):
    value_of = top_trial.hyperparameters.get
    print(f"\n#{rank}: Accuracy={top_trial.score:.4f}")
    print(f"  LR={value_of('learning_rate'):.6f}, Dropout={value_of('dropout'):.2f}, ")
    print(f"  AttnDrop={value_of('attention_dropout'):.2f}, WeightDecay={value_of('weight_decay'):.4f}, Batch={value_of('batch_size')}")

## Visualization

In [None]:
import matplotlib.pyplot as plt

# One record per trial that produced a score; trials without a score are skipped.
trial_columns = [
    'trial_id', 'val_accuracy', 'learning_rate', 'dropout',
    'attention_dropout', 'weight_decay', 'batch_size',
]
trial_data = []
for trial in tuner.oracle.trials.values():
    if trial.score is None:
        continue
    hp = trial.hyperparameters
    trial_data.append({
        'trial_id': trial.trial_id,
        'val_accuracy': trial.score,
        'learning_rate': hp.get('learning_rate'),
        'dropout': hp.get('dropout'),
        'attention_dropout': hp.get('attention_dropout'),
        'weight_decay': hp.get('weight_decay'),
        'batch_size': hp.get('batch_size')
    })

# Passing explicit columns keeps `df` well-formed (and usable by later cells)
# even when no trial produced a score.
df = pd.DataFrame(trial_data, columns=trial_columns)

if df.empty:
    # Without this guard the column lookups below raise KeyError.
    print("No scored trials available - run the hyperparameter search first.")
else:
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    fig.suptitle('Transformer Hyperparameter Tuning Results', fontsize=16)

    # 1. Progress
    axes[0, 0].plot(df['trial_id'], df['val_accuracy'], 'o-')
    axes[0, 0].set_title('Accuracy Progress')
    axes[0, 0].set_xlabel('Trial')
    axes[0, 0].set_ylabel('Validation Accuracy')
    axes[0, 0].grid(alpha=0.3)

    # 2-5. One scatter panel per continuous hyperparameter.
    scatter_panels = [
        (axes[0, 1], 'learning_rate', 'Learning Rate'),
        (axes[0, 2], 'dropout', 'Dropout'),
        (axes[1, 0], 'attention_dropout', 'Attention Dropout'),
        (axes[1, 1], 'weight_decay', 'Weight Decay'),
    ]
    for ax, column, label in scatter_panels:
        ax.scatter(df[column], df['val_accuracy'], s=100, alpha=0.6)
        ax.set_title(f'{label} vs Accuracy')
        ax.set_xlabel(label)
        ax.set_ylabel('Validation Accuracy')
        ax.grid(alpha=0.3)
    # The learning rate is sampled on a log scale, so plot it on one.
    axes[0, 1].set_xscale('log')

    # 6. Batch Size: mean and best accuracy per batch size.
    (
        df.groupby('batch_size')['val_accuracy']
        .agg(['mean', 'max'])
        .plot(kind='bar', ax=axes[1, 2], rot=0)
    )
    axes[1, 2].set_title('Performance by Batch Size')
    axes[1, 2].set_xlabel('Batch Size')
    axes[1, 2].set_ylabel('Validation Accuracy')

    plt.tight_layout()
    plt.show()

    print(f"\nBest: {df['val_accuracy'].max():.4f}, Mean: {df['val_accuracy'].mean():.4f}")

## Train Final Model

In [None]:
print("Training final model with best hyperparameters...")

# Build the final model with the same builder the search used, so its
# architecture, optimizer and loss cannot drift from what was tuned.
# Hyperparameters already present in `best_hps` keep their stored values.
final_model = build_tunable_transformer(best_hps)

# Aliases kept for anything that refers to these names after this cell.
config = final_model.config
final_optimizer = final_model.optimizer

final_batch_size = best_hps.get('batch_size')

# Shared input layout for fit() and evaluate().
train_inputs = {'input_ids': X_train['input_ids'], 'attention_mask': X_train['attention_mask']}
test_inputs = {'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']}

# mode='max' is stated explicitly, matching the search-time early stopping.
final_early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    restore_best_weights=True,
    mode='max'
)

# NOTE(review): the test split serves both as the early-stopping validation set
# and as the evaluation set below, so the reported accuracy is measured on data
# that influenced which epoch's weights were kept. Consider a separate
# validation split if an unbiased test score is needed.
final_history = final_model.fit(
    train_inputs,
    y_train,
    batch_size=final_batch_size,
    epochs=5,
    validation_data=(test_inputs, y_test),
    callbacks=[final_early_stopping],
    verbose=1
)

final_loss, final_acc = final_model.evaluate(
    test_inputs,
    y_test,
    batch_size=final_batch_size
)

print(f"\nFinal Accuracy: {final_acc:.4f}")
print(f"Final Loss: {final_loss:.4f}")

## Save Results

In [None]:
import json
import os
from datetime import datetime

os.makedirs('models', exist_ok=True)

# Summary of the tuning run. `max_length` comes from the MAX_LEN constant used
# for tokenization so the record cannot disagree with the actual run.
results = {
    'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'model': 'DistilBERT',
    'max_length': MAX_LEN,
    'total_trials': len(df),
    # Explicit float()/int() casts keep the values JSON-serializable.
    'best_hyperparameters': {
        'learning_rate': float(best_hps.get('learning_rate')),
        'dropout': float(best_hps.get('dropout')),
        'attention_dropout': float(best_hps.get('attention_dropout')),
        'weight_decay': float(best_hps.get('weight_decay')),
        'batch_size': int(best_hps.get('batch_size'))
    },
    'final_accuracy': float(final_acc),
    'final_loss': float(final_loss)
}

# Explicit encoding so the file is written the same way on every platform.
with open('transformer_tuning_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

# Model weights and tokenizer go to the same directory.
final_model.save_pretrained('models/distilbert_tuned')
tokenizer.save_pretrained('models/distilbert_tuned')
df.to_csv('transformer_tuning_history.csv', index=False)

print('Saved: transformer_tuning_results.json')
print('Saved: models/distilbert_tuned/')
print('Saved: transformer_tuning_history.csv')

## Test Predictions

In [None]:
# Two hand-written reviews as a quick sanity check of the trained model.
test_texts = [
    "This was the best movie I have ever seen!",
    "I really hated this film. It was slow and boring."
]

# Encode with the same MAX_LEN used for training rather than a repeated literal.
test_enc = tokenizer(test_texts, max_length=MAX_LEN, truncation=True, padding=True, return_tensors='tf')
preds = final_model.predict({'input_ids': test_enc['input_ids'], 'attention_mask': test_enc['attention_mask']})

# Convert the logits to class probabilities once for the whole batch.
probabilities = tf.nn.softmax(preds.logits, axis=-1).numpy()

print("\nPredictions:")
for text, pred, probs in zip(test_texts, preds.logits, probabilities):
    # Index 1 is the positive class (labels were encoded as positive -> 1).
    sentiment = "Positive" if pred[1] > pred[0] else "Negative"
    conf = probs.max()
    print(f"{text[:50]}... -> {sentiment} ({conf:.4f})")