In [1]:
!pip install datasets evaluate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [3

In [2]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EvalPrediction
)
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Load the SST2 dataset
print("Loading SST2 dataset...")
sst2 = load_dataset("glue", "sst2")
print(sst2)

# Load the pre-trained BERT model fine-tuned on IMDB
model_name = "yyammerrrss/imdb-sft-bert"
print(f"Loading model from {model_name}...")
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Display model architecture
print("Model architecture:")
print(model)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

Loading SST2 dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})
Loading model from yyammerrrss/imdb-sft-bert...


tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Model architecture:
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [5]:
# Prepare the datasets
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)

# Tokenize the datasets for SST2
tokenized_sst2 = {}
for split in sst2:
    tokenized_sst2[split] = sst2[split].map(tokenize_function, batched=True)
    tokenized_sst2[split].set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(f"Train dataset size: {len(tokenized_sst2['train'])}")
print(f"Validation dataset size: {len(tokenized_sst2['validation'])}")

# Check data format
print("\nSample from tokenized validation set:")
print(f"Keys: {list(tokenized_sst2['validation'][0].keys())}")
print(f"Label type: {type(tokenized_sst2['validation'][0]['label'])}")

def tokenize_imdb(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Tokenize the IMDB test dataset
imdb = load_dataset("imdb")
tokenized_imdb_test = imdb["test"].map(tokenize_imdb, batched=True)
tokenized_imdb_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(f"IMDB test dataset size: {len(tokenized_imdb_test)}")


# Define evaluation metric
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Common training arguments for all finetuning experiments
base_training_args = {
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    'num_train_epochs': 3,
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'logging_dir': './logs/',
    'logging_steps': 100,
    'evaluation_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'accuracy',
    'greater_is_better': True,
    'report_to': "none"
}

Train dataset size: 67349
Validation dataset size: 872

Sample from tokenized validation set:
Keys: ['label', 'input_ids', 'attention_mask']
Label type: <class 'torch.Tensor'>


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

IMDB test dataset size: 25000


In [6]:
# Let's do a quick evaluation on SST2 validation set before any fine-tuning to get a baseline for our model's performance
print("Evaluating baseline performance on SST2 validation set...")

# For baseline evaluation, we need to either provide an eval_dataset or change the strategy
baseline_args = base_training_args.copy()

eval_trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./results/baseline', **baseline_args),
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_sst2['validation'],
)

baseline_results = eval_trainer.evaluate(tokenized_sst2['validation'])
print("Baseline performance before domain adaptation:")
print(baseline_results)


Evaluating baseline performance on SST2 validation set...




Baseline performance before domain adaptation:
{'eval_loss': 0.5829965472221375, 'eval_model_preparation_time': 0.0031, 'eval_accuracy': 0.8692660550458715, 'eval_precision': 0.8353658536585366, 'eval_recall': 0.9256756756756757, 'eval_f1': 0.8782051282051282, 'eval_runtime': 3.3928, 'eval_samples_per_second': 257.016, 'eval_steps_per_second': 16.211}


# Finetuning the Head only

In [7]:
# First, reload the model to ensure we start fresh
model_name = "yyammerrrss/imdb-sft-bert"
model = BertForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

# Freeze all parameters in the BERT encoder
for param in model.bert.parameters():
    param.requires_grad = False

# Only the classification head parameters should be trainable
for param in model.classifier.parameters():
    param.requires_grad = True

# Count trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,} ({trainable_params/total_params:.2%} of total)")

# Set up the trainer for head-only finetuning
head_only_training_args = TrainingArguments(
    **base_training_args,
    output_dir="./results/head_only",
)

head_only_trainer = Trainer(
    model=model,
    args=head_only_training_args,
    train_dataset=tokenized_sst2['train'],
    eval_dataset=tokenized_sst2['validation'],
    compute_metrics=compute_metrics,
)

Trainable parameters: 1,538 (0.00% of total)




In [8]:
# Train the model
print("Starting head-only finetuning...")
head_only_trainer.train()

# Evaluate the model
head_only_results = head_only_trainer.evaluate()
print("Head-only finetuning results:")
print(head_only_results)

# Save the finetuned model
head_only_model_path = "./sst2_head_only_finetuned"
head_only_trainer.save_model(head_only_model_path)
print(f"Model saved to {head_only_model_path}")

# Let's track our results for comparison later
adaptation_results = {
    "baseline": baseline_results,
    "head_only": head_only_results,
}

Starting head-only finetuning...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3644,0.320545,0.870413,0.84265,0.916667,0.878101
2,0.3501,0.315226,0.868119,0.839175,0.916667,0.876211
3,0.3687,0.31577,0.866972,0.837449,0.916667,0.875269


Head-only finetuning results:
{'eval_loss': 0.3205454349517822, 'eval_accuracy': 0.8704128440366973, 'eval_precision': 0.8426501035196687, 'eval_recall': 0.9166666666666666, 'eval_f1': 0.8781014023732471, 'eval_runtime': 2.7744, 'eval_samples_per_second': 314.304, 'eval_steps_per_second': 19.824, 'epoch': 3.0}
Model saved to ./sst2_head_only_finetuned


In [9]:
# Create evaluation trainer
imdb_eval_trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./results/head_only_on_imdb",
        per_device_eval_batch_size=16,
        report_to="none"
    ),
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_imdb_test,
)

# Evaluate on IMDB test set
imdb_results = imdb_eval_trainer.evaluate(tokenized_imdb_test)
print(f"head_only model performance on IMDB test set:")
print(imdb_results)

# Store these results for comparison later
adaptation_results["head_only_on_imdb"] = imdb_results

head_only model performance on IMDB test set:
{'eval_loss': 0.17359478771686554, 'eval_model_preparation_time': 0.005, 'eval_accuracy': 0.94116, 'eval_precision': 0.9391574420641873, 'eval_recall': 0.94344, 'eval_f1': 0.94129385002195, 'eval_runtime': 352.4632, 'eval_samples_per_second': 70.929, 'eval_steps_per_second': 4.435}


# Finetuning the Body only

In [11]:
# Reload the model to ensure we start fresh
model_name = "yyammerrrss/imdb-sft-bert"
model = BertForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

# Freeze the classification head parameters
for param in model.classifier.parameters():
    param.requires_grad = False

# Make BERT body parameters trainable
for param in model.bert.parameters():
    param.requires_grad = True

# Count trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,} ({trainable_params/total_params:.2%} of total)")

# We might need a lower learning rate since we're finetuning the body
base_training_args['num_train_epochs'] = 5
body_only_training_args = TrainingArguments(
    **base_training_args,
    output_dir="./results/body_only",
)

body_only_trainer = Trainer(
    model=model,
    args=body_only_training_args,
    train_dataset=tokenized_sst2['train'],
    eval_dataset=tokenized_sst2['validation'],
    compute_metrics=compute_metrics,
)


Trainable parameters: 109,482,240 (100.00% of total)




In [12]:
# Train the model
print("Starting body-only finetuning...")
body_only_trainer.train()

# Evaluate the model
body_only_results = body_only_trainer.evaluate()
print("Body-only finetuning results:")
print(body_only_results)

# Save the finetuned model
body_only_model_path = "./sst2_body_only_finetuned"
body_only_trainer.save_model(body_only_model_path)
print(f"Model saved to {body_only_model_path}")

adaptation_results["body_only"] = body_only_results

Starting body-only finetuning...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1825,0.245273,0.925459,0.933638,0.918919,0.92622
2,0.1148,0.288182,0.924312,0.918142,0.934685,0.926339
3,0.0802,0.323131,0.922018,0.943396,0.900901,0.921659
4,0.0674,0.354955,0.924312,0.925676,0.925676,0.925676
5,0.0254,0.421681,0.927752,0.92809,0.93018,0.929134


Body-only finetuning results:
{'eval_loss': 0.4216805100440979, 'eval_accuracy': 0.9277522935779816, 'eval_precision': 0.9280898876404494, 'eval_recall': 0.9301801801801802, 'eval_f1': 0.9291338582677166, 'eval_runtime': 2.746, 'eval_samples_per_second': 317.555, 'eval_steps_per_second': 20.029, 'epoch': 5.0}
Model saved to ./sst2_body_only_finetuned


In [13]:
# Create evaluation trainer
imdb_eval_trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./results/body_only_on_imdb",
        per_device_eval_batch_size=16,
        report_to="none"
    ),
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_imdb_test,
)

# Evaluate on IMDB test set
imdb_results = imdb_eval_trainer.evaluate(tokenized_imdb_test)
print(f"body_only model performance on IMDB test set:")
print(imdb_results)

# Store these results for comparison later
adaptation_results["body_only_on_imdb"] = imdb_results

body_only model performance on IMDB test set:
{'eval_loss': 0.3591756224632263, 'eval_model_preparation_time': 0.0031, 'eval_accuracy': 0.92908, 'eval_precision': 0.9335542801713685, 'eval_recall': 0.92392, 'eval_f1': 0.9287121547183467, 'eval_runtime': 352.3337, 'eval_samples_per_second': 70.955, 'eval_steps_per_second': 4.436}


#Finetuning the Head and Last 2 layers only

In [14]:
# Reload the model to ensure we start fresh
model_name = "yyammerrrss/imdb-sft-bert"
model = BertForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

# Freeze most of BERT layers
for param in model.bert.parameters():
    param.requires_grad = False

# Unfreeze the last 2 encoder layers
for i in range(10, 12):  # BERT base has 12 layers (0-11)
    for param in model.bert.encoder.layer[i].parameters():
        param.requires_grad = True

# Make classification head trainable
for param in model.classifier.parameters():
    param.requires_grad = True

# Count trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,} ({trainable_params/total_params:.2%} of total)")

# Set up the trainer for partial finetuning
partial_training_args = TrainingArguments(
    **base_training_args,
    output_dir="./results/head_and_last_layers",
)

partial_trainer = Trainer(
    model=model,
    args=partial_training_args,
    train_dataset=tokenized_sst2['train'],
    eval_dataset=tokenized_sst2['validation'],
    compute_metrics=compute_metrics,
)



APPROACH 3: FINETUNING CLASSIFICATION HEAD AND LAST 2 LAYERS
Trainable parameters: 14,177,282 (12.95% of total)




In [15]:
# Train the model
print("Starting partial finetuning (head + last 2 layers)...")
partial_trainer.train()

# Evaluate the model
partial_results = partial_trainer.evaluate()
print("Partial finetuning results:")
print(partial_results)

# Save the finetuned model
partial_model_path = "./sst2_partial_finetuned"
partial_trainer.save_model(partial_model_path)
print(f"Model saved to {partial_model_path}")

# Update our results tracking
adaptation_results["head_and_last_layers"] = partial_results


Starting partial finetuning (head + last 2 layers)...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2477,0.246669,0.908257,0.90625,0.914414,0.910314
2,0.2106,0.250542,0.913991,0.928074,0.900901,0.914286
3,0.2187,0.2604,0.916284,0.922551,0.912162,0.917327
4,0.1931,0.290792,0.911697,0.9161,0.90991,0.912994
5,0.1709,0.293372,0.908257,0.90991,0.90991,0.90991


Partial finetuning results:
{'eval_loss': 0.26040032505989075, 'eval_accuracy': 0.9162844036697247, 'eval_precision': 0.9225512528473804, 'eval_recall': 0.9121621621621622, 'eval_f1': 0.9173272933182333, 'eval_runtime': 2.7579, 'eval_samples_per_second': 316.187, 'eval_steps_per_second': 19.943, 'epoch': 5.0}
Model saved to ./sst2_partial_finetuned


In [16]:
# Create evaluation trainer
imdb_eval_trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./results/partial_finetuned_on_imdb",
        per_device_eval_batch_size=16,
        report_to="none"
    ),
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_imdb_test,
)

# Evaluate on IMDB test set
imdb_results = imdb_eval_trainer.evaluate(tokenized_imdb_test)
print(f"partial_finetuned model performance on IMDB test set:")
print(imdb_results)

# Store these results for comparison later
adaptation_results["partial_finetuned_on_imdb"] = imdb_results

partial_finetuned model performance on IMDB test set:
{'eval_loss': 0.24394738674163818, 'eval_model_preparation_time': 0.0029, 'eval_accuracy': 0.92804, 'eval_precision': 0.926640618770433, 'eval_recall': 0.92968, 'eval_f1': 0.9281578211732758, 'eval_runtime': 352.7768, 'eval_samples_per_second': 70.866, 'eval_steps_per_second': 4.431}


# Comparisons and Conclusion

In [22]:
# Compare all approaches
print("\n\n" + "="*80)
print("COMPARISON OF ALL DOMAIN ADAPTATION APPROACHES")
print("="*80)

# Create a formatted results table
print(f"{'Approach':<25} {'Accuracy':<10} {'F1':<10} {'Precision':<10} {'Recall':<10}")
print("-" * 65)

for approach, results in adaptation_results.items():
    print(f"{approach:<25} {results['eval_accuracy']:<10.4f} {results['eval_f1']:<10.4f} "
          f"{results['eval_precision']:<10.4f} {results['eval_recall']:<10.4f}")

# Determine the best approach based on accuracy
best_approach = max(adaptation_results.items(), key=lambda x: x[1]['eval_accuracy'])[0]
print(f"\nBest approach based on accuracy: {best_approach}")



COMPARISON OF ALL DOMAIN ADAPTATION APPROACHES
Approach                  Accuracy   F1         Precision  Recall    
-----------------------------------------------------------------
baseline                  0.8693     0.8782     0.8354     0.9257    
head_only                 0.8704     0.8781     0.8427     0.9167    
body_only                 0.9232     0.9236     0.9353     0.9122    
body_only_5_epochs        0.9278     0.9291     0.9281     0.9302    
head_and_last_layers      0.9163     0.9173     0.9226     0.9122    
head_and_last_layers_5_epochs 0.9163     0.9173     0.9226     0.9122    

Best approach based on accuracy: body_only_5_epochs
