# 1. Install Dependencies

In [None]:
!pip install datasets transformers evaluate
!apt-get install git-lfs


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


In [2]:
from datasets import load_dataset

# Download the SST-2 configuration of the GLUE benchmark and show its splits.
sst2 = load_dataset(path="glue", name="sst2")
print(sst2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Seed PyTorch and NumPy with the same value so reruns start from the same RNG state
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Checkpoint shared by the tokenizer and the classifier (base, uncased BERT)
CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)

# BERT encoder topped with a 2-class sequence-classification head
model = BertForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=2)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model = model.to(device)


# Tokenization helper for the SST-2 splits
def tokenize_function(examples):
    """Tokenize the "sentence" field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens. Uses the
    notebook-level `tokenizer`.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(
        sentences,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

# Tokenize every SST-2 split in batches, keyed by split name
tokenized_sst2 = {
    split_name: split_data.map(tokenize_function, batched=True)
    for split_name, split_data in sst2.items()
}
# Expose only the model inputs and the label, returned as torch tensors
for tokenized_split in tokenized_sst2.values():
    tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


In [4]:
# Evaluation metrics handed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from a (logits, labels) pair.

    The predicted class is the argmax of the logits over the last axis.
    Precision, recall and F1 use scikit-learn's 'binary' averaging.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    # The fourth element (support) is not reported.
    prf = precision_recall_fscore_support(labels, predicted, average='binary')
    return dict(
        accuracy=accuracy_score(labels, predicted),
        precision=prf[0],
        recall=prf[1],
        f1=prf[2],
    )

# Common training arguments for all finetuning experiments
from dataclasses import fields

# Recent transformers releases name the evaluation-schedule option
# `eval_strategy`; older ones only accept `evaluation_strategy`. Passing a
# spelling the installed version does not know raises a TypeError when
# TrainingArguments is built, so pick the key that the installed
# TrainingArguments dataclass actually declares.
_EVAL_STRATEGY_KEY = (
    'eval_strategy'
    if any(f.name == 'eval_strategy' for f in fields(TrainingArguments))
    else 'evaluation_strategy'
)

base_training_args = {
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    'num_train_epochs': 3,
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'logging_dir': './logs/',
    'logging_steps': 100,
    # Evaluate and checkpoint on the same schedule (once per epoch) so the
    # best checkpoint can be selected and reloaded when training finishes.
    _EVAL_STRATEGY_KEY: 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    # "Best" means highest validation accuracy.
    'metric_for_best_model': 'accuracy',
    'greater_is_better': True,
    # Disable external experiment trackers.
    'report_to': "none"
}

In [5]:
# Fine-tune on the SST-2 training split, with the validation split as the
# evaluation set. `baseline_args` is a separate copy of the shared settings.
baseline_args = dict(base_training_args)

sst2_training_args = TrainingArguments(output_dir='./results/baseline', **baseline_args)
trainer = Trainer(
    model=model,
    args=sst2_training_args,
    train_dataset=tokenized_sst2['train'],
    eval_dataset=tokenized_sst2['validation'],
    compute_metrics=compute_metrics,
)

trainer.train()

# Report the metrics of the trained model on the validation split
results = trainer.evaluate()
print(results)




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1599,0.26776,0.919725,0.936916,0.903153,0.919725
2,0.1068,0.278301,0.925459,0.911063,0.945946,0.928177
3,0.0648,0.328081,0.927752,0.924276,0.934685,0.929451


{'eval_loss': 0.3280811011791229, 'eval_accuracy': 0.9277522935779816, 'eval_precision': 0.9242761692650334, 'eval_recall': 0.9346846846846847, 'eval_f1': 0.9294512877939529, 'eval_runtime': 1.515, 'eval_samples_per_second': 575.581, 'eval_steps_per_second': 36.304, 'epoch': 3.0}


In [6]:
# Log in to the Hugging Face Hub through its CLI; the command prompts for an
# access token interactively. Needed before the push_to_hub call below.
! huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write

In [7]:
# Upload the tokenizer files to the same Hub repository, so the published
# checkpoint can be loaded together with its matching tokenizer.
tokenizer.push_to_hub("bert-sst2")
# Upload the model itself; kept as the last expression so its CommitInfo
# remains the cell's displayed output.
model.push_to_hub("bert-sst2")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yyammerrrss/bert-sst2/commit/bdc490845ed8bfe3033fd6526262e44b19f76d8c', commit_message='Upload BertForSequenceClassification', commit_description='', oid='bdc490845ed8bfe3033fd6526262e44b19f76d8c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yyammerrrss/bert-sst2', endpoint='https://huggingface.co', repo_type='model', repo_id='yyammerrrss/bert-sst2'), pr_revision=None, pr_num=None)

# Testing on IMDB

In [10]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from datasets import load_dataset

# Download the IMDB movie-review dataset (all of its splits)
imdb = load_dataset(path="imdb")

# Tokenization helper for the IMDB splits
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens. Uses the
    notebook-level `tokenizer`.
    """
    reviews = examples["text"]
    encoded = tokenizer(
        reviews,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return encoded

# Tokenize every IMDB split in batches, keyed by split name
tokenized_imdb = {
    split_name: split_data.map(tokenize_function, batched=True)
    for split_name, split_data in imdb.items()
}
# Expose only the model inputs and the label, returned as torch tensors
for tokenized_split in tokenized_imdb.values():
    tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [11]:
# Evaluate the current `model` on the IMDB test split. No training is run
# here; the Trainer is only used as an evaluation harness, reusing the
# settings in `baseline_args`.
eval_trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./results/baseline', **baseline_args),
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_imdb['test'],
)

# evaluate() uses the eval_dataset given to the constructor, so it is not
# passed a second time.
baseline_results = eval_trainer.evaluate()
# Label the metrics so the printed output says what it measures.
print("Baseline performance before domain adaptation:")
print(baseline_results)



Baseline performance before domain adaptation:
{'eval_loss': 0.3772938549518585, 'eval_model_preparation_time': 0.0029, 'eval_accuracy': 0.90404, 'eval_precision': 0.8881715471524095, 'eval_recall': 0.92448, 'eval_f1': 0.9059621339814198, 'eval_runtime': 151.3304, 'eval_samples_per_second': 165.201, 'eval_steps_per_second': 10.328}
