In [1]:
!pip install transformers datasets torch evaluate -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/472.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m399.4/472.7 kB[0m [31m12.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
import evaluate
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np

## DistilBERT on GLUE Tasks

In [None]:
GLUE_TASKS = ["cola", "mnli", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]

In [None]:
task_to_keys = {
    "sst2": ("sentence", None),
    "cola": ("sentence", None),
    "mrpc": ("sentence1", "sentence2"),
    "stsb": ("sentence1", "sentence2"),
    "qqp": ("question1", "question2"),
    "qnli": ("question", "sentence"),
    "mnli": ("premise", "hypothesis"),
    "rte": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

In [None]:
def train_and_evaluate(task):

    # Step 1: Load the dataset
    dataset = load_dataset("glue", task)

    num_labels = 3 if task == "mnli" else 1 if task =="stsb" else 2
    batch_size = 32 if task in ["qqp", "mnli", "qnli"] else 16
    max_length = 128

    # Step 2: Load the tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    sentence1_key, sentence2_key = task_to_keys[task]

    # Step 3: Preprocess the dataset
    def preprocess_function(examples):
        if sentence2_key is None:
            return tokenizer(examples[sentence1_key], truncation=True, padding="max_length", max_length=max_length)
        return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, padding="max_length", max_length=max_length)

    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    # Step 4: Load pre-trained model with a classification head
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

    # Step 6: Load metric using the `evaluate` library
    metric = evaluate.load("glue", task)

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        # Convert logits to a PyTorch tensor
        logits = torch.tensor(logits)
        predictions = logits[:, 0] if task == "stsb" else torch.argmax(logits, dim=-1)
        return metric.compute(predictions=predictions, references=labels)

    # Define a list of seeds for 5 different runs
    seeds = [42, 1234, 100, 456, 789]

    # Store results from each run
    all_results = []

    # Loop for 5 runs with different seeds
    for seed in seeds:
        print(f"Running with seed: {seed}")

        # Step 5: Define training arguments (update with the current seed)
        training_args = TrainingArguments(
          output_dir=f"./results_{task}",
          eval_strategy="epoch",
          save_strategy = "epoch",
          learning_rate=5e-5,
          per_device_train_batch_size=batch_size,
          per_device_eval_batch_size=batch_size,
          num_train_epochs=3,
          warmup_ratio=0.1,
          weight_decay=0.01,
          logging_dir=f"./logs_{task}_seed_{seed}",
          logging_steps=100,
          gradient_accumulation_steps=1,
          max_grad_norm=1.0,
          save_total_limit=2,
          fp16=True,
          seed=seed,
          load_best_model_at_end=True,
          report_to=[],
        )

        validation_key = "validation_matched" if task == "mnli" else "validation"
        # Step 7: Set up the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset[validation_key],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
          )

        # Step 8: Train the model
        trainer.train()

        # Evaluate the model and store results
        results = trainer.evaluate()
        print(f"Results for {task.capitalize()} with seed {seed}: {results}")

        # Get the appropriate metric key based on the task
        metric_key = "eval_pearson" if task == "stsb" else "eval_matthews_correlation" if task == "cola" else "eval_accuracy"

        # Append the accuracy to the results list
        all_results.append(results[metric_key])

    # Calculate the median accuracy from all runs
    median_accuracy = np.median(all_results)

    # Print the final median result
    print(f"DistilBERT results are the medians of 5 runs with different seeds: {median_accuracy}")

In [None]:
# Loop through each task and run training/evaluation
for task in GLUE_TASKS:
    print(f"Training and evaluating {task.upper()}...")
    train_and_evaluate(task)


Training and evaluating COLA...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/251k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.4891,0.486559,0.406366
2,0.3147,0.522503,0.470132
3,0.166,0.709664,0.498608


Results for Cola with seed 42: {'eval_loss': 0.48655861616134644, 'eval_matthews_correlation': 0.4063664761456974, 'eval_runtime': 0.7469, 'eval_samples_per_second': 1396.473, 'eval_steps_per_second': 88.367, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3339,0.507532,0.472848
2,0.2074,0.555143,0.496703
3,0.1322,0.885839,0.505905


Results for Cola with seed 1234: {'eval_loss': 0.5075321793556213, 'eval_matthews_correlation': 0.4728475040113381, 'eval_runtime': 0.7657, 'eval_samples_per_second': 1362.198, 'eval_steps_per_second': 86.199, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.2608,0.641342,0.49947
2,0.1746,0.944342,0.475137
3,0.0856,0.967929,0.51764


Results for Cola with seed 100: {'eval_loss': 0.6413422226905823, 'eval_matthews_correlation': 0.4994701324058332, 'eval_runtime': 0.7563, 'eval_samples_per_second': 1379.005, 'eval_steps_per_second': 87.262, 'epoch': 3.0}
Running with seed: 456


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.193,0.974146,0.448129
2,0.1321,1.002907,0.457567
3,0.0589,1.186214,0.480953


Results for Cola with seed 456: {'eval_loss': 0.9741463661193848, 'eval_matthews_correlation': 0.4481287541235995, 'eval_runtime': 0.807, 'eval_samples_per_second': 1292.495, 'eval_steps_per_second': 81.788, 'epoch': 3.0}
Running with seed: 789


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.1469,0.866475,0.458402
2,0.1117,1.042787,0.462504
3,0.0483,1.313485,0.489444


Results for Cola with seed 789: {'eval_loss': 0.8664747476577759, 'eval_matthews_correlation': 0.4584020596329, 'eval_runtime': 0.7612, 'eval_samples_per_second': 1370.12, 'eval_steps_per_second': 86.7, 'epoch': 3.0}
DistilBERT results are the medians of 5 runs with different seeds: 0.4584020596329
Training and evaluating MNLI...


train-00000-of-00001.parquet:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

(…)alidation_matched-00000-of-00001.parquet:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

(…)dation_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

test_matched-00000-of-00001.parquet:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

test_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]



Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5297,0.500738,0.800509
2,0.3862,0.480404,0.818441
3,0.2765,0.559721,0.821294


Results for Mnli with seed 42: {'eval_loss': 0.48040369153022766, 'eval_accuracy': 0.8184411614875191, 'eval_runtime': 4.2663, 'eval_samples_per_second': 2300.585, 'eval_steps_per_second': 71.959, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3629,0.554357,0.805502
2,0.2282,0.588536,0.812124
3,0.1282,0.772928,0.819052


Results for Mnli with seed 1234: {'eval_loss': 0.5543566346168518, 'eval_accuracy': 0.8055017829852267, 'eval_runtime': 4.2857, 'eval_samples_per_second': 2290.158, 'eval_steps_per_second': 71.633, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2772,0.557221,0.808558
2,0.1854,0.661923,0.811513
3,0.0982,0.921295,0.813245


Results for Mnli with seed 100: {'eval_loss': 0.5572211146354675, 'eval_accuracy': 0.8085583290881304, 'eval_runtime': 4.2992, 'eval_samples_per_second': 2282.968, 'eval_steps_per_second': 71.408, 'epoch': 3.0}
Running with seed: 456


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2297,0.647162,0.803057
2,0.1473,0.72914,0.810596
3,0.0684,1.033484,0.814264


Results for Mnli with seed 456: {'eval_loss': 0.6471623182296753, 'eval_accuracy': 0.8030565461029037, 'eval_runtime': 4.2938, 'eval_samples_per_second': 2285.853, 'eval_steps_per_second': 71.498, 'epoch': 3.0}
Running with seed: 789


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2059,0.659684,0.802649
2,0.1106,0.898452,0.807132
3,0.0728,1.144512,0.811004


Results for Mnli with seed 789: {'eval_loss': 0.6596836447715759, 'eval_accuracy': 0.8026490066225166, 'eval_runtime': 4.312, 'eval_samples_per_second': 2276.196, 'eval_steps_per_second': 71.196, 'epoch': 3.0}
DistilBERT results are the medians of 5 runs with different seeds: 0.8055017829852267
Training and evaluating MRPC...


train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]



Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5009,0.395981,0.801471,0.85192
2,0.315,0.349854,0.840686,0.889267
3,0.1703,0.502075,0.85049,0.895726


Results for Mrpc with seed 42: {'eval_loss': 0.3498542606830597, 'eval_accuracy': 0.8406862745098039, 'eval_f1': 0.889267461669506, 'eval_runtime': 0.3158, 'eval_samples_per_second': 1292.161, 'eval_steps_per_second': 82.344, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2194,0.574973,0.845588,0.894472
2,0.1221,0.801021,0.848039,0.893471
3,0.0625,0.79351,0.840686,0.888508


Results for Mrpc with seed 1234: {'eval_loss': 0.5749725103378296, 'eval_accuracy': 0.8455882352941176, 'eval_f1': 0.8944723618090452, 'eval_runtime': 0.3214, 'eval_samples_per_second': 1269.363, 'eval_steps_per_second': 80.891, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1318,0.642359,0.85049,0.892416
2,0.0683,0.883539,0.838235,0.888889
3,0.0389,0.824227,0.852941,0.896194


Results for Mrpc with seed 100: {'eval_loss': 0.6423591375350952, 'eval_accuracy': 0.8504901960784313, 'eval_f1': 0.892416225749559, 'eval_runtime': 0.3172, 'eval_samples_per_second': 1286.25, 'eval_steps_per_second': 81.967, 'epoch': 3.0}
Running with seed: 456


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1225,1.043202,0.811275,0.874388
2,0.0552,0.873262,0.852941,0.896552
3,0.0237,0.972302,0.845588,0.893401


Results for Mrpc with seed 456: {'eval_loss': 0.873261570930481, 'eval_accuracy': 0.8529411764705882, 'eval_f1': 0.896551724137931, 'eval_runtime': 0.3128, 'eval_samples_per_second': 1304.319, 'eval_steps_per_second': 83.118, 'epoch': 3.0}
Running with seed: 789


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0719,0.992376,0.845588,0.893401
2,0.0331,0.996702,0.835784,0.880143
3,0.0249,0.977945,0.843137,0.889655


Results for Mrpc with seed 789: {'eval_loss': 0.9779454469680786, 'eval_accuracy': 0.8431372549019608, 'eval_f1': 0.8896551724137931, 'eval_runtime': 0.3316, 'eval_samples_per_second': 1230.462, 'eval_steps_per_second': 78.412, 'epoch': 3.0}
DistilBERT results are the medians of 5 runs with different seeds: 0.8455882352941176
Training and evaluating QNLI...


train-00000-of-00001.parquet:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/872k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/877k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/104743 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5463 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5463 [00:00<?, ? examples/s]



Map:   0%|          | 0/104743 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/5463 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/5463 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3484,0.287042,0.882848
2,0.2343,0.281746,0.889804
3,0.1378,0.333308,0.891085


Results for Qnli with seed 42: {'eval_loss': 0.2817458212375641, 'eval_accuracy': 0.8898041369211056, 'eval_runtime': 2.4137, 'eval_samples_per_second': 2263.368, 'eval_steps_per_second': 70.847, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1886,0.331417,0.881384
2,0.1165,0.423207,0.884679
3,0.0599,0.561991,0.884496


Results for Qnli with seed 1234: {'eval_loss': 0.3314168155193329, 'eval_accuracy': 0.8813838550247117, 'eval_runtime': 2.5309, 'eval_samples_per_second': 2158.552, 'eval_steps_per_second': 67.566, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1338,0.426412,0.871682
2,0.0976,0.478202,0.875892
3,0.0338,0.652432,0.882299


Results for Qnli with seed 100: {'eval_loss': 0.4264117479324341, 'eval_accuracy': 0.8716822258832143, 'eval_runtime': 2.4517, 'eval_samples_per_second': 2228.262, 'eval_steps_per_second': 69.748, 'epoch': 3.0}
Running with seed: 456


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.111,0.496331,0.870767
2,0.0605,0.572426,0.881201
3,0.0188,0.710053,0.881201


Results for Qnli with seed 456: {'eval_loss': 0.4963311553001404, 'eval_accuracy': 0.8707669778509977, 'eval_runtime': 2.4086, 'eval_samples_per_second': 2268.105, 'eval_steps_per_second': 70.995, 'epoch': 3.0}
Running with seed: 789


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1021,0.5378,0.872414
2,0.0533,0.625927,0.878089
3,0.014,0.828853,0.87937


Results for Qnli with seed 789: {'eval_loss': 0.5378004908561707, 'eval_accuracy': 0.8724144243089877, 'eval_runtime': 2.4421, 'eval_samples_per_second': 2236.968, 'eval_steps_per_second': 70.02, 'epoch': 3.0}
DistilBERT results are the medians of 5 runs with different seeds: 0.8724144243089877
Training and evaluating QQP...


train-00000-of-00001.parquet:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/3.73M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/36.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/363846 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/40430 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/390965 [00:00<?, ? examples/s]



Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/40430 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/390965 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2879,0.271536,0.884492,0.848051
2,0.1894,0.251229,0.898565,0.866612
3,0.1291,0.29603,0.904526,0.872211


Results for Qqp with seed 42: {'eval_loss': 0.2512287497520447, 'eval_accuracy': 0.8985654217165471, 'eval_f1': 0.8666124573101317, 'eval_runtime': 17.4829, 'eval_samples_per_second': 2312.545, 'eval_steps_per_second': 72.299, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1749,0.256355,0.896043,0.862427
2,0.1088,0.314597,0.90277,0.869873
3,0.0576,0.42162,0.902943,0.869799


Results for Qqp with seed 1234: {'eval_loss': 0.2563547194004059, 'eval_accuracy': 0.8960425426663369, 'eval_f1': 0.8624267618081242, 'eval_runtime': 17.4694, 'eval_samples_per_second': 2314.336, 'eval_steps_per_second': 72.355, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1382,0.310205,0.897601,0.86324
2,0.0904,0.361172,0.901088,0.869215
3,0.0444,0.49768,0.902226,0.868989


Results for Qqp with seed 100: {'eval_loss': 0.3102051019668579, 'eval_accuracy': 0.8976007914914668, 'eval_f1': 0.8632399577167019, 'eval_runtime': 17.8692, 'eval_samples_per_second': 2262.552, 'eval_steps_per_second': 70.736, 'epoch': 3.0}
Running with seed: 456


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1137,0.361887,0.895177,0.862017


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1137,0.361887,0.895177,0.862017
2,0.0867,0.422811,0.901163,0.867586
3,0.0446,0.541683,0.903438,0.869858


Results for Qqp with seed 456: {'eval_loss': 0.36188724637031555, 'eval_accuracy': 0.895176848874598, 'eval_f1': 0.862017321091359, 'eval_runtime': 17.8002, 'eval_samples_per_second': 2271.326, 'eval_steps_per_second': 71.011, 'epoch': 3.0}
Running with seed: 789


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1083,0.382236,0.895226,0.860327
2,0.0617,0.440299,0.895968,0.862764
3,0.0283,0.58668,0.901261,0.868129


Results for Qqp with seed 789: {'eval_loss': 0.3822355568408966, 'eval_accuracy': 0.8952263170912689, 'eval_f1': 0.8603270904774466, 'eval_runtime': 17.8841, 'eval_samples_per_second': 2260.668, 'eval_steps_per_second': 70.677, 'epoch': 3.0}
DistilBERT results are the medians of 5 runs with different seeds: 0.8960425426663369
Training and evaluating RTE...


train-00000-of-00001.parquet:   0%|          | 0.00/584k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/69.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/621k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6939,0.695552,0.472924
2,0.6284,0.656391,0.595668
3,0.3921,0.788108,0.620939


Results for Rte with seed 42: {'eval_loss': 0.6563912630081177, 'eval_accuracy': 0.5956678700361011, 'eval_runtime': 0.2247, 'eval_samples_per_second': 1232.874, 'eval_steps_per_second': 80.115, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4305,0.797531,0.610108
2,0.206,1.333692,0.606498
3,0.1012,1.776001,0.606498


Results for Rte with seed 1234: {'eval_loss': 0.7975314259529114, 'eval_accuracy': 0.6101083032490975, 'eval_runtime': 0.2321, 'eval_samples_per_second': 1193.278, 'eval_steps_per_second': 77.542, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.227,1.009736,0.606498
2,0.1671,1.670562,0.606498
3,0.0667,1.951416,0.631769


Results for Rte with seed 100: {'eval_loss': 1.0097357034683228, 'eval_accuracy': 0.6064981949458483, 'eval_runtime': 0.2268, 'eval_samples_per_second': 1221.353, 'eval_steps_per_second': 79.366, 'epoch': 3.0}
Running with seed: 456


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1528,1.656199,0.613718
2,0.1073,2.008191,0.610108
3,0.0422,2.237496,0.592058


Results for Rte with seed 456: {'eval_loss': 1.6561990976333618, 'eval_accuracy': 0.6137184115523465, 'eval_runtime': 0.2347, 'eval_samples_per_second': 1180.186, 'eval_steps_per_second': 76.691, 'epoch': 3.0}
Running with seed: 789


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1297,2.274474,0.592058
2,0.058,2.303484,0.595668
3,0.0238,2.591018,0.581227


Results for Rte with seed 789: {'eval_loss': 2.2744741439819336, 'eval_accuracy': 0.592057761732852, 'eval_runtime': 0.2345, 'eval_samples_per_second': 1181.152, 'eval_steps_per_second': 76.754, 'epoch': 3.0}
DistilBERT results are the medians of 5 runs with different seeds: 0.6064981949458483
Training and evaluating SST2...


train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]



Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1801,0.291957,0.896789
2,0.1125,0.356528,0.892202
3,0.0789,0.381957,0.90367


Results for Sst2 with seed 42: {'eval_loss': 0.2919568717479706, 'eval_accuracy': 0.8967889908256881, 'eval_runtime': 0.6321, 'eval_samples_per_second': 1379.5, 'eval_steps_per_second': 87.01, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1519,0.331217,0.896789
2,0.0763,0.367773,0.900229
3,0.054,0.420165,0.902523


Results for Sst2 with seed 1234: {'eval_loss': 0.33121687173843384, 'eval_accuracy': 0.8967889908256881, 'eval_runtime': 0.6851, 'eval_samples_per_second': 1272.741, 'eval_steps_per_second': 80.276, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0769,0.472224,0.885321
2,0.0805,0.383781,0.90367
3,0.0269,0.45696,0.901376


Results for Sst2 with seed 100: {'eval_loss': 0.383781373500824, 'eval_accuracy': 0.9036697247706422, 'eval_runtime': 0.6587, 'eval_samples_per_second': 1323.772, 'eval_steps_per_second': 83.495, 'epoch': 3.0}
Running with seed: 456


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0835,0.464347,0.888761
2,0.0558,0.508006,0.901376
3,0.0251,0.545836,0.901376


Results for Sst2 with seed 456: {'eval_loss': 0.46434661746025085, 'eval_accuracy': 0.8887614678899083, 'eval_runtime': 0.6525, 'eval_samples_per_second': 1336.438, 'eval_steps_per_second': 84.294, 'epoch': 3.0}
Running with seed: 789


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0577,0.545746,0.886468
2,0.0464,0.585495,0.888761
3,0.0171,0.713965,0.884174


Results for Sst2 with seed 789: {'eval_loss': 0.5457461476325989, 'eval_accuracy': 0.8864678899082569, 'eval_runtime': 0.6376, 'eval_samples_per_second': 1367.703, 'eval_steps_per_second': 86.266, 'epoch': 3.0}
DistilBERT results are the medians of 5 runs with different seeds: 0.8967889908256881
Training and evaluating STSB...


train-00000-of-00001.parquet:   0%|          | 0.00/502k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/151k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/114k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]



Map:   0%|          | 0/5749 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Pearson,Spearmanr
1,0.8022,0.872529,0.857718,0.857449
2,0.4122,0.573132,0.874682,0.870717
3,0.2307,0.540815,0.873142,0.869923


Results for Stsb with seed 42: {'eval_loss': 0.5408149361610413, 'eval_pearson': 0.8731420644304388, 'eval_spearmanr': 0.8699231267387169, 'eval_runtime': 1.079, 'eval_samples_per_second': 1390.22, 'eval_steps_per_second': 87.12, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Pearson,Spearmanr
1,0.2741,0.589205,0.863712,0.861112
2,0.1785,0.552024,0.869189,0.866619
3,0.0992,0.5533,0.870098,0.866689


Results for Stsb with seed 1234: {'eval_loss': 0.552023708820343, 'eval_pearson': 0.8691890560329024, 'eval_spearmanr': 0.8666191236993688, 'eval_runtime': 1.0798, 'eval_samples_per_second': 1389.2, 'eval_steps_per_second': 87.057, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Pearson,Spearmanr
1,0.1376,0.604697,0.859003,0.857243
2,0.1293,0.56959,0.866734,0.864444
3,0.0753,0.557451,0.869631,0.867243


Results for Stsb with seed 100: {'eval_loss': 0.5574513077735901, 'eval_pearson': 0.8696312479750451, 'eval_spearmanr': 0.8672426042181853, 'eval_runtime': 1.0936, 'eval_samples_per_second': 1371.619, 'eval_steps_per_second': 85.955, 'epoch': 3.0}
Running with seed: 456


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Pearson,Spearmanr
1,0.1021,0.573322,0.865523,0.86398
2,0.0959,0.558995,0.870888,0.86884
3,0.0608,0.536764,0.873778,0.871877


Results for Stsb with seed 456: {'eval_loss': 0.5367641448974609, 'eval_pearson': 0.8737775793358272, 'eval_spearmanr': 0.8718768770501653, 'eval_runtime': 1.1051, 'eval_samples_per_second': 1357.362, 'eval_steps_per_second': 85.061, 'epoch': 3.0}
Running with seed: 789


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Pearson,Spearmanr
1,0.0839,0.555801,0.869493,0.86761
2,0.0777,0.540413,0.873989,0.872332
3,0.0488,0.536642,0.874472,0.872384


Results for Stsb with seed 789: {'eval_loss': 0.53664231300354, 'eval_pearson': 0.8744716476892955, 'eval_spearmanr': 0.87238428922975, 'eval_runtime': 1.1088, 'eval_samples_per_second': 1352.855, 'eval_steps_per_second': 84.779, 'epoch': 3.0}
DistilBERT results are the medians of 5 runs with different seeds: 0.8731420644304388
Training and evaluating WNLI...


train-00000-of-00001.parquet:   0%|          | 0.00/38.8k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/13.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/635 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/71 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/146 [00:00<?, ? examples/s]



Map:   0%|          | 0/635 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.690773,0.56338
2,No log,0.692926,0.56338
3,0.697100,0.693029,0.56338


Results for Wnli with seed 42: {'eval_loss': 0.6907734870910645, 'eval_accuracy': 0.5633802816901409, 'eval_runtime': 0.0714, 'eval_samples_per_second': 993.722, 'eval_steps_per_second': 69.98, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.692878,0.56338
2,No log,0.691998,0.56338
3,0.696700,0.691674,0.56338


Results for Wnli with seed 1234: {'eval_loss': 0.6916742920875549, 'eval_accuracy': 0.5633802816901409, 'eval_runtime': 0.0714, 'eval_samples_per_second': 994.704, 'eval_steps_per_second': 70.05, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.692259,0.56338
2,No log,0.691819,0.56338
3,0.694300,0.692692,0.56338


Results for Wnli with seed 100: {'eval_loss': 0.6918188333511353, 'eval_accuracy': 0.5633802816901409, 'eval_runtime': 0.0724, 'eval_samples_per_second': 980.591, 'eval_steps_per_second': 69.056, 'epoch': 3.0}
Running with seed: 456


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.691193,0.56338
2,No log,0.69937,0.521127
3,0.696100,0.704879,0.352113


Results for Wnli with seed 456: {'eval_loss': 0.6911929845809937, 'eval_accuracy': 0.5633802816901409, 'eval_runtime': 0.0782, 'eval_samples_per_second': 908.165, 'eval_steps_per_second': 63.955, 'epoch': 3.0}
Running with seed: 789


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.708757,0.352113
2,No log,0.71461,0.295775
3,0.692800,0.724107,0.15493


Results for Wnli with seed 789: {'eval_loss': 0.7087572813034058, 'eval_accuracy': 0.352112676056338, 'eval_runtime': 0.1396, 'eval_samples_per_second': 508.731, 'eval_steps_per_second': 35.826, 'epoch': 3.0}
DistilBERT results are the medians of 5 runs with different seeds: 0.5633802816901409


## BERT on GLUE Tasks

In [None]:
def train_and_evaluate_bert(task):

    # Step 1: Load the dataset
    dataset = load_dataset("glue", task)

    num_labels = 3 if task == "mnli" else 1 if task =="stsb" else 2
    batch_size = 32 if task in ["qqp", "mnli", "qnli"] else 16
    max_length = 128

    # Step 2: Load the tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    sentence1_key, sentence2_key = task_to_keys[task]

    # Step 3: Preprocess the dataset
    def preprocess_function(examples):
        if sentence2_key is None:
            return tokenizer(examples[sentence1_key], truncation=True, padding="max_length", max_length=max_length)
        return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, padding="max_length", max_length=max_length)

    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    # Step 4: Load pre-trained model with a classification head
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

    # Step 6: Load metric using the `evaluate` library
    metric = evaluate.load("glue", task)

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        # Convert logits to a PyTorch tensor
        logits = torch.tensor(logits)
        predictions = logits[:, 0] if task == "stsb" else torch.argmax(logits, dim=-1)
        return metric.compute(predictions=predictions, references=labels)

    # Define a list of seeds for 5 different runs
    seeds = [42, 1234, 100, 456, 789]

    # Store results from each run
    all_results = []

    # Loop for 5 runs with different seeds
    for seed in seeds:
        print(f"Running with seed: {seed}")

        # Step 5: Define training arguments (update with the current seed)
        training_args = TrainingArguments(
          output_dir=f"./bert_results_{task}",
          eval_strategy="epoch",
          save_strategy = "epoch",
          learning_rate=5e-6,
          per_device_train_batch_size=batch_size,
          per_device_eval_batch_size=batch_size,
          num_train_epochs=3,
          warmup_ratio=0.1,
          weight_decay=0.01,
          logging_dir=f"./bert_logs_{task}_seed_{seed}",
          logging_steps=100,
          # gradient_accumulation_steps=1,
          max_grad_norm=1.0,
          # save_total_limit=2,
          fp16=True,
          seed=seed,
          load_best_model_at_end=True,
          report_to=[],
        )

        validation_key = "validation_matched" if task == "mnli" else "validation"
        # Step 7: Set up the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset[validation_key],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
          )

        # Step 8: Train the model
        trainer.train()

        # Evaluate the model and store results
        results = trainer.evaluate()
        print(f"Results for {task.capitalize()} with seed {seed}: {results}")

        metric_key = "eval_pearson" if task == "stsb" else "eval_matthews_correlation" if task == "cola" else "eval_accuracy"

        # Append the accuracy to the results list
        all_results.append(results[metric_key])

    # Calculate the median accuracy from all runs
    median_accuracy = np.median(all_results)

    # Print the final median result
    print(f"BERT results are the medians of 5 runs with different seeds: {median_accuracy}")

In [None]:
# Loop through each task and run training/evaluation
for task in GLUE_TASKS:
    print(f"Training and evaluating {task.upper()}...")
    train_and_evaluate_bert(task)

Training and evaluating COLA...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/251k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.4637,0.460256,0.492126
2,0.3855,0.486367,0.50215
3,0.3338,0.493518,0.507275


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Results for Cola with seed 42: {'eval_loss': 0.46025630831718445, 'eval_matthews_correlation': 0.49212649339854364, 'eval_runtime': 1.1827, 'eval_samples_per_second': 881.895, 'eval_steps_per_second': 55.805, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3858,0.465702,0.51271
2,0.3312,0.485403,0.528568
3,0.2956,0.527692,0.528616


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Results for Cola with seed 1234: {'eval_loss': 0.4657018184661865, 'eval_matthews_correlation': 0.5127103010689016, 'eval_runtime': 1.2776, 'eval_samples_per_second': 816.368, 'eval_steps_per_second': 51.659, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3332,0.47625,0.536455
2,0.2832,0.601736,0.513713
3,0.2348,0.601817,0.539119


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Results for Cola with seed 100: {'eval_loss': 0.47624993324279785, 'eval_matthews_correlation': 0.5364554348637038, 'eval_runtime': 1.1918, 'eval_samples_per_second': 875.151, 'eval_steps_per_second': 55.379, 'epoch': 3.0}
Running with seed: 456


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.2883,0.514928,0.533881
2,0.224,0.582799,0.557289
3,0.2,0.658986,0.544257


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Results for Cola with seed 456: {'eval_loss': 0.5149280428886414, 'eval_matthews_correlation': 0.53388112595817, 'eval_runtime': 1.1987, 'eval_samples_per_second': 870.145, 'eval_steps_per_second': 55.062, 'epoch': 3.0}
Running with seed: 789


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.2299,0.655805,0.518103
2,0.1996,0.687873,0.554667
3,0.1989,0.752344,0.53395


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Results for Cola with seed 789: {'eval_loss': 0.6558050513267517, 'eval_matthews_correlation': 0.5181026129120675, 'eval_runtime': 1.2123, 'eval_samples_per_second': 860.377, 'eval_steps_per_second': 54.444, 'epoch': 3.0}
BERT results are the medians of 5 runs with different seeds: 0.5181026129120675
Training and evaluating MNLI...


train-00000-of-00001.parquet:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

(…)alidation_matched-00000-of-00001.parquet:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

(…)dation_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

test_matched-00000-of-00001.parquet:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

test_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]



Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## DistilBERT on IMDb Dataset

In [None]:
# Import necessary libraries
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate

# Load IMDb dataset and DistilBERT tokenizer
dataset = load_dataset("imdb")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased", force_download=True)

# Tokenize data
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

train_data = dataset["train"].map(tokenize, batched=True)
test_data = dataset["test"].map(tokenize, batched=True)

# Convert labels to tensor format
train_data = train_data.with_format("torch", columns=["input_ids", "attention_mask", "label"])
test_data = test_data.with_format("torch", columns=["input_ids", "attention_mask", "label"])

# Load DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=[],
    logging_dir="./logs",
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Computes and returns a dictionary of metrics."""
    logits, labels = eval_pred
    logits = torch.tensor(logits)
    predictions = torch.argmax(logits, dim=-1)
    return metric.compute(predictions=predictions, references=labels)

# Initialize Trainer with IMDb data
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics
)


# Train model
trainer.train()

# Evaluate model
results = trainer.evaluate()
print("IMDb Test Accuracy:", results["eval_accuracy"])


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2262,0.237228,0.90608
2,0.1544,0.237441,0.92952
3,0.0934,0.281255,0.9306


IMDb Test Accuracy: 0.9306


## BERT on IMDb Dataset

In [None]:
# Import necessary libraries
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Load IMDb dataset and DistilBERT tokenizer
dataset = load_dataset("imdb")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", force_download=True)

# Tokenize data
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

train_data = dataset["train"].map(tokenize, batched=True)
test_data = dataset["test"].map(tokenize, batched=True)

# Convert labels to tensor format
train_data = train_data.with_format("torch", columns=["input_ids", "attention_mask", "label"])
test_data = test_data.with_format("torch", columns=["input_ids", "attention_mask", "label"])

# Load DistilBERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=[],
    logging_dir="./logs",
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Computes and returns a dictionary of metrics."""
    logits, labels = eval_pred
    logits = torch.tensor(logits)
    predictions = torch.argmax(logits, dim=-1)
    return metric.compute(predictions=predictions, references=labels)

# Initialize Trainer with IMDb data
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics
)

# Train model
trainer.train()

# Evaluate model
results = trainer.evaluate()
print("IMDb Test Accuracy:", results["eval_accuracy"])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2064,0.204074,0.92296
2,0.135,0.214014,0.93936
3,0.0791,0.252385,0.94244


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into 

IMDb Test Accuracy: 0.94244


## DistilBERT on SQuAD Dataset (Without Distillation)

In [None]:
# Necessary imports remain the same
from transformers import AutoTokenizer, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import evaluate

# Load SQuAD 1.1 dataset
squad = load_dataset("squad")

# Initialize evaluation metric
metric = evaluate.load("squad")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []
    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Find the start and end token positions
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply preprocessing
encoded_dataset = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

# Function to compute metrics for SQuAD
# Function to compute metrics for SQuAD
def compute_metrics(eval_pred):
    """Computes and returns a dictionary of metrics."""
    start_logits, end_logits = eval_pred.predictions
    start_positions, end_positions = eval_pred.label_ids

    predictions = []
    references = []

    for i in range(len(start_logits)):
        start_pred = torch.argmax(torch.tensor(start_logits[i])).item()
        end_pred = torch.argmax(torch.tensor(end_logits[i])).item()

        # Retrieve input_ids and only pass those for decoding
        input_ids = encoded_dataset["validation"][i]["input_ids"]
        prediction_text = tokenizer.decode(input_ids[start_pred:end_pred+1], skip_special_tokens=True)

        predictions.append({"id": squad["validation"][i]["id"], "prediction_text": prediction_text})
        references.append({"id": squad["validation"][i]["id"], "answers": squad["validation"][i]["answers"]})

    return metric.compute(predictions=predictions, references=references)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="no",
    logging_steps=10,
    eval_steps=10,
    report_to="none"
)

# Load DistilBERT model
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train and evaluate the model
trainer.train()
results = trainer.evaluate()
print("DistilBERT Results without distillation on SQuAD:", results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact Match,F1
1,1.1258,1.137238,66.073794,76.175304
2,0.9487,1.093961,67.672658,77.820319
3,0.5362,1.191063,68.401135,78.217015


DistilBERT Results without distillation on SQuAD: {'eval_loss': 1.1910629272460938, 'eval_exact_match': 68.40113528855251, 'eval_f1': 78.21701520918465, 'eval_runtime': 43.458, 'eval_samples_per_second': 243.223, 'eval_steps_per_second': 15.21, 'epoch': 3.0}


## DistilBERT on SQuAD Dataset (With Distillation)

In [None]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast, Trainer, TrainingArguments, BertForQuestionAnswering
from datasets import load_dataset
import evaluate
import numpy as np
import torch

# Load the SQuAD dataset
dataset = load_dataset("squad")

# Load DistilBERT model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForQuestionAnswering.from_pretrained(model_name)

# Tokenize the data
def prepare_features(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=False,  # Disable overflow to avoid out-of-bounds
        return_offsets_mapping=True,
        padding="max_length",
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized_examples["offset_mapping"]):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        if len(examples["answers"][i]["answer_start"]) == 0:
            # No answer
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = examples["answers"][i]["answer_start"][0]
            end_char = start_char + len(examples["answers"][i]["text"][0])

            token_start_index, token_end_index = 0, 0
            for idx, (offset_start, offset_end) in enumerate(offsets):
                if offset_start <= start_char < offset_end:
                    token_start_index = idx
                if offset_start < end_char <= offset_end:
                    token_end_index = idx
                    break

            start_positions.append(token_start_index)
            end_positions.append(token_end_index)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

tokenized_datasets = dataset.map(prepare_features, batched=True, remove_columns=dataset["train"].column_names)

# Set up evaluation metrics
squad_metric = evaluate.load("squad")

def compute_metrics(predictions):
    start_logits, end_logits = predictions.predictions
    start_positions, end_positions = predictions.label_ids

    predicted_answers = []
    for i in range(len(start_logits)):
        start = np.argmax(start_logits[i])
        end = np.argmax(end_logits[i]) + 1

        predicted_answer = tokenizer.decode(tokenized_datasets["validation"]["input_ids"][i][start:end], skip_special_tokens=True)
        predicted_answers.append({"id": str(i), "prediction_text": predicted_answer})

    references = []
    for i in range(len(start_positions)):
        start_idx = start_positions[i]
        end_idx = end_positions[i] + 1

        true_answer = tokenizer.decode(tokenized_datasets["validation"]["input_ids"][i][start_idx:end_idx], skip_special_tokens=True)
        answer_start = dataset["validation"][i]["answers"]["answer_start"][0]

        references.append({"id": str(i), "answers": {"text": [true_answer], "answer_start": [answer_start]}})

    return squad_metric.compute(predictions=predicted_answers, references=references)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=[],
    logging_dir="./logs",
    save_strategy="no"
)

# Trainer for distillation
class DistilTrainer(Trainer):
    def __init__(self, teacher_model, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False):
        inputs = {k: v.to(self.args.device) for k, v in inputs.items()}
        outputs = model(**inputs)
        loss = outputs.loss

        self.teacher_model.eval()
        with torch.no_grad():
            teacher_outputs = self.teacher_model(**inputs)

        distillation_loss = (
            torch.nn.functional.kl_div(
                torch.nn.functional.log_softmax(outputs.start_logits, dim=-1),
                torch.nn.functional.softmax(teacher_outputs.start_logits, dim=-1),
                reduction="batchmean",
            )
            + torch.nn.functional.kl_div(
                torch.nn.functional.log_softmax(outputs.end_logits, dim=-1),
                torch.nn.functional.softmax(teacher_outputs.end_logits, dim=-1),
                reduction="batchmean",
            )
        ) / 2

        total_loss = loss + distillation_loss
        return (total_loss, outputs) if return_outputs else total_loss

teacher_model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

trainer = DistilTrainer(
    teacher_model=teacher_model,
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train and evaluate the model
trainer.train()
results = trainer.evaluate()
print("DistilBERT Results with distillation on SQuAD:", results)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


## BERT on SQuAD Dataset (Without Distillation)

In [None]:
# Necessary imports remain the same
from transformers import AutoTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import evaluate

# Load SQuAD 1.1 dataset
squad = load_dataset("squad")

# Initialize evaluation metric
metric = evaluate.load("squad")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []
    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Find the start and end token positions
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply preprocessing
encoded_dataset = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)


# Function to compute metrics for SQuAD
def compute_metrics(eval_pred):
    """Computes and returns a dictionary of metrics."""
    start_logits, end_logits = eval_pred.predictions
    start_positions, end_positions = eval_pred.label_ids

    predictions = []
    references = []

    for i in range(len(start_logits)):
        start_pred = torch.argmax(torch.tensor(start_logits[i])).item()
        end_pred = torch.argmax(torch.tensor(end_logits[i])).item()

        # Retrieve input_ids and only pass those for decoding
        input_ids = encoded_dataset["validation"][i]["input_ids"]
        prediction_text = tokenizer.decode(input_ids[start_pred:end_pred+1], skip_special_tokens=True)

        predictions.append({"id": squad["validation"][i]["id"], "prediction_text": prediction_text})
        references.append({"id": squad["validation"][i]["id"], "answers": squad["validation"][i]["answers"]})

    return metric.compute(predictions=predictions, references=references)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="no",
    logging_steps=10,
    eval_steps=10,
    report_to="none"
)

# Load DistilBERT model
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train and evaluate the model
trainer.train()
results = trainer.evaluate()
print("BERT Results without distillation on SQuAD:", results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Exact Match,F1
1,0.995,1.033708,70.312204,79.837162
2,0.898,1.007052,71.53264,81.047689
3,0.4905,1.086823,71.977294,81.229275


BERT Results without distillation on SQuAD: {'eval_loss': 1.0868231058120728, 'eval_exact_match': 71.97729422894986, 'eval_f1': 81.22927463053034, 'eval_runtime': 67.8133, 'eval_samples_per_second': 155.869, 'eval_steps_per_second': 9.747, 'epoch': 3.0}
