In [1]:
!pip install transformers datasets torch evaluate

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.

In [2]:
import evaluate
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np

## DistilBERT on GLUE Tasks

In [3]:
GLUE_TASKS = ["cola", "mnli", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]

In [4]:
task_to_keys = {
    "sst2": ("sentence", None),
    "cola": ("sentence", None),
    "mrpc": ("sentence1", "sentence2"),
    "stsb": ("sentence1", "sentence2"),
    "qqp": ("question1", "question2"),
    "qnli": ("question", "sentence"),
    "mnli": ("premise", "hypothesis"),
    "rte": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

In [5]:
def train_and_evaluate(task):

    # Step 1: Load the dataset
    dataset = load_dataset("glue", task)

    num_labels = 3 if task == "mnli" else 1 if task =="stsb" else 2
    batch_size = 32 if task in ["qqp", "mnli", "qnli"] else 16
    max_length = 128

    # Step 2: Load the tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    sentence1_key, sentence2_key = task_to_keys[task]

    # Step 3: Preprocess the dataset
    def preprocess_function(examples):
        if sentence2_key is None:
            return tokenizer(examples[sentence1_key], truncation=True, padding="max_length", max_length=max_length)
        return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, padding="max_length", max_length=max_length)

    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    # Step 4: Load pre-trained model with a classification head
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

    # Step 6: Load metric using the `evaluate` library
    metric = evaluate.load("glue", task)

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        # Convert logits to a PyTorch tensor
        logits = torch.tensor(logits)
        predictions = logits[:, 0] if task == "stsb" else torch.argmax(logits, dim=-1)
        return metric.compute(predictions=predictions, references=labels)

    # Define a list of seeds for 5 different runs
    seeds = [42, 1234, 100, 456, 789]

    # Store results from each run
    all_results = []

    # Loop for 5 runs with different seeds
    for seed in seeds:
        print(f"Running with seed: {seed}")

        # Step 5: Define training arguments (update with the current seed)
        training_args = TrainingArguments(
          output_dir=f"./results_{task}",
          eval_strategy="epoch",
          save_strategy = "epoch",
          learning_rate=5e-5,
          per_device_train_batch_size=batch_size,
          per_device_eval_batch_size=batch_size,
          num_train_epochs=3,
          warmup_ratio=0.1,
          weight_decay=0.01,
          logging_dir=f"./logs_{task}_seed_{seed}",
          logging_steps=100,
          gradient_accumulation_steps=1,
          max_grad_norm=1.0,
          save_total_limit=2,
          fp16=True,
          seed=seed,
          load_best_model_at_end=True,
          report_to=[],
        )

        validation_key = "validation_matched" if task == "mnli" else "validation"
        # Step 7: Set up the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset[validation_key],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
          )

        # Step 8: Train the model
        trainer.train()

        # Evaluate the model and store results
        results = trainer.evaluate()
        print(f"Results for {task.capitalize()} with seed {seed}: {results}")

        # Get the appropriate metric key based on the task
        metric_key = "eval_pearson" if task == "stsb" else "eval_matthews_correlation" if task == "cola" else "eval_accuracy"

        # Append the accuracy to the results list
        all_results.append(results[metric_key])

    # Calculate the median accuracy from all runs
    median_accuracy = np.median(all_results)

    # Print the final median result
    print(f"DistilBERT results are the medians of 5 runs with different seeds: {median_accuracy}")

In [None]:
# Loop through each task and run training/evaluation
for task in GLUE_TASKS:
    print(f"Training and evaluating {task.upper()}...")
    train_and_evaluate(task)


Training and evaluating COLA...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/251k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.4891,0.486559,0.406366
2,0.3147,0.522503,0.470132
3,0.166,0.709664,0.498608


Results for Cola with seed 42: {'eval_loss': 0.48655861616134644, 'eval_matthews_correlation': 0.4063664761456974, 'eval_runtime': 0.7469, 'eval_samples_per_second': 1396.473, 'eval_steps_per_second': 88.367, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3339,0.507532,0.472848
2,0.2074,0.555143,0.496703
3,0.1322,0.885839,0.505905


Results for Cola with seed 1234: {'eval_loss': 0.5075321793556213, 'eval_matthews_correlation': 0.4728475040113381, 'eval_runtime': 0.7657, 'eval_samples_per_second': 1362.198, 'eval_steps_per_second': 86.199, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.2608,0.641342,0.49947
2,0.1746,0.944342,0.475137
3,0.0856,0.967929,0.51764


Results for Cola with seed 100: {'eval_loss': 0.6413422226905823, 'eval_matthews_correlation': 0.4994701324058332, 'eval_runtime': 0.7563, 'eval_samples_per_second': 1379.005, 'eval_steps_per_second': 87.262, 'epoch': 3.0}
Running with seed: 456


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.193,0.974146,0.448129
2,0.1321,1.002907,0.457567
3,0.0589,1.186214,0.480953


Results for Cola with seed 456: {'eval_loss': 0.9741463661193848, 'eval_matthews_correlation': 0.4481287541235995, 'eval_runtime': 0.807, 'eval_samples_per_second': 1292.495, 'eval_steps_per_second': 81.788, 'epoch': 3.0}
Running with seed: 789


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.1469,0.866475,0.458402
2,0.1117,1.042787,0.462504
3,0.0483,1.313485,0.489444


Results for Cola with seed 789: {'eval_loss': 0.8664747476577759, 'eval_matthews_correlation': 0.4584020596329, 'eval_runtime': 0.7612, 'eval_samples_per_second': 1370.12, 'eval_steps_per_second': 86.7, 'epoch': 3.0}
DistilBERT results are the medians of 5 runs with different seeds: 0.4584020596329
Training and evaluating MNLI...


train-00000-of-00001.parquet:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

(…)alidation_matched-00000-of-00001.parquet:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

(…)dation_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

test_matched-00000-of-00001.parquet:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

test_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]



Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5297,0.500738,0.800509


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5297,0.500738,0.800509
2,0.3862,0.480404,0.818441
3,0.2765,0.559721,0.821294


Results for Mnli with seed 42: {'eval_loss': 0.48040369153022766, 'eval_accuracy': 0.8184411614875191, 'eval_runtime': 4.2663, 'eval_samples_per_second': 2300.585, 'eval_steps_per_second': 71.959, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3629,0.554357,0.805502
2,0.2282,0.588536,0.812124


## BERT on GLUE Tasks

In [None]:
def train_and_evaluate_bert(task):

    # Step 1: Load the dataset
    dataset = load_dataset("glue", task)

    num_labels = 3 if task == "mnli" else 1 if task =="stsb" else 2
    batch_size = 32 if task in ["qqp", "mnli", "qnli"] else 16
    max_length = 128

    # Step 2: Load the tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    sentence1_key, sentence2_key = task_to_keys[task]

    # Step 3: Preprocess the dataset
    def preprocess_function(examples):
        if sentence2_key is None:
            return tokenizer(examples[sentence1_key], truncation=True, padding="max_length", max_length=max_length)
        return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, padding="max_length", max_length=max_length)

    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    # Step 4: Load pre-trained model with a classification head
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

    # Step 6: Load metric using the `evaluate` library
    metric = evaluate.load("glue", task)

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        # Convert logits to a PyTorch tensor
        logits = torch.tensor(logits)
        predictions = logits[:, 0] if task == "stsb" else torch.argmax(logits, dim=-1)
        return metric.compute(predictions=predictions, references=labels)

    # Define a list of seeds for 5 different runs
    seeds = [42, 1234, 100, 456, 789]

    # Store results from each run
    all_results = []

    # Loop for 5 runs with different seeds
    for seed in seeds:
        print(f"Running with seed: {seed}")

        # Step 5: Define training arguments (update with the current seed)
        training_args = TrainingArguments(
          output_dir=f"./bert_results_{task}",
          eval_strategy="epoch",
          save_strategy = "epoch",
          learning_rate=5e-6,
          per_device_train_batch_size=batch_size,
          per_device_eval_batch_size=batch_size,
          num_train_epochs=3,
          warmup_ratio=0.1,
          weight_decay=0.01,
          logging_dir=f"./bert_logs_{task}_seed_{seed}",
          logging_steps=100,
          # gradient_accumulation_steps=1,
          max_grad_norm=1.0,
          # save_total_limit=2,
          fp16=True,
          seed=seed,
          load_best_model_at_end=True,
          report_to=[],
        )

        validation_key = "validation_matched" if task == "mnli" else "validation"
        # Step 7: Set up the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset[validation_key],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
          )

        # Step 8: Train the model
        trainer.train()

        # Evaluate the model and store results
        results = trainer.evaluate()
        print(f"Results for {task.capitalize()} with seed {seed}: {results}")

        metric_key = "eval_pearson" if task == "stsb" else "eval_matthews_correlation" if task == "cola" else "eval_accuracy"

        # Append the accuracy to the results list
        all_results.append(results[metric_key])

    # Calculate the median accuracy from all runs
    median_accuracy = np.median(all_results)

    # Print the final median result
    print(f"BERT results are the medians of 5 runs with different seeds: {median_accuracy}")

In [None]:
# Loop through each task and run training/evaluation
for task in GLUE_TASKS:
    print(f"Training and evaluating {task.upper()}...")
    train_and_evaluate_bert(task)

Training and evaluating WNLI...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running with seed: 42


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.725944,0.422535
2,No log,0.718681,0.338028
3,0.715100,0.717512,0.352113


Results for Wnli with seed 42: {'eval_loss': 0.7175119519233704, 'eval_accuracy': 0.352112676056338, 'eval_runtime': 0.2181, 'eval_samples_per_second': 325.512, 'eval_steps_per_second': 22.923, 'epoch': 3.0}
Running with seed: 1234


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.721549,0.323944
2,No log,0.724816,0.309859
3,0.698400,0.724658,0.267606


Results for Wnli with seed 1234: {'eval_loss': 0.7215490937232971, 'eval_accuracy': 0.323943661971831, 'eval_runtime': 0.2402, 'eval_samples_per_second': 295.563, 'eval_steps_per_second': 20.814, 'epoch': 3.0}
Running with seed: 100


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.723922,0.28169
2,No log,0.725146,0.28169


KeyboardInterrupt: 

## DistilBERT on IMDb Dataset

In [None]:
# Import necessary libraries
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate

# Load IMDb dataset and DistilBERT tokenizer
dataset = load_dataset("imdb")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased", force_download=True)

# Tokenize data
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

train_data = dataset["train"].map(tokenize, batched=True)
test_data = dataset["test"].map(tokenize, batched=True)

# Convert labels to tensor format
train_data = train_data.with_format("torch", columns=["input_ids", "attention_mask", "label"])
test_data = test_data.with_format("torch", columns=["input_ids", "attention_mask", "label"])

# Load DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=[],
    logging_dir="./logs",
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Computes and returns a dictionary of metrics."""
    logits, labels = eval_pred
    logits = torch.tensor(logits)
    predictions = torch.argmax(logits, dim=-1)
    return metric.compute(predictions=predictions, references=labels)

# Initialize Trainer with IMDb data
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics
)


# Train model
trainer.train()

# Evaluate model
results = trainer.evaluate()
print("IMDb Test Accuracy:", results["eval_accuracy"])


## BERT on IMDb Dataset

In [None]:
# Import necessary libraries
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Load IMDb dataset and DistilBERT tokenizer
dataset = load_dataset("imdb")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", force_download=True)

# Tokenize data
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

train_data = dataset["train"].map(tokenize, batched=True)
test_data = dataset["test"].map(tokenize, batched=True)

# Convert labels to tensor format
train_data = train_data.with_format("torch", columns=["input_ids", "attention_mask", "label"])
test_data = test_data.with_format("torch", columns=["input_ids", "attention_mask", "label"])

# Load DistilBERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=[],
    logging_dir="./logs",
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Computes and returns a dictionary of metrics."""
    logits, labels = eval_pred
    logits = torch.tensor(logits)
    predictions = torch.argmax(logits, dim=-1)
    return metric.compute(predictions=predictions, references=labels)

# Initialize Trainer with IMDb data
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics
)

# Train model
trainer.train()

# Evaluate model
results = trainer.evaluate()
print("IMDb Test Accuracy:", results["eval_accuracy"])


## DistilBERT on SQuAD Dataset (Without Distillation)

In [None]:
# Import necessary libraries
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate

# Load SQuAD dataset and tokenizer
squad_data = load_dataset("squad")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased", force_download=True)

# Tokenize data
def prepare_features(examples):
    inputs = tokenizer(
        examples["question"], examples["context"], truncation="only_second", max_length=384, stride=128, return_overflowing_tokens=True, padding="max_length"
    )
    inputs["start_positions"] = [a["answer_start"][0] for a in examples["answers"]]
    inputs["end_positions"] = [a["answer_start"][0] + len(a["text"][0]) - 1 for a in examples["answers"]]
    return inputs


tokenized_squad = squad_data.map(prepare_features, batched=True, remove_columns=squad_data["train"].column_names)

# Load DistilBERT model for question answering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased", force_download=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=[],
    logging_dir="./logs",
)

# Initialize Trainer with SQuAD data
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"]
)

# Train model
trainer.train()

# Evaluate model
results = trainer.evaluate()
print("SQuAD EM:", results["eval_exact_match"])
print("SQuAD F1:", results["eval_f1"])


## DistilBERT on SQuAD Dataset (With Distillation)

## BERT on SQuAD Dataset (Without Distillation)