## Setup & general imports

In [2]:
# install & imports

!pip install -q transformers datasets accelerate scikit-learn

import numpy as np
import pandas as pd

from datasets import load_dataset

In [3]:
import os
# Turn off the Weights & Biases logging integration for this session.
# NOTE(review): Transformers reports this environment variable as deprecated
# (removal announced for v5) and points to `report_to` as the replacement.
os.environ["WANDB_DISABLED"] = "true"

In [4]:
# Enable Colab's custom widget manager when the notebook runs on Google Colab.
# The import is guarded so the notebook still runs top-to-bottom on a kernel
# where the `google.colab` package is not installed (local Jupyter, CI, ...).
try:
    from google.colab import output
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    output.enable_custom_widget_manager()

In [5]:
# Load every split of BESSTIE (all tasks) from the Hugging Face Hub
BESSTIE_REPO = "unswnlporg/BESSTIE"
raw_dataset = load_dataset(BESSTIE_REPO)
raw_dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

valid.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/17760 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2428 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'variety', 'source', 'task'],
        num_rows: 17760
    })
    validation: Dataset({
        features: ['text', 'label', 'variety', 'source', 'task'],
        num_rows: 2428
    })
})

## RoBERTa-large on sentiment

In [6]:
# restrict BESSTIE to the rows annotated for the Sentiment task
def is_sentiment(example):
    """Return True when the example's `task` field equals "Sentiment"."""
    task_name = example["task"]
    return task_name == "Sentiment"

# the predicate is applied to each split of the DatasetDict
sentiment_dataset = raw_dataset.filter(is_sentiment)
sentiment_dataset

Filter:   0%|          | 0/17760 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2428 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'variety', 'source', 'task'],
        num_rows: 8866
    })
    validation: Dataset({
        features: ['text', 'label', 'variety', 'source', 'task'],
        num_rows: 1212
    })
})

In [7]:
# convert the train split to a DataFrame just to inspect it
# (the DataFrame is only used for this preview; training uses the Dataset objects)
train_sent_df = sentiment_dataset["train"].to_pandas()

# list the available columns, then preview the first rows
print(train_sent_df.columns)
train_sent_df.head()

Index(['text', 'label', 'variety', 'source', 'task'], dtype='object')


Unnamed: 0,text,label,variety,source,task
0,This was one of the best dishes I've EVER had!...,1,en-AU,Google,Sentiment
1,This Mexican restaurant in Penrith is a great ...,1,en-AU,Google,Sentiment
2,"This was not to bad, I ordered the big pork ri...",1,en-AU,Google,Sentiment
3,Clean cool and a nice smaller casino to check ...,1,en-AU,Google,Sentiment
4,Well set out. Great areas to enjoy. Good food ...,1,en-AU,Google,Sentiment


In [8]:
# import tokenizer and model class from huggingface
# autotokenizer automatically selects the correct tokenizer for the model name
# AutoModelForSequenceClassification creates a classification model with a final linear layer

from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [9]:
from transformers import set_seed

MODEL_NAME = "roberta-large"

# The classification head is not part of the pretrained checkpoint, so it is
# randomly initialised when the model object is created. Seed the RNGs
# *before* loading so that this initialisation (and therefore the whole
# fine-tuning run) is reproducible across kernel restarts.
# 42 is also the default value of TrainingArguments.seed.
set_seed(42)

# load the tokenizer for RoBERTa
# the tokenizer converts raw text into input_ids and attention masks
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# load a classification model for binary classification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,  # binary classification (0 / 1)
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Tokenization settings used for RoBERTa:
#   truncation -> sequences longer than max_length are cut
#   padding    -> shorter sequences are padded up to max_length

def preprocess_function(examples):
    """
    Run the RoBERTa tokenizer over the "text" field of a batch.

    Returns whatever the tokenizer returns for that batch, using
    truncation and fixed-length padding at 128 tokens.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 128,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **tokenizer_options)

# tokenize the whole sentiment dataset in batches
tokenized_sentiment = sentiment_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8866 [00:00<?, ? examples/s]

Map:   0%|          | 0/1212 [00:00<?, ? examples/s]

In [11]:
tokenized_sentiment

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'variety', 'source', 'task', 'input_ids', 'attention_mask'],
        num_rows: 8866
    })
    validation: Dataset({
        features: ['text', 'label', 'variety', 'source', 'task', 'input_ids', 'attention_mask'],
        num_rows: 1212
    })
})

In [12]:
# columns needed later: model inputs, the target, and the two grouping keys
cols_to_keep = ["input_ids", "attention_mask", "label", "variety", "source"]

# everything else present in the train split is dropped
cols_to_drop = [
    name
    for name in tokenized_sentiment["train"].column_names
    if name not in cols_to_keep
]
tokenized_sentiment = tokenized_sentiment.remove_columns(cols_to_drop)

# display the resulting structure as a sanity check
tokenized_sentiment

DatasetDict({
    train: Dataset({
        features: ['label', 'variety', 'source', 'input_ids', 'attention_mask'],
        num_rows: 8866
    })
    validation: Dataset({
        features: ['label', 'variety', 'source', 'input_ids', 'attention_mask'],
        num_rows: 1212
    })
})

In [13]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np


def compute_metrics(eval_pred):
    """
    Score one evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class of each
    example is the index of its largest logit along the last axis.

    Returns a dict with the keys "accuracy" and "f1_macro".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1_macro": f1_score(gold, predicted, average="macro"),
    }

In [14]:
from transformers import TrainingArguments

# minimal set of training arguments that is compatible with older versions of Transformers
training_args = TrainingArguments(
    output_dir="roberta_sentiment",   # Folder where model checkpoints will be saved
    num_train_epochs=3,               # Number of training epochs
    learning_rate=2e-5,               # Standard fine-tuning learning rate
    per_device_train_batch_size=16,   # Batch size for training
    per_device_eval_batch_size=32,    # Batch size for evaluation
    weight_decay=0.01,                # L2 regularization strength
    logging_steps=100,                # Log training information every 100 steps
    fp16=True,                        # 16-bit mixed-precision training
    # Disable external logging integrations explicitly instead of relying on
    # the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
from transformers import Trainer

# training and validation splits of the tokenized sentiment data
train_dataset, eval_dataset = (
    tokenized_sentiment[split_name] for split_name in ("train", "validation")
)

# bundle model, hyperparameters, data and metric function into one Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [16]:
# Start training the model
# (the call is the last expression of the cell, so its return value is displayed)
trainer.train()

Step,Training Loss
100,0.4188
200,0.3409
300,0.2981
400,0.3231
500,0.2998
600,0.2639
700,0.1992
800,0.2153
900,0.2005
1000,0.2188


TrainOutput(global_step=1665, training_loss=0.22269570376421954, metrics={'train_runtime': 789.2589, 'train_samples_per_second': 33.7, 'train_steps_per_second': 2.11, 'total_flos': 6196877600449536.0, 'train_loss': 0.22269570376421954, 'epoch': 3.0})

In [17]:
# evaluate the model on the validation set
# (no dataset is passed here; the Trainer was built with eval_dataset=eval_dataset)
eval_results = trainer.evaluate()

# the result is a dict of metrics, including the ones from compute_metrics
print(eval_results)

{'eval_loss': 0.39627936482429504, 'eval_accuracy': 0.9084158415841584, 'eval_f1_macro': 0.9084127864823558, 'eval_runtime': 6.0113, 'eval_samples_per_second': 201.621, 'eval_steps_per_second': 6.321, 'epoch': 3.0}


In [18]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np

# Run the fine-tuned model over the validation split
pred_output = trainer.predict(eval_dataset)
logits, labels = pred_output.predictions, pred_output.label_ids

# Predicted class (0/1) = index of the largest logit
preds = np.argmax(logits, axis=-1)

# One row per validation example: gold label, prediction and the two
# metadata fields used for the per-group breakdown
eval_columns = {
    "label": labels,
    "pred": preds,
    "variety": eval_dataset["variety"],
    "source": eval_dataset["source"],
}
eval_df = pd.DataFrame(eval_columns)

eval_df.head()

Unnamed: 0,label,pred,variety,source
0,1,1,en-AU,Google
1,1,1,en-AU,Google
2,1,1,en-AU,Google
3,1,1,en-AU,Google
4,1,1,en-AU,Google


In [19]:
# Accuracy and macro-F1 broken down by (variety, source)
groups = eval_df.groupby(["variety", "source"])

# One record per group, assembled into a DataFrame in a single step
results_variety_source = pd.DataFrame([
    {
        "variety": variety_name,
        "source": source_name,
        "n_samples": len(group_rows),
        "accuracy": accuracy_score(
            group_rows["label"].values, group_rows["pred"].values
        ),
        "f1_macro": f1_score(
            group_rows["label"].values, group_rows["pred"].values, average="macro"
        ),
    }
    for (variety_name, source_name), group_rows in groups
])
results_variety_source

Unnamed: 0,variety,source,n_samples,accuracy,f1_macro
0,en-AU,Google,130,0.946154,0.929626
1,en-AU,Reddit,241,0.892116,0.87595
2,en-IN,Google,225,0.88,0.827655
3,en-IN,Reddit,230,0.856522,0.810869
4,en-UK,Google,248,0.955645,0.940539
5,en-UK,Reddit,138,0.949275,0.879536


## RoBERTa-large on sarcasm


In [21]:
from datasets import load_dataset

# Reload BESSTIE so this section does not depend on the sentiment section
raw_dataset = load_dataset("unswnlporg/BESSTIE")


def is_sarcasm(example):
    """
    Filter predicate: True when the example's `task` field equals "Sarcasm".
    """
    task_name = example["task"]
    return task_name == "Sarcasm"


# Keep only the Sarcasm rows in each split
sarcasm_dataset = raw_dataset.filter(is_sarcasm)

sarcasm_dataset

Filter:   0%|          | 0/17760 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2428 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'variety', 'source', 'task'],
        num_rows: 8894
    })
    validation: Dataset({
        features: ['text', 'label', 'variety', 'source', 'task'],
        num_rows: 1216
    })
})

In [22]:
# Convert the training split to a DataFrame for inspection
sarc_train_df = sarcasm_dataset["train"].to_pandas()

sarc_train_df.head()

Unnamed: 0,text,label,variety,source,task
0,"Located 2 blocks back from The Strand, ideal f...",0,en-AU,Google,Sarcasm
1,Have n't been to AJ in a few years so popped i...,0,en-AU,Google,Sarcasm
2,Tried their folded chili eggs() plus mushrooms...,0,en-AU,Google,Sarcasm
3,Thanks for the vegan options. Minus one star f...,0,en-AU,Google,Sarcasm
4,Bought an ANGUS Bacon BBQ Sauce with Onion and...,0,en-AU,Google,Sarcasm


In [23]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed

# We reuse the same model checkpoint as for sentiment
MODEL_NAME = "roberta-large"

# The classification head is not part of the pretrained checkpoint, so it is
# randomly initialised when the model object is created. Seed the RNGs
# *before* loading so that this initialisation (and therefore the whole
# fine-tuning run) is reproducible across kernel restarts.
# 42 is also the default value of TrainingArguments.seed.
set_seed(42)

# Reuse the same tokenizer (you can skip this line if tokenizer is already defined)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Create a NEW classification model for the sarcasm task
# with 2 labels: 0 = non-sarcastic, 1 = sarcastic.
model_sarc = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
def preprocess_function(examples):
    """
    Run the RoBERTa tokenizer over the "text" field of a batch.

    Sequences longer than 128 tokens are truncated and shorter ones are
    padded, so every encoded sequence has the same fixed length.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 128,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [25]:
# Tokenize the sarcasm data in batches with the same preprocessing function
tokenized_sarcasm = sarcasm_dataset.map(preprocess_function, batched=True)

tokenized_sarcasm

Map:   0%|          | 0/8894 [00:00<?, ? examples/s]

Map:   0%|          | 0/1216 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'variety', 'source', 'task', 'input_ids', 'attention_mask'],
        num_rows: 8894
    })
    validation: Dataset({
        features: ['text', 'label', 'variety', 'source', 'task', 'input_ids', 'attention_mask'],
        num_rows: 1216
    })
})

In [26]:
# Fields needed later: model inputs, the target, and the two grouping keys
cols_to_keep_sarc = ["input_ids", "attention_mask", "label", "variety", "source"]

# Everything else present in the train split is dropped
cols_to_drop_sarc = [
    name
    for name in tokenized_sarcasm["train"].column_names
    if name not in cols_to_keep_sarc
]
tokenized_sarcasm = tokenized_sarcasm.remove_columns(cols_to_drop_sarc)

tokenized_sarcasm

DatasetDict({
    train: Dataset({
        features: ['label', 'variety', 'source', 'input_ids', 'attention_mask'],
        num_rows: 8894
    })
    validation: Dataset({
        features: ['label', 'variety', 'source', 'input_ids', 'attention_mask'],
        num_rows: 1216
    })
})

In [27]:
from transformers import TrainingArguments

# Training configuration for the sarcasm task
training_args_sarc = TrainingArguments(
    output_dir="roberta_sarcasm",      # folder for sarcasm checkpoints
    num_train_epochs=3,                # number of epochs
    learning_rate=2e-5,                # standard LR for fine-tuning
    per_device_train_batch_size=16,    # batch size for training
    per_device_eval_batch_size=32,     # batch size for evaluation
    weight_decay=0.01,                 # regularization strength
    logging_steps=100,                 # log training information every 100 steps
    fp16=True,                         # 16-bit mixed-precision training
    # Disable external logging integrations explicitly instead of relying on
    # the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [28]:
from transformers import Trainer

# Train and validation splits of the tokenized sarcasm dataset
train_dataset_sarc, eval_dataset_sarc = (
    tokenized_sarcasm[split_name] for split_name in ("train", "validation")
)

# Same metric function as the sentiment run (accuracy + macro-F1)
trainer_sarc = Trainer(
    model=model_sarc,
    args=training_args_sarc,
    train_dataset=train_dataset_sarc,
    eval_dataset=eval_dataset_sarc,
    compute_metrics=compute_metrics,
)

In [29]:
# Train RoBERTa-large on the sarcasm task
trainer_sarc.train()

Step,Training Loss
100,0.3753
200,0.3877
300,0.361
400,0.3595
500,0.3621
600,0.4064
700,0.366
800,0.3703
900,0.3511
1000,0.4188


TrainOutput(global_step=1668, training_loss=0.3486763887839923, metrics={'train_runtime': 772.3425, 'train_samples_per_second': 34.547, 'train_steps_per_second': 2.16, 'total_flos': 6216448159079424.0, 'train_loss': 0.3486763887839923, 'epoch': 3.0})

In [30]:
# Evaluate on the sarcasm validation split (global)
# (no dataset is passed here; the Trainer was built with eval_dataset=eval_dataset_sarc)
eval_results_sarc = trainer_sarc.evaluate()
eval_results_sarc

{'eval_loss': 0.32206347584724426,
 'eval_accuracy': 0.8626644736842105,
 'eval_f1_macro': 0.6954780274572434,
 'eval_runtime': 5.8931,
 'eval_samples_per_second': 206.343,
 'eval_steps_per_second': 6.448,
 'epoch': 3.0}

In [31]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np

# Run the fine-tuned sarcasm model over the validation split
pred_output_sarc = trainer_sarc.predict(eval_dataset_sarc)
logits_sarc, labels_sarc = pred_output_sarc.predictions, pred_output_sarc.label_ids

# Predicted class = index of the largest logit
preds_sarc = np.argmax(logits_sarc, axis=-1)

# One row per validation example: gold label, prediction and the two
# metadata fields used for the per-group breakdown
eval_sarc_columns = {
    "label": labels_sarc,
    "pred": preds_sarc,
    "variety": eval_dataset_sarc["variety"],
    "source": eval_dataset_sarc["source"],
}
eval_sarc_df = pd.DataFrame(eval_sarc_columns)

eval_sarc_df.head()

Unnamed: 0,label,pred,variety,source
0,0,0,en-AU,Google
1,0,0,en-AU,Google
2,0,0,en-AU,Google
3,0,0,en-AU,Google
4,0,0,en-AU,Google


In [33]:
# Accuracy and macro-F1 broken down by (variety, source)
# NOTE(review): when a group contains a single class in both labels and
# predictions, macro-F1 appears to be averaged over that one class only
# (hence a score of 1.0) -- confirm against the scikit-learn f1_score docs
# before comparing F1 across groups.
groups = eval_sarc_df.groupby(["variety", "source"])

# One record per group, assembled into a DataFrame in a single step
results_sarc_by_variety = pd.DataFrame([
    {
        "variety": variety_name,
        "source": source_name,
        "n_samples": len(group_rows),
        "accuracy": accuracy_score(
            group_rows["label"].values, group_rows["pred"].values
        ),
        "f1_macro": f1_score(
            group_rows["label"].values, group_rows["pred"].values, average="macro"
        ),
    }
    for (variety_name, source_name), group_rows in groups
])
results_sarc_by_variety

Unnamed: 0,variety,source,n_samples,accuracy,f1_macro
0,en-AU,Google,130,0.923077,0.48
1,en-AU,Reddit,241,0.701245,0.683542
2,en-IN,Google,225,0.991111,0.497768
3,en-IN,Reddit,230,0.834783,0.522509
4,en-UK,Google,249,1.0,1.0
5,en-UK,Reddit,141,0.680851,0.591304
