In [10]:
# run this cell once at top of Colab
# !pip install -q transformers datasets scikit-learn tqdm

import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import numpy as np

# Seed the NumPy and PyTorch global RNGs so that a fresh "Restart & Run All"
# starts from the same random state (classifier-head initialisation and
# DataLoader shuffling both draw from the PyTorch generator).
# NOTE: some CUDA kernels are non-deterministic, so GPU runs may still differ slightly.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Device: cuda


In [11]:
# Fetch the dataset from the Hugging Face Hub.
# If the files were uploaded manually, swap this for pd.read_parquet / pd.read_csv.
ds = load_dataset("ailsntua/QEvasion")

# Materialise both splits as pandas frames with a clean 0..n-1 index.
train_df, test_df = (
    ds[split_name].to_pandas().reset_index(drop=True)
    for split_name in ("train", "test")
)

# Sanity check: split sizes and the columns that are available.
for split_title, frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_title} shape:", frame.shape)
print(train_df.columns.tolist())

# We will use:
# Task1 labels: train_df['clarity_label']  (3 classes)
# Task2 labels: train_df['evasion_label']  (9 subclasses)  (train)
# For test, we will use annotator1/2/3 to compute match-based accuracy and majority for standard metrics.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3448 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

Train shape: (3448, 20)
Test shape: (308, 20)
['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label']


In [12]:
# Build text field (same format for all models)
def make_text(df, sep=" [SEP] "):
    """
    Build the model input string for every row: question, separator, answer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'interview_question' and 'interview_answer'.
        Missing values in either column are treated as empty strings.
    sep : str, optional
        Text inserted between question and answer. Defaults to " [SEP] ",
        the literal used so far; pass a different marker for a tokenizer
        whose separator token is spelled differently.

    Returns
    -------
    pandas.Series
        One concatenated string per row, aligned with ``df``'s index.
    """
    question = df['interview_question'].fillna('')
    answer = df['interview_answer'].fillna('')
    return question + sep + answer

# Attach the combined question/answer string to both splits.
for frame in (train_df, test_df):
    frame['text'] = make_text(frame)

# Fit one label encoder per task, using the training labels only:
# clarity (Task1) and evasion (Task2).
le_clarity = LabelEncoder()
le_clarity.fit(train_df['clarity_label'].astype(str).values)

le_evasion = LabelEncoder()
le_evasion.fit(train_df['evasion_label'].astype(str).values)

print("Clarity classes:", list(le_clarity.classes_))
print("Evasion classes (train):", list(le_evasion.classes_))


Clarity classes: ['Ambivalent', 'Clear Non-Reply', 'Clear Reply']
Evasion classes (train): ['Claims ignorance', 'Clarification', 'Declining to answer', 'Deflection', 'Dodging', 'Explicit', 'General', 'Implicit', 'Partial/half-answer']


In [13]:
class QADataset(Dataset):
    """
    Map-style dataset pairing tokenizer encodings with per-example labels.

    ``encodings`` is a mapping from field name to an indexable of per-example
    values; ``labels`` is an indexable holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels  # torch.tensor

    def __len__(self):
        # The number of examples is defined by the label container.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at idx, then attach the label under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

def build_dataloaders_for_model(model_name, train_texts, train_labels, test_texts, test_labels,
                                max_len=256, batch_size=16):
    """
    Tokenize the train/test texts with ``model_name``'s tokenizer and wrap them in DataLoaders.

    Every text is truncated and padded to exactly ``max_len`` tokens. Labels are
    converted to ``torch.long`` tensors. Only the training loader shuffles.

    Returns
    -------
    (train_dl, test_dl, tokenizer)
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def encode(texts):
        # Fixed-length encoding so every example has the same tensor shape.
        return tokenizer(list(texts), truncation=True, padding='max_length',
                         max_length=max_len, return_tensors='pt')

    def as_dataset(texts, labels):
        return QADataset(encode(texts), torch.tensor(labels, dtype=torch.long))

    train_dl = DataLoader(as_dataset(train_texts, train_labels), batch_size=batch_size, shuffle=True)
    test_dl = DataLoader(as_dataset(test_texts, test_labels), batch_size=batch_size)
    return train_dl, test_dl, tokenizer


In [14]:
# For Task2 evaluation: compute majority label from annotator columns for classification_report / macro-f1
# For annotator-match accuracy: we will also keep the three annotator values per example.

def test_labels_for_evaluation(test_df, le):
    # le: label encoder fitted on train evasion labels
    # create majority label (mode). If tie, pandas.mode returns all; choose first.
    def mode_label(row):
        # annotator columns names assumed: 'annotator1', 'annotator2', 'annotator3'
        ann = [str(row.get('annotator1', '')).strip(), str(row.get('annotator2', '')).strip(), str(row.get('annotator3', '')).strip()]
        ann = [a for a in ann if a and a.lower() != 'nan']
        if len(ann)==0:
            return None
        return pd.Series(ann).mode().iloc[0]  # first mode

    majority = test_df.apply(mode_label, axis=1)
    # Map majority to label indices; if majority not in encoder classes, map to -1 and filter later
    maj_indices = []
    for m in majority:
        if pd.isna(m) or m is None:
            maj_indices.append(-1)
        else:
            try:
                maj_indices.append(int(le.transform([m])[0]))
            except Exception:
                # If annotator label strings don't match training labels exactly, map to -1
                maj_indices.append(-1)
    # annotator lists
    annotator_lists = []
    for _, row in test_df.iterrows():
        ann = [str(row.get('annotator1', '')).strip(), str(row.get('annotator2', '')).strip(), str(row.get('annotator3', '')).strip()]
        annotator_lists.append([a for a in ann if a and a.lower()!='nan'])
    return np.array(maj_indices), annotator_lists


In [15]:
from sklearn.metrics import precision_recall_fscore_support

def train_and_evaluate(model_name, task='clarity', epochs=3, batch_size=16, max_len=256,
                       lr=5e-5, save_to_drive=True, drive_path="/content/drive/MyDrive/semeval_models"):
    """
    Fine-tune a Hugging Face sequence classifier on the train split and score it on the test split.

    Relies on the notebook-level names ``train_df``, ``test_df``, ``le_clarity``,
    ``le_evasion`` and ``device``.

    Parameters
    ----------
    model_name : str
        HF model id (e.g., 'bert-base-uncased', 'microsoft/deberta-v3-base').
    task : {'clarity', 'evasion'}
        'clarity' scores against ``test_df['clarity_label']``; 'evasion' scores
        against the annotator majority label and additionally reports
        annotator-match accuracy.
    epochs, batch_size, max_len, lr
        Training hyper-parameters (linear schedule, no warm-up).
    save_to_drive : bool
        When true, mounts Google Drive (Colab only) and saves model + tokenizer
        under ``drive_path``.
    drive_path : str
        Parent directory for the saved model folder.

    Returns
    -------
    dict
        'model', 'tokenizer', 'accuracy', 'macro_f1', 'preds', 'trues' and
        'annotator_match_accuracy' (``None`` for the clarity task).

    Raises
    ------
    AssertionError
        If ``task`` is not one of the two supported values.
    ValueError
        For the evasion task, if no test row has a majority label known to
        ``le_evasion`` (raised before any training happens).
    """
    assert task in ['clarity','evasion']
    print(f"\n=== Training {model_name} for task {task} ===\n")

    if task == 'clarity':
        train_labels = le_clarity.transform(train_df['clarity_label'].astype(str).values)
        test_labels = le_clarity.transform(test_df['clarity_label'].astype(str).values)
        label_names = list(le_clarity.classes_)
        # No special annotator-match logic for Task1
        annotator_match_info = None
    else:  # evasion
        train_labels = le_evasion.transform(train_df['evasion_label'].astype(str).values)
        # Majority label per test row (-1 when it cannot be mapped) plus the raw annotator strings.
        maj_indices, annotator_lists = test_labels_for_evaluation(test_df, le_evasion)
        if not (maj_indices != -1).any():
            # Fail before spending time on training: there would be nothing to score.
            raise ValueError(
                "No test example has a majority annotator label that matches the training "
                "evasion classes; cannot evaluate the evasion task."
            )
        test_labels = maj_indices
        label_names = list(le_evasion.classes_)
        annotator_match_info = annotator_lists
    num_labels = len(label_names)

    # Build dataloaders (uses tokenizer corresponding to model_name)
    train_loader, test_loader, tokenizer = build_dataloaders_for_model(model_name, train_df['text'], train_labels,
                                                                       test_df['text'], test_labels,
                                                                       max_len=max_len, batch_size=batch_size)
    # Model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    total_steps = len(train_loader) * epochs
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Training loop
    model.train()
    for epoch in range(epochs):
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
        for batch in loop:
            batch = {k:v.to(device) for k,v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            loop.set_postfix(loss=loss.item())

    # Evaluation
    model.eval()
    preds_all, trues_all = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            # Keep the labels out of the forward pass: the evasion test labels may
            # contain the -1 placeholder, which is not a valid class index for the
            # loss the model would otherwise compute. Only the logits are needed.
            labels = batch['labels']
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            logits = model(**inputs).logits
            preds_all.extend(torch.argmax(logits, dim=-1).cpu().tolist())
            trues_all.extend(labels.tolist())

    preds_all = np.array(preds_all)
    trues_all = np.array(trues_all)

    # For evasion: filter out test rows where majority mapping was -1 (unmapped); for clarity none are -1
    if task == 'evasion':
        valid_idx = trues_all != -1
        preds_valid = preds_all[valid_idx]
        trues_valid = trues_all[valid_idx]
        annotator_lists_valid = [annotator_match_info[i] for i, ok in enumerate(valid_idx) if ok]
        n_dropped = int((~valid_idx).sum())
        if n_dropped:
            print(f"Skipped {n_dropped} test example(s) without a usable majority label.")
    else:
        preds_valid = preds_all
        trues_valid = trues_all
        annotator_lists_valid = None

    # Standard metrics (overall accuracy + classification report)
    overall_acc = accuracy_score(trues_valid, preds_valid)
    macro_f1 = f1_score(trues_valid, preds_valid, average='macro')
    print(f"\nOverall Accuracy: {overall_acc:.4f}")
    print(f"Macro F1: {macro_f1:.4f}  (primary metric for SemEval)")
    print("\nClassification report:")
    # Pass the observed label set explicitly so target_names always lines up with it,
    # even when some class never occurs in either the truths or the predictions.
    present = np.unique(np.concatenate([trues_valid, preds_valid]))
    print(classification_report(trues_valid, preds_valid, labels=present,
                                target_names=[label_names[i] for i in present], zero_division=0))

    # Additional Task2 metric: annotator-match accuracy
    match_acc = None
    if task == 'evasion':
        # A prediction counts as correct when its label string equals any annotator's label.
        hits = sum(label_names[pred_label] in ann_list
                   for pred_label, ann_list in zip(preds_valid, annotator_lists_valid))
        match_acc = hits / len(preds_valid)
        print(f"\nAnnotator-match accuracy (pred matches any annotator1/2/3): {match_acc:.4f}")

    # Save model+tokenizer to Drive if requested
    if save_to_drive:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=False)
        safe_name = model_name.replace('/', '_')
        out_dir = os.path.join(drive_path, f"{safe_name}_{task}")
        os.makedirs(out_dir, exist_ok=True)
        model.save_pretrained(out_dir)
        tokenizer.save_pretrained(out_dir)
        print(f"\nSaved model & tokenizer to {out_dir}")

    return {
        'model': model,
        'tokenizer': tokenizer,
        'accuracy': overall_acc,
        'macro_f1': macro_f1,
        'preds': preds_valid,
        'trues': trues_valid,
        'annotator_match_accuracy': match_acc
    }


In [9]:
# uncomment to run again
# res_bert = train_and_evaluate("bert-base-uncased", task='clarity', epochs=3, batch_size=16, max_len=256, save_to_drive=True)



=== Training bert-base-uncased for task clarity ===



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 216/216 [02:34<00:00,  1.40it/s, loss=0.754]
Epoch 2/3: 100%|██████████| 216/216 [02:33<00:00,  1.41it/s, loss=1.1]
Epoch 3/3: 100%|██████████| 216/216 [02:33<00:00,  1.41it/s, loss=1.04]
Evaluating: 100%|██████████| 20/20 [00:04<00:00,  4.52it/s]



Overall Accuracy: 0.6623
Macro F1: 0.4736  (primary metric for SemEval)

Classification report:
                 precision    recall  f1-score   support

     Ambivalent       0.73      0.84      0.78       206
Clear Non-Reply       0.29      0.22      0.25        23
    Clear Reply       0.47      0.33      0.39        79

       accuracy                           0.66       308
      macro avg       0.50      0.46      0.47       308
   weighted avg       0.63      0.66      0.64       308

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Saved model & tokenizer to /content/drive/MyDrive/semeval_models/bert-base-uncased_clarity


In [16]:
# uncomment to run again
# res_distilbert_base_uncased = train_and_evaluate("distilbert-base-uncased", task='clarity', epochs=3, batch_size=16, max_len=256, save_to_drive=True)


=== Training distilbert-base-uncased for task clarity ===



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 216/216 [01:08<00:00,  3.15it/s, loss=0.75]
Epoch 2/3: 100%|██████████| 216/216 [01:12<00:00,  2.98it/s, loss=0.641]
Epoch 3/3: 100%|██████████| 216/216 [01:12<00:00,  2.97it/s, loss=0.435]
Evaluating: 100%|██████████| 20/20 [00:02<00:00,  9.87it/s]



Overall Accuracy: 0.6948
Macro F1: 0.5311  (primary metric for SemEval)

Classification report:
                 precision    recall  f1-score   support

     Ambivalent       0.74      0.88      0.80       206
Clear Non-Reply       0.41      0.39      0.40        23
    Clear Reply       0.59      0.29      0.39        79

       accuracy                           0.69       308
      macro avg       0.58      0.52      0.53       308
   weighted avg       0.67      0.69      0.67       308

Mounted at /content/drive

Saved model & tokenizer to /content/drive/MyDrive/semeval_models/distilbert-base-uncased_clarity


In [18]:
# uncomment to run again
# res_bert_large_uncased = train_and_evaluate("bert-large-uncased", task='clarity', epochs=3, batch_size=16, max_len=256, save_to_drive=True)