In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from, and
# save trained models under, /content/drive/MyDrive/NNDL.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Read the line-delimited JSON headlines file from Drive, keep only the headline
# and its sarcasm flag (renamed to "text" / "label"), and drop incomplete rows.
df = (
    pd.read_json("/content/drive/MyDrive/NNDL/Sarcasm_Headlines_Dataset.json", lines=True)
    .loc[:, ["headline", "is_sarcastic"]]
    .rename(columns={"headline": "text", "is_sarcastic": "label"})
    .dropna(subset=["text", "label"])
)

# Quick look at the first rows.
print(df.head())


                                                text  label
0  former versace store clerk sues over secret 'b...      0
1  the 'roseanne' revival catches up to our thorn...      0
2  mom starting to fear son's web series closest ...      1
3  boehner just wants wife to listen, not come up...      1
4  j.k. rowling wishes snape happy birthday in th...      0


In [None]:
# Row count of the cleaned frame.
df.shape[0]

26709

In [None]:
!pip install setfit

Collecting setfit
  Downloading setfit-1.1.2-py3-none-any.whl.metadata (12 kB)
Collecting datasets>=2.15.0 (from setfit)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate>=0.3.0 (from setfit)
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=2.15.0->setfit)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=3->sentence-transformers[train]>=3->setfit)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=3->sentence-transformers[train]>=3->setfit)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-tra

In [None]:
from setfit import SetFitModel, SetFitTrainer
from sentence_transformers.losses import CosineSimilarityLoss
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import os

In [None]:
import os

# Ask the Hugging Face training stack not to report runs to Weights & Biases.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Define `ds` in this cell so it runs on a fresh kernel (no other cell creates it).
# NOTE(review): the repo id is taken from the hf:// CSV path used by the cells
# below; confirm it reproduces the DatasetDict shown in this cell's output.
from datasets import load_dataset

ds = load_dataset("helinivan/sarcasm_headlines_multilingual")
ds

DatasetDict({
    train: Dataset({
        features: ['article_url', 'article_title', 'is_sarcastic', 'lang', 'title_length'],
        num_rows: 67479
    })
})

In [None]:
import pandas as pd

# Pull the multilingual sarcasm-headlines CSV through the hf:// path.
df = pd.read_csv(
    filepath_or_buffer="hf://datasets/helinivan/sarcasm_headlines_multilingual/multilang_sarcasm_dataset.csv"
)

In [None]:
# Display the loaded frame to inspect its columns and the languages it contains.
df

Unnamed: 0,article_url,article_title,is_sarcastic,lang,title_length
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,en,78
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,en,84
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,en,79
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,en,84
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,en,64
...,...,...,...,...,...
67474,https://speld.nl/2022/08/27/dit-is-de-enige-we...,dit is de enige wettelijk toegestane manier om...,1,nl,73
67475,https://speld.nl/2022/03/27/nieuwe-fitnesspas-...,nieuwe fitnesspas 200 euro zodat je iedere dag...,1,nl,81
67476,https://speld.nl/2022/09/23/wilco-stond-5-minu...,wilco stond 5 minuten in de rij voor de kassa ...,1,nl,85
67477,https://speld.nl/2022/09/17/nemen-de-britten-w...,nemen de britten wel genoeg tijd om te rouwen?,1,nl,46


**SetFit MODEL - 1** 

**Model: MPNET BASE v2**

N = 64

In [None]:
from setfit import SetFitModel, SetFitTrainer
from sentence_transformers.losses import CosineSimilarityLoss
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import os

# === CONFIG ===
# Source CSV on the Hugging Face Hub and the Drive folder the trained model is saved to.
DATA_PATH = "hf://datasets/helinivan/sarcasm_headlines_multilingual/multilang_sarcasm_dataset.csv"
MODEL_PATH = "/content/drive/MyDrive/NNDL/setfit_multilang_sarcasm_en"
# Training examples drawn per class, and the cap on evaluation rows.
N_SHOT, MAX_TEST_SAMPLES = 64, 1000

# === LOAD & PREPROCESS ===
# One pipeline: English rows only -> (text, label) columns -> no missing values.
df = (
    pd.read_csv(DATA_PATH)
    .loc[lambda frame: frame["lang"] == "en", ["article_title", "is_sarcastic"]]
    .rename(columns={"article_title": "text", "is_sarcastic": "label"})
    .dropna(subset=["text", "label"])
)

# === TRAIN/TEST SPLIT ===
# Stratified 80/20 split with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# Few-shot sampling
def sample_few_shot(df, n=64, label_col="label", random_state=42):
    """Build a class-balanced few-shot subset of ``df``.

    Every distinct value of ``label_col`` contributes ``min(n, class size)`` rows,
    drawn without replacement with a fixed seed, so repeated calls return the same
    rows. Classes appear in sorted label order and the result gets a fresh
    0..k-1 index.

    Groups are sampled in an explicit loop rather than through
    ``DataFrameGroupBy.apply``: applying a function that operates on the grouping
    column is deprecated in pandas 2.2+ (pandas warns about it), and later pandas
    releases are documented to stop passing that column to the function, which
    would drop the label column from the result.

    Args:
        df: Frame holding at least the ``label_col`` column.
        n: Maximum number of rows to draw per class.
        label_col: Column whose values define the classes.
        random_state: Seed passed to each per-class ``DataFrame.sample`` call.

    Returns:
        A new DataFrame with the same columns as ``df`` and a reset index.
    """
    per_class = [
        group.sample(n=min(n, len(group)), random_state=random_state)
        for _, group in df.groupby(label_col)
    ]
    if not per_class:
        # No groups at all (empty input): pd.concat([]) would raise, so hand
        # back an empty frame that still carries the original columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(per_class).reset_index(drop=True)

# Draw the balanced few-shot training set and a capped evaluation sample.
fewshot_train_df = sample_few_shot(train_df, N_SHOT)
test_subset_df = test_df.sample(
    n=min(len(test_df), MAX_TEST_SAMPLES),
    random_state=42,
)

# Wrap both frames as Hugging Face datasets.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (fewshot_train_df, test_subset_df)
)

# === LOAD BASE MODEL ===
model = SetFitModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")

# === TRAIN SETUP ===
# Contrastive fine-tuning of the sentence encoder with a cosine-similarity loss.
trainer = SetFitTrainer(
    model=model,
    loss_class=CosineSimilarityLoss,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    column_mapping={"text": "text", "label": "label"},
    num_iterations=50,
    num_epochs=1,
    batch_size=16,
)
trainer.train()

# Persist the trained model to Drive before scoring it.
model.save_pretrained(MODEL_PATH)

# Score the model on the held-out sample.
y_true = test_dataset["label"]
y_pred = model.predict(test_dataset["text"])
acc, f1 = accuracy_score(y_true, y_pred), f1_score(y_true, y_pred)

print(f"Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")


  return df.groupby("label").apply(lambda x: x.sample(n=min(n, len(x)), random_state=42)).reset_index(drop=True)


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Map:   0%|          | 0/128 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 12800
  Batch size = 16
  Num epochs = 1


Step,Training Loss
1,0.5432
50,0.324
100,0.2248
150,0.0169
200,0.0004
250,0.0003
300,0.0002
350,0.0002
400,0.0002
450,0.0002


✅ Accuracy: 0.8200 | F1 Score: 0.8133


**Model: MPNet Base v2**

N = 32

In [None]:
# === FEW-SHOT SETUP FOR N_SHOT = 32 ===
# Re-runs the MPNet experiment with a smaller training set. Relies on `train_df`,
# `test_subset_df` and `sample_few_shot` from the N = 64 cell above.
N_SHOT = 32
MODEL_PATH = f"/content/drive/MyDrive/NNDL/setfit_multilang_sarcasm_en_N{N_SHOT}"

# Use the shared sampling helper instead of re-inlining the groupby logic, so
# every run draws its few-shot set the same way.
fewshot_train_df = sample_few_shot(train_df, N_SHOT)

train_dataset = Dataset.from_pandas(fewshot_train_df)
test_dataset = Dataset.from_pandas(test_subset_df)

model = SetFitModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_iterations=50,
    num_epochs=1,
    column_mapping={"text": "text", "label": "label"},
)
trainer.train()

model.save_pretrained(MODEL_PATH)

# Evaluate
y_true = test_dataset["label"]
y_pred = model.predict(test_dataset["text"])
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
# Report the shot count from N_SHOT so the label cannot drift from the config.
print(f"📊 N={N_SHOT} | Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")


  fewshot_train_df = train_df.groupby("label").apply(
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Map:   0%|          | 0/64 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 6400
  Batch size = 16
  Num epochs = 1


Step,Training Loss
1,0.5228
50,0.2879


Step,Training Loss
1,0.5228
50,0.2879
100,0.0829
150,0.0006
200,0.0003
250,0.0002
300,0.0002
350,0.0002
400,0.0002


📊 N=32 | Accuracy: 0.7630 | F1 Score: 0.7584


**Model: MPNet Base v2**

N = 16

In [None]:
# === FEW-SHOT SETUP FOR N_SHOT = 16 ===
# Re-runs the MPNet experiment with a smaller training set. Relies on `train_df`,
# `test_subset_df` and `sample_few_shot` from the N = 64 cell above.
N_SHOT = 16
MODEL_PATH = f"/content/drive/MyDrive/NNDL/setfit_multilang_sarcasm_en_N{N_SHOT}"

# Use the shared sampling helper instead of re-inlining the groupby logic, so
# every run draws its few-shot set the same way.
fewshot_train_df = sample_few_shot(train_df, N_SHOT)

train_dataset = Dataset.from_pandas(fewshot_train_df)
test_dataset = Dataset.from_pandas(test_subset_df)

model = SetFitModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_iterations=50,
    num_epochs=1,
    column_mapping={"text": "text", "label": "label"},
)
trainer.train()

model.save_pretrained(MODEL_PATH)

# Evaluate
y_true = test_dataset["label"]
y_pred = model.predict(test_dataset["text"])
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
# Report the shot count from N_SHOT so the label cannot drift from the config.
print(f"📊 N={N_SHOT} | Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")


  fewshot_train_df = train_df.groupby("label").apply(
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Map:   0%|          | 0/32 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 3200
  Batch size = 16
  Num epochs = 1


Step,Training Loss
1,0.5348
50,0.2341
100,0.0144
150,0.0005
200,0.0004


📊 N=16 | Accuracy: 0.7360 | F1 Score: 0.7565


**RoBERTa-large v1**

N-shot = 64

In [None]:
from setfit import SetFitModel, SetFitTrainer
from sentence_transformers.losses import CosineSimilarityLoss
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import os

# === CONFIG ===
# Source CSV on the Hugging Face Hub and the Drive folder the trained model is saved to.
DATA_PATH = "hf://datasets/helinivan/sarcasm_headlines_multilingual/multilang_sarcasm_dataset.csv"
MODEL_PATH = "/content/drive/MyDrive/NNDL/setfit_multilang_sarcasm_roberta"
# Training examples drawn per class, and the cap on evaluation rows.
N_SHOT, MAX_TEST_SAMPLES = 64, 1000

# === LOAD & PREPROCESS ===
# One pipeline: English rows only -> (text, label) columns -> no missing values.
df = (
    pd.read_csv(DATA_PATH)
    .loc[lambda frame: frame["lang"] == "en", ["article_title", "is_sarcastic"]]
    .rename(columns={"article_title": "text", "is_sarcastic": "label"})
    .dropna(subset=["text", "label"])
)

# === TRAIN/TEST SPLIT ===
# Stratified 80/20 split with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# Few-shot sampling
def sample_few_shot(df, n=64, label_col="label", random_state=42):
    """Build a class-balanced few-shot subset of ``df``.

    Every distinct value of ``label_col`` contributes ``min(n, class size)`` rows,
    drawn without replacement with a fixed seed, so repeated calls return the same
    rows. Classes appear in sorted label order and the result gets a fresh
    0..k-1 index.

    Groups are sampled in an explicit loop rather than through
    ``DataFrameGroupBy.apply``: applying a function that operates on the grouping
    column is deprecated in pandas 2.2+ (pandas warns about it), and later pandas
    releases are documented to stop passing that column to the function, which
    would drop the label column from the result.

    Args:
        df: Frame holding at least the ``label_col`` column.
        n: Maximum number of rows to draw per class.
        label_col: Column whose values define the classes.
        random_state: Seed passed to each per-class ``DataFrame.sample`` call.

    Returns:
        A new DataFrame with the same columns as ``df`` and a reset index.
    """
    per_class = [
        group.sample(n=min(n, len(group)), random_state=random_state)
        for _, group in df.groupby(label_col)
    ]
    if not per_class:
        # No groups at all (empty input): pd.concat([]) would raise, so hand
        # back an empty frame that still carries the original columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(per_class).reset_index(drop=True)

# Draw the balanced few-shot training set and a capped evaluation sample.
fewshot_train_df = sample_few_shot(train_df, N_SHOT)
test_subset_df = test_df.sample(
    n=min(len(test_df), MAX_TEST_SAMPLES),
    random_state=42,
)

# Wrap both frames as Hugging Face datasets.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (fewshot_train_df, test_subset_df)
)

# === LOAD BASE MODEL (ROBERTA) ===
model = SetFitModel.from_pretrained("sentence-transformers/all-roberta-large-v1")

# === TRAIN SETUP ===
# NOTE(review): num_iterations is 10 here but 50 in the MPNet runs, so the
# two encoders are not trained on the same number of pairs; confirm this is intended.
trainer = SetFitTrainer(
    model=model,
    loss_class=CosineSimilarityLoss,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    column_mapping={"text": "text", "label": "label"},
    num_iterations=10,
    num_epochs=1,
    batch_size=16,
)
trainer.train()

# Persist the trained model to Drive before scoring it.
model.save_pretrained(MODEL_PATH)

# Score the model on the held-out sample.
y_true = test_dataset["label"]
y_pred = model.predict(test_dataset["text"])
acc, f1 = accuracy_score(y_true, y_pred), f1_score(y_true, y_pred)

print(f"📊 SETFIT_RoBERTa | Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")

  return df.groupby("label").apply(lambda x: x.sample(n=min(n, len(x)), random_state=42)).reset_index(drop=True)
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for inst

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 2560
  Batch size = 16
  Num epochs = 1


**BASELINE MODELS**

MODEL - 1: Logistic Regression with TF-IDF


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd

# === CONFIG ===
DATA_PATH = "hf://datasets/helinivan/sarcasm_headlines_multilingual/multilang_sarcasm_dataset.csv"
# Training examples drawn per class, and the cap on evaluation rows.
N_SHOT, MAX_TEST_SAMPLES = 64, 1000

# === LOAD & PREPROCESS ===
# One pipeline: English rows only -> (text, label) columns -> no missing values.
df = (
    pd.read_csv(DATA_PATH)
    .loc[lambda frame: frame["lang"] == "en", ["article_title", "is_sarcastic"]]
    .rename(columns={"article_title": "text", "is_sarcastic": "label"})
    .dropna(subset=["text", "label"])
)

# === TRAIN/TEST SPLIT ===
# Stratified 80/20 split with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# === FEW-SHOT SAMPLING ===
def sample_few_shot(df, n=64, label_col="label", random_state=42):
    """Build a class-balanced few-shot subset of ``df``.

    Every distinct value of ``label_col`` contributes ``min(n, class size)`` rows,
    drawn without replacement with a fixed seed, so repeated calls return the same
    rows. Classes appear in sorted label order and the result gets a fresh
    0..k-1 index.

    Groups are sampled in an explicit loop rather than through
    ``DataFrameGroupBy.apply``: applying a function that operates on the grouping
    column is deprecated in pandas 2.2+ (pandas warns about it), and later pandas
    releases are documented to stop passing that column to the function, which
    would drop the label column from the result.

    Args:
        df: Frame holding at least the ``label_col`` column.
        n: Maximum number of rows to draw per class.
        label_col: Column whose values define the classes.
        random_state: Seed passed to each per-class ``DataFrame.sample`` call.

    Returns:
        A new DataFrame with the same columns as ``df`` and a reset index.
    """
    per_class = [
        group.sample(n=min(n, len(group)), random_state=random_state)
        for _, group in df.groupby(label_col)
    ]
    if not per_class:
        # No groups at all (empty input): pd.concat([]) would raise, so hand
        # back an empty frame that still carries the original columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(per_class).reset_index(drop=True)

# Same few-shot training set and capped evaluation sample as the SetFit runs.
fewshot_train_df = sample_few_shot(train_df, N_SHOT)
test_subset_df = test_df.sample(
    n=min(len(test_df), MAX_TEST_SAMPLES),
    random_state=42,
)

# === TF-IDF + LOGISTIC REGRESSION ===
X_train, y_train = fewshot_train_df["text"], fewshot_train_df["label"]
X_test, y_test = test_subset_df["text"], test_subset_df["label"]

# Unigram + bigram TF-IDF features; the vocabulary is fitted on the training texts only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit the linear classifier and predict the evaluation sample.
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)

# Evaluation
acc, f1 = accuracy_score(y_test, y_pred_lr), f1_score(y_test, y_pred_lr)

print(f"📊 Logistic Regression (TF-IDF) | Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return df.groupby("label").apply(lambda x: x.sample(n=min(n, len(x)), random_state=42)).reset_index(drop=True)


📊 Logistic Regression (TF-IDF) | Accuracy: 0.7030 | F1 Score: 0.7021


**Baseline Model - 2**

MODEL: Zero-Shot Classification

In [9]:
#Baseline 2: Zero-Shot Classification with BART-MNLI

from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

# === CONFIG ===
DATA_PATH = "hf://datasets/helinivan/sarcasm_headlines_multilingual/multilang_sarcasm_dataset.csv"
MAX_TEST_SAMPLES = 1000
# Only this many of the sampled test rows are scored, to keep inference time down.
EVAL_SIZE = 200

# === LOAD & PREPROCESS ===
# One pipeline: English rows only -> (text, label) columns -> no missing values.
df = (
    pd.read_csv(DATA_PATH)
    .loc[lambda frame: frame["lang"] == "en", ["article_title", "is_sarcastic"]]
    .rename(columns={"article_title": "text", "is_sarcastic": "label"})
    .dropna(subset=["text", "label"])
)

# === TRAIN/TEST SPLIT ===
# Same stratified split and seed as the other cells; the training half is
# discarded because zero-shot classification needs no training data.
_, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)
test_subset_df = test_df.sample(
    n=min(len(test_df), MAX_TEST_SAMPLES),
    random_state=42,
)

# === ZERO-SHOT CLASSIFICATION ===
from transformers import pipeline

# Build the zero-shot pipeline on top of the BART-MNLI checkpoint.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# The two candidate labels offered to the classifier.
candidate_labels = ["sarcastic", "not sarcastic"]

# Score only the first EVAL_SIZE rows of the sampled test set.
texts = test_subset_df["text"].head(EVAL_SIZE).tolist()
true_labels = test_subset_df["label"].head(EVAL_SIZE).tolist()

# One pipeline call per headline; the first entry of "labels" is taken as the
# prediction and mapped to 1 for "sarcastic", 0 otherwise.
preds = [
    int(classifier(text, candidate_labels)["labels"][0] == "sarcastic")
    for text in texts
]

# Evaluation
acc, f1 = accuracy_score(true_labels, preds), f1_score(true_labels, preds)

print(f"📊 Zero-Shot (BART-MNLI) | Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")



config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


📊 Zero-Shot (BART-MNLI) | Accuracy: 0.5350 | F1 Score: 0.1622
