**SetFit MODEL - 3** 

**Model: MiniLM-L3-v2**

N = 16 (few-shot training examples sampled per class)

In [5]:
from setfit import SetFitModel, SetFitTrainer
from sentence_transformers.losses import CosineSimilarityLoss
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import os

# === CONFIG ===
# Input CSV, output directory for the trained model, number of training
# examples drawn per class, and the cap on evaluation rows.
DATA_PATH = "multilang_sarcasm_dataset.csv"
MODEL_PATH = "model/setfit_multilang_sarcasm_en"
N_SHOT = 16
MAX_TEST_SAMPLES = 1000

# === LOAD & PREPROCESS ===
# Single pipeline: read the CSV, keep rows whose `lang` is "en", keep only the
# headline and label columns, rename them to `text` / `label`, then drop rows
# where either value is missing.
df = (
    pd.read_csv(DATA_PATH)
    .loc[lambda raw: raw["lang"] == "en", ["article_title", "is_sarcastic"]]
    .rename(columns={"article_title": "text", "is_sarcastic": "label"})
    .dropna(subset=["text", "label"])
)

# === TRAIN/TEST SPLIT ===
# 80/20 split, stratified on the label, with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# Few-shot sampling
def sample_few_shot(df, n=64, label_col="label", seed=42):
    """Draw up to ``n`` rows per class for few-shot training.

    Rows are grouped by ``label_col`` and ``min(n, group size)`` rows are
    sampled from each group with a fixed seed, so a class with fewer than
    ``n`` rows contributes all of its rows instead of raising.

    Args:
        df: DataFrame containing a ``label_col`` column.
        n: Maximum number of rows to draw from each class.
        label_col: Name of the column holding the class label.
        seed: ``random_state`` passed to every per-class ``sample`` call.

    Returns:
        A new DataFrame with the same columns as ``df`` (label column
        included), classes in sorted label order, and a fresh 0..k-1 index.
    """
    # Sample each group explicitly rather than via ``groupby(...).apply``:
    # applying over the grouping column is deprecated in pandas (it emits a
    # FutureWarning) and would eventually remove the label column from the
    # result, which the training code relies on.
    picks = [
        group.sample(n=min(n, len(group)), random_state=seed)
        for _, group in df.groupby(label_col)
    ]
    if not picks:
        # No groups at all (empty frame, or every label missing):
        # pd.concat([]) would raise, so return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(picks).reset_index(drop=True)

# === FEW-SHOT TRAIN SET & EVALUATION SUBSET ===
# N_SHOT rows per label from the training split; a seeded subset of the
# test split capped at MAX_TEST_SAMPLES rows.
fewshot_train_df = sample_few_shot(train_df, n=N_SHOT)
test_subset_df = test_df.sample(
    n=min(MAX_TEST_SAMPLES, len(test_df)),
    random_state=42,
)

# Wrap both frames as HuggingFace datasets
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (fewshot_train_df, test_subset_df)
)

# === LOAD BASE MODEL ===
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-MiniLM-L3-v2"
)

# === TRAIN SETUP ===
# eval_dataset is handed to the trainer, but the metrics reported below are
# computed by hand from model.predict. The column mapping is the identity,
# since the frame already uses `text` / `label`.
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    num_epochs=1,
    num_iterations=50,
    batch_size=16,
    column_mapping={"text": "text", "label": "label"},
)
trainer.train()

# Persist the trained model
model.save_pretrained(MODEL_PATH)

# === EVALUATE ===
# Score predictions on the evaluation subset against its labels.
y_true = test_dataset["label"]
y_pred = model.predict(test_dataset["text"])
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")

  return df.groupby("label").apply(lambda x: x.sample(n=min(n, len(x)), random_state=42)).reset_index(drop=True)
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
Generating Training Pairs: 100%|██████████| 50/50 [00:00<00:00, 2996.53it/s]
***** Running training *****
  Num examples = 3200
  Num epochs = 1
  Total optimization steps = 200
  Total train batch size = 16
                                                                     

Step,Training Loss


Accuracy: 0.6480 | F1 Score: 0.6576


**SetFit MODEL - 3** 

**Model: MiniLM-L3-v2**

N = 32 (few-shot training examples sampled per class)

In [6]:
from setfit import SetFitModel, SetFitTrainer
from sentence_transformers.losses import CosineSimilarityLoss
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import os

# === CONFIG ===
# Input CSV, output directory for the trained model, number of training
# examples drawn per class, and the cap on evaluation rows.
DATA_PATH = "multilang_sarcasm_dataset.csv"
MODEL_PATH = "model/setfit_multilang_sarcasm_en_32"
N_SHOT = 32
MAX_TEST_SAMPLES = 1000

# === LOAD & PREPROCESS ===
# Single pipeline: read the CSV, keep rows whose `lang` is "en", keep only the
# headline and label columns, rename them to `text` / `label`, then drop rows
# where either value is missing.
df = (
    pd.read_csv(DATA_PATH)
    .loc[lambda raw: raw["lang"] == "en", ["article_title", "is_sarcastic"]]
    .rename(columns={"article_title": "text", "is_sarcastic": "label"})
    .dropna(subset=["text", "label"])
)

# === TRAIN/TEST SPLIT ===
# 80/20 split, stratified on the label, with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# Few-shot sampling
def sample_few_shot(df, n=64, label_col="label", seed=42):
    """Draw up to ``n`` rows per class for few-shot training.

    Rows are grouped by ``label_col`` and ``min(n, group size)`` rows are
    sampled from each group with a fixed seed, so a class with fewer than
    ``n`` rows contributes all of its rows instead of raising.

    Args:
        df: DataFrame containing a ``label_col`` column.
        n: Maximum number of rows to draw from each class.
        label_col: Name of the column holding the class label.
        seed: ``random_state`` passed to every per-class ``sample`` call.

    Returns:
        A new DataFrame with the same columns as ``df`` (label column
        included), classes in sorted label order, and a fresh 0..k-1 index.
    """
    # Sample each group explicitly rather than via ``groupby(...).apply``:
    # applying over the grouping column is deprecated in pandas (it emits a
    # FutureWarning) and would eventually remove the label column from the
    # result, which the training code relies on.
    picks = [
        group.sample(n=min(n, len(group)), random_state=seed)
        for _, group in df.groupby(label_col)
    ]
    if not picks:
        # No groups at all (empty frame, or every label missing):
        # pd.concat([]) would raise, so return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(picks).reset_index(drop=True)

# === FEW-SHOT TRAIN SET & EVALUATION SUBSET ===
# N_SHOT rows per label from the training split; a seeded subset of the
# test split capped at MAX_TEST_SAMPLES rows.
fewshot_train_df = sample_few_shot(train_df, n=N_SHOT)
test_subset_df = test_df.sample(
    n=min(MAX_TEST_SAMPLES, len(test_df)),
    random_state=42,
)

# Wrap both frames as HuggingFace datasets
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (fewshot_train_df, test_subset_df)
)

# === LOAD BASE MODEL ===
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-MiniLM-L3-v2"
)

# === TRAIN SETUP ===
# eval_dataset is handed to the trainer, but the metrics reported below are
# computed by hand from model.predict. The column mapping is the identity,
# since the frame already uses `text` / `label`.
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    num_epochs=1,
    num_iterations=50,
    batch_size=16,
    column_mapping={"text": "text", "label": "label"},
)
trainer.train()

# Persist the trained model
model.save_pretrained(MODEL_PATH)

# === EVALUATE ===
# Score predictions on the evaluation subset against its labels.
y_true = test_dataset["label"]
y_pred = model.predict(test_dataset["text"])
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")

  return df.groupby("label").apply(lambda x: x.sample(n=min(n, len(x)), random_state=42)).reset_index(drop=True)
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
Generating Training Pairs: 100%|██████████| 50/50 [00:00<00:00, 1391.36it/s]
***** Running training *****
  Num examples = 6400
  Num epochs = 1
  Total optimization steps = 400
  Total train batch size = 16
                                                                     

Step,Training Loss


Accuracy: 0.6900 | F1 Score: 0.6645


**SetFit MODEL - 3** 

**Model: MiniLM-L3-v2**

N = 64 (few-shot training examples sampled per class)

In [1]:
from setfit import SetFitModel, SetFitTrainer
from sentence_transformers.losses import CosineSimilarityLoss
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import os

# === CONFIG ===
# Input CSV, output directory for the trained model, number of training
# examples drawn per class, and the cap on evaluation rows.
DATA_PATH = "multilang_sarcasm_dataset.csv"
MODEL_PATH = "model/setfit_multilang_sarcasm_en_64"
N_SHOT = 64
MAX_TEST_SAMPLES = 1000

# === LOAD & PREPROCESS ===
# Single pipeline: read the CSV, keep rows whose `lang` is "en", keep only the
# headline and label columns, rename them to `text` / `label`, then drop rows
# where either value is missing.
df = (
    pd.read_csv(DATA_PATH)
    .loc[lambda raw: raw["lang"] == "en", ["article_title", "is_sarcastic"]]
    .rename(columns={"article_title": "text", "is_sarcastic": "label"})
    .dropna(subset=["text", "label"])
)

# === TRAIN/TEST SPLIT ===
# 80/20 split, stratified on the label, with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# Few-shot sampling
def sample_few_shot(df, n=64, label_col="label", seed=42):
    """Draw up to ``n`` rows per class for few-shot training.

    Rows are grouped by ``label_col`` and ``min(n, group size)`` rows are
    sampled from each group with a fixed seed, so a class with fewer than
    ``n`` rows contributes all of its rows instead of raising.

    Args:
        df: DataFrame containing a ``label_col`` column.
        n: Maximum number of rows to draw from each class.
        label_col: Name of the column holding the class label.
        seed: ``random_state`` passed to every per-class ``sample`` call.

    Returns:
        A new DataFrame with the same columns as ``df`` (label column
        included), classes in sorted label order, and a fresh 0..k-1 index.
    """
    # Sample each group explicitly rather than via ``groupby(...).apply``:
    # applying over the grouping column is deprecated in pandas (it emits a
    # FutureWarning) and would eventually remove the label column from the
    # result, which the training code relies on.
    picks = [
        group.sample(n=min(n, len(group)), random_state=seed)
        for _, group in df.groupby(label_col)
    ]
    if not picks:
        # No groups at all (empty frame, or every label missing):
        # pd.concat([]) would raise, so return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(picks).reset_index(drop=True)

# === FEW-SHOT TRAIN SET & EVALUATION SUBSET ===
# N_SHOT rows per label from the training split; a seeded subset of the
# test split capped at MAX_TEST_SAMPLES rows.
fewshot_train_df = sample_few_shot(train_df, n=N_SHOT)
test_subset_df = test_df.sample(
    n=min(MAX_TEST_SAMPLES, len(test_df)),
    random_state=42,
)

# Wrap both frames as HuggingFace datasets
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (fewshot_train_df, test_subset_df)
)

# === LOAD BASE MODEL ===
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-MiniLM-L3-v2"
)

# === TRAIN SETUP ===
# eval_dataset is handed to the trainer, but the metrics reported below are
# computed by hand from model.predict. The column mapping is the identity,
# since the frame already uses `text` / `label`.
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    num_epochs=1,
    num_iterations=50,
    batch_size=16,
    column_mapping={"text": "text", "label": "label"},
)
trainer.train()

# Persist the trained model
model.save_pretrained(MODEL_PATH)

# === EVALUATE ===
# Score predictions on the evaluation subset against its labels.
y_true = test_dataset["label"]
y_pred = model.predict(test_dataset["text"])
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")

  from .autonotebook import tqdm as notebook_tqdm
  return df.groupby("label").apply(lambda x: x.sample(n=min(n, len(x)), random_state=42)).reset_index(drop=True)
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
Generating Training Pairs: 100%|██████████| 50/50 [00:00<00:00, 1095.85it/s]
***** Running training *****
  Num examples = 12800
  Num epochs = 1
  Total optimization steps = 800
  Total train batch size = 16
                                                                     

Step,Training Loss
500,0.1663


Accuracy: 0.7150 | F1 Score: 0.7016
