Load the training data 

In [1]:
# Location of the balanced sarcasm training CSV, relative to the notebook.
PATH_TO_DATA = "../input/sarcasm/train-balanced-sarcasm.csv"

In [2]:
!pip install transformers datasets evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.0.0->accelerate)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cus

In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from transformers import RobertaTokenizer
from datasets import Dataset

In [4]:
# Read the training CSV into a DataFrame and preview its first rows.
train_df = pd.read_csv(PATH_TO_DATA)
train_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [None]:
# Print pandas' summary of the frame to see which columns contain missing values.
train_df.info()

Drop rows with a missing comment

In [6]:
# Keep only the rows that have comment text. dropna returns a new frame, so the
# result is reassigned instead of mutating the loaded frame with inplace=True.
train_df = train_df.dropna(subset=['comment'])

Label counts are balanced:

In [7]:
# Count how many comments carry each value of the label column.
train_df['label'].value_counts()

label
0    505403
1    505368
Name: count, dtype: int64

### Exploration of the Training Data

Most common words in non-sarcastic comments

Most common words in sarcastic comments

Split into training, validation and test sets

In [8]:
# Hold out a stratified 20% of the comments as the test set.
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    train_df['comment'],
    train_df['label'],
    stratify=train_df['label'],
    test_size=0.2,
    random_state=42,
)

# Carve the validation set out of the remaining 80%:
# 10% of that remainder, i.e. 8% of all rows.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_val_texts,
    train_val_labels,
    stratify=train_val_labels,
    test_size=0.1,
    random_state=42,
)

Convert into Hugging Face `Dataset` format

In [9]:
# Wrap each (texts, labels) pair in a Dataset with 'text' and 'label' columns,
# in the order train, validation, test.
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_dict({'text': split_texts, 'label': split_labels})
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (valid_texts, valid_labels),
        (test_texts, test_labels),
    )
)


### Tokenizing

In [10]:
# Load the RoBERTa tokenizer from the 'roberta-base' checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_inputs(inputs):
    """Tokenize the ``'text'`` entry of ``inputs`` with the notebook's ``tokenizer``.

    Truncation is enabled and padding is set to ``'max_length'`` with a
    maximum length of 64 tokens.

    Args:
        inputs: Mapping with a ``'text'`` entry holding the text(s) to encode.

    Returns:
        The encoding produced by ``tokenizer``.
    """
    encoded = tokenizer(
        inputs['text'],
        max_length=64,
        padding='max_length',
        truncation=True,
    )
    return encoded

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [12]:
# Apply the tokenizer to every split in batched mode (train, validation, test).
tokenized_train_dataset, tokenized_valid_dataset, tokenized_test_dataset = (
    split.map(tokenize_inputs, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

Map:   0%|          | 0/727754 [00:00<?, ? examples/s]

Map:   0%|          | 0/80862 [00:00<?, ? examples/s]

Map:   0%|          | 0/202155 [00:00<?, ? examples/s]

In [13]:
from transformers import RobertaForSequenceClassification

# Load roberta-base with a sequence-classification head configured for two labels.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

2025-06-26 09:10:46.630713: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750929046.945075      72 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750929047.030804      72 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import TrainingArguments

# Fine-tuning configuration: evaluate and checkpoint once per epoch so that the
# checkpoint with the best validation accuracy can be restored after training.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # kept in step with eval_strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # matches the "accuracy" key from compute_metrics
    # Disable external experiment loggers. When left unset, Trainer may enable
    # whichever integrations are installed (e.g. Weights & Biases), which can
    # stall an unattended "Run All" on an interactive login prompt.
    report_to="none",
)

Metrics

In [15]:
import evaluate

# Load the four evaluation metrics once, in the order they are reported.
accuracy, f1, precision, recall = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Score a batch of predictions with accuracy, F1, precision and recall.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` along the last axis.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``, each holding the value computed by the corresponding
        loaded metric.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    # Each loaded metric returns a dict keyed by its own name.
    scorers = {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }
    return {
        name: scorer.compute(predictions=predictions, references=labels)[name]
        for name, scorer in scorers.items()
    }


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Initialize trainer and train the model

In [17]:
from transformers import Trainer

# Assemble the Trainer from the model, its configuration, the tokenized
# train/validation splits and the metric function, then start fine-tuning.
trainer = Trainer(
    # what to train and how
    args=training_args,
    model=model,
    processing_class=tokenizer,
    # data
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    # evaluation
    compute_metrics=compute_metrics,
)
trainer.train()

Evaluate the model

In [None]:
# Evaluate the trained model on the tokenized test split.
trainer.evaluate(tokenized_test_dataset)

In [None]:
# Run inference on the test split and turn the per-class scores into class ids
# by taking the argmax over the last axis.
predictions = trainer.predict(tokenized_test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=-1)