In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('/kaggle/input/sarcasm/train-balanced-sarcasm.csv')[['comment', 'parent_comment', 'label']]

#Data preprocessing to avoid further problems
df['comment'] = df['comment'].astype(str)
df['parent_comment'] = df['parent_comment'].astype(str)
df.dropna(inplace=True)
df['label'] = df['label'].astype(int)
df = df[df['label'].isin([0, 1])]

#df['combined'] = list(zip(df['parent_comment'], df['comment']))
df['combined'] = df['parent_comment'] + " [SEP] " + df['comment']

df_label_0 = df[df['label'] == 0].head(10000)

# Filter the DataFrame for label = 1 and take the first 5000 entries
df_label_1 = df[df['label'] == 1].head(10000)

# Concatenate the two DataFrames
df = pd.concat([df_label_0, df_label_1], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['combined'], df['label'], test_size=0.2, random_state=42)

In [5]:
#Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
class SarcasmDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.max_len,
            padding='max_length', truncation=True, return_attention_mask=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        }

In [7]:
train_dataset = SarcasmDataset(train_texts, train_labels, tokenizer)
val_dataset = SarcasmDataset(val_texts, val_labels, tokenizer)

#Loading BERT
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

optimizer = AdamW(model.parameters(), lr=2e-5)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [9]:
#Training
for epoch in range(3):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

#Evaluation
model.eval()
total_eval_accuracy = 0
for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    total_eval_accuracy += (predictions == batch['labels']).float().mean().item()

print(f'Validation Accuracy: {total_eval_accuracy / len(val_loader)}')

Epoch 1, Loss: 0.5066132545471191
Epoch 2, Loss: 0.5972763895988464
Epoch 3, Loss: 0.27391156554222107
Validation Accuracy: 0.7035


In [11]:
#Enter here the sentence you would like to try:
sentence = ("I really love Trump [SEP] Oh okay, do you have ay arguments about that?")
encoded_dict = tokenizer.encode_plus(
    sentence,
    add_special_tokens=True,
    max_length=512,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',  
)

input_ids = encoded_dict['input_ids'].to(device)
attention_mask = encoded_dict['attention_mask'].to(device)

#Get the prediction
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

logits = logits.detach().cpu().numpy()

#Get the result (0 or 1)
import numpy as np
prediction = np.argmax(logits, axis=1)[0]

if prediction == 0:
    print("The sentence is not sarcastic.")
else:
    print("The sentence is sarcastic.")

The sentence is not sarcastic.


## 