In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split

# Load the raw tweets and keep only the text column and its sarcasm flag.
df = pd.read_csv('/kaggle/input/sarcasm/isarcasm2022.csv')[['tweet', 'sarcastic']]
df.columns = ['text', 'label']

# Data preprocessing to avoid further problems.
# Missing rows must be dropped BEFORE the string cast: astype(str) turns NaN
# into the literal string 'nan', after which dropna() no longer sees it and
# the placeholder would be fed to the model as a real tweet.
df = df.dropna(subset=['text', 'label'])
df['text'] = df['text'].astype(str)
df['label'] = df['label'].astype(int)
# Keep only the two classes the binary classifier is configured for.
df = df[df['label'].isin([0, 1])]

# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], df['label'], test_size=0.1, random_state=42)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class SarcasmDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: The input texts. Either a pandas Series (accessed by position
            through ``.iloc``) or any positionally indexable sequence such as
            a list, tuple or array.
        labels: Class labels aligned position-by-position with ``texts``;
            same accepted container types.
        tokenizer: Object exposing ``encode_plus(...)`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` entries.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}')
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` of a pandas object or plain sequence."""
        # A Series coming out of a shuffled split keeps its original index
        # labels, so positional access has to go through .iloc, not [].
        if hasattr(values, 'iloc'):
            return values.iloc[idx]
        return values[idx]

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``,
            each a ``torch.long`` tensor. The keys match the keyword
            arguments the training loop passes to the model via ``**batch``.
        """
        text = self._item_at(self.texts, idx)
        label = self._item_at(self.labels, idx)
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.max_len,
            padding='max_length', truncation=True, return_attention_mask=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = SarcasmDataset(train_texts, train_labels, tokenizer)
val_dataset = SarcasmDataset(val_texts, val_labels, tokenizer)

# Loading BERT with a fresh 2-way classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Shuffle only the training data; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Move the model to its device before the optimizer is built from its parameters.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-4)

# Training
for epoch in range(3):
    model.train()
    epoch_loss = 0.0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report the mean loss over the epoch; the loss of the final batch alone
    # is a noisy, unrepresentative number.
    mean_epoch_loss = epoch_loss / max(len(train_loader), 1)
    print(f'Epoch {epoch + 1}, Loss: {mean_epoch_loss}')

# Evaluation
model.eval()
correct_predictions = 0
total_examples = 0
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        # Count per example rather than averaging per-batch accuracies: the
        # last batch can be smaller, and a mean of batch means would give its
        # examples more weight than the others.
        correct_predictions += (predictions == batch['labels']).sum().item()
        total_examples += batch['labels'].size(0)

val_accuracy = correct_predictions / total_examples if total_examples else float('nan')
print(f'Validation Accuracy: {val_accuracy}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Enter here the sentence you would like to try:
sentence = "Tday Im having a pizza"

# Encode the sentence the same way the training examples were encoded
# (fixed length 256, padded and truncated). `padding='max_length'` replaces
# the deprecated `pad_to_max_length=True`, and `truncation=True` is required
# for `max_length` to actually cut over-long inputs.
encoded_dict = tokenizer.encode_plus(
    sentence,
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_dict['input_ids'].to(device)
attention_mask = encoded_dict['attention_mask'].to(device)

# Get the prediction (eval mode, no gradient tracking needed for inference).
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

logits = logits.detach().cpu().numpy()

# Get the result (0 or 1): index of the larger of the two class logits.
import numpy as np
prediction = np.argmax(logits, axis=1)[0]

if prediction == 0:
    print("The sentence is not sarcastic.")
else:
    print("The sentence is sarcastic.")

The sentence is sarcastic.


## 