In [1]:
#pip install transformers

In [None]:
"""Sarcasm classification using bidirectional encoder transformers"""

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
import pandas as pd
import numpy as np

In [4]:
# Use StringIO to simulate a file object
train_df = pd.read_csv('gdrive/My Drive/anlp_project/train-balanced-sarcasm.csv')

In [5]:
test_df = pd.read_csv('gdrive/My Drive/anlp_project/test-balanced.csv')

In [7]:
train_df = train_df[train_df['subreddit']=='politics']

In [8]:
train_df['full_comment'] = train_df['parent_comment'] + ' [SEP] ' + train_df['comment']

In [9]:
train_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,full_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ...","Yeah, I get that argument. At this point, I'd ..."
10,0,I think a significant amount would be against ...,ThisIsNotKimJongUn,politics,92,92,0,2016-09,2016-09-20 17:53:52,I bet if that money was poured into college de...,I bet if that money was poured into college de...
17,0,because it's what really bothers him... and it...,kozmo1313,politics,15,-1,-1,2016-12,2016-12-26 20:10:45,He actually acts like a moody emo girl on twit...,He actually acts like a moody emo girl on twit...
22,0,Conservatism as an ideology is for sure a reac...,MayorMcCheese59,politics,1,-1,-1,2016-12,2016-12-24 00:04:06,"I still doubt that ""all conservatives stand fo...","I still doubt that ""all conservatives stand fo..."
23,0,"Maybe not control, but certainly that is evide...",SunTzu-,politics,1,-1,-1,2016-10,2016-10-13 20:48:14,Today Russian media tweeted out that Wikileaks...,Today Russian media tweeted out that Wikileaks...


In [None]:
#test_df = pd.read_csv('gdrive/My Drive/anlp_project/SemEval2018-T3_gold_test_taskA_emoji.txt', sep='\t')

In [10]:
import re

#very basic preprocessing

def preprocess_tweet(tweet):


    # replace possible sarcasm expressions with possiblity
    re.sub(r'\. \.\.', ' possibility ', tweet)
    re.sub(r'\. \.\.\.', ' possibility ', tweet)
    #re.sub(r'\.\.\.\.', ' continuity ', tweet)

     # Replace patterns like "250,000", "3,600" with "NUM"
    #text = re.sub(r'\b\d{1,3}(?:,\d{3})*(?!\d)', 'num', tweet)

    # Replace urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", 'URL', tweet)

    #replace numbers
    #tweet = re.sub(r'^\d+$', 'num', tweet)
    tweet = re.sub(r'\b\d+\b', 'num', tweet)


    return tweet


In [11]:
train_df['full_comment'] = train_df['full_comment'].astype(str)

In [12]:
train_df['text_prep'] = train_df['full_comment'].apply(lambda x: preprocess_tweet(x))

In [15]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=42)

In [16]:
len(train_df), len(test_df)

(31596, 7900)

In [17]:
train_tweets = train_df['text_prep'].tolist()

In [18]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors

# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())

# Use the pre-tokenizer responsible for splitting the input into words
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the tokenizer
trainer = trainers.BpeTrainer(vocab_size=50000, show_progress=True)
tokenizer.train_from_iterator(train_tweets, trainer=trainer)  # train from iterator

# Once it's trained, save it
tokenizer.save('trained_tokenizer.json')


In [19]:
max_len = int(train_df['text_prep'].apply(len).quantile(0.95))
max_len

504

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader

class TweetDataset(Dataset):
    def __init__(self, tweets, labels, tokenizer, max_len):
        self.tweets = tweets
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, idx):
        tweet = self.tweets[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode(tweet)
        tokens = encoding.ids

        # Truncate or pad the tokens
        if len(tokens) > self.max_len:
            tokens = tokens[:self.max_len]
        else:
            tokens += [0] * (self.max_len - len(tokens))

        # The attention mask should have 1 for real tokens and 0 for padding
        attention_mask = [1 if token_id > 0 else 0 for token_id in tokens]

        return {
            'input_ids': torch.tensor(tokens),
            'attention_mask': torch.tensor(attention_mask),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Example
tweets = train_tweets
labels = train_df['label'].tolist()  # 0 for non-ironic and 1 for ironic
dataset = TweetDataset(tweets, labels, tokenizer, max_len=500)
loader = DataLoader(dataset, batch_size=32, shuffle=True)


In [21]:
test_tweets = test_df['text_prep'].tolist()
test_labels = test_df['label'].tolist()
test_dataset = TweetDataset(test_tweets, test_labels, tokenizer, max_len=500)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [26]:
import torch
from transformers import BertModel
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
from torch.nn import Linear

class TweetClassifier(torch.nn.Module):
    def __init__(self, config):
        super(TweetClassifier, self).__init__()

        # Transformer encoder
        self.encoder = BertModel(config)

        # Positional embeddings (they are by default included in the TransformerModel in Hugging Face)

        # Classification head
        self.classifier = Linear(config.hidden_size, 2)  # Binary classification

    def forward(self, input_ids, attention_mask=None):
        # Pass through transformer
        outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder(
            input_ids, attention_mask=attention_mask
        )

        # Use CLS token for classification task
        cls_output = outputs.last_hidden_state[:, 0]

        # Pass through classification head
        logits = self.classifier(cls_output)

        return logits


from transformers import BertConfig, BertModel

# Configuration for the transformer
config = BertConfig(
    vocab_size=tokenizer.get_vocab_size(),
    hidden_size=512,
    num_attention_heads=4,
    num_hidden_layers=3,
    intermediate_size=2048,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1
)

model = TweetClassifier(config)


In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

TweetClassifier(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_

In [28]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Assuming you have set up your model, optimizer, and loss criterion
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
criterion = torch.nn.BCEWithLogitsLoss()

epochs = 3
for epoch in range(epochs):
    model.train()
    all_predictions = []
    all_labels = []
    train_probabilities = []
    total_loss = 0.0

    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # One-hot encode labels before sending them to the model and loss function
        labels_one_hot = torch.nn.functional.one_hot(batch['label']).to(torch.float32).to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels_one_hot)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Get the predictions
        predictions = torch.argmax(outputs, dim=1)
        all_predictions.extend(predictions.cpu().numpy())

        # Extract binary format from one-hot encoded labels for evaluation
        labels_binary = [label[1] for label in labels_one_hot.cpu().numpy()]
        all_labels.extend(labels_binary)

        total_loss += loss.item()
        probs = torch.nn.functional.softmax(outputs.detach(), dim=1)[:, 1].cpu().numpy()
        train_probabilities.extend(probs)

    avg_loss = total_loss / len(loader)
    accuracy = accuracy_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    # Compute ROC AUC score for the positive class
    train_auc = roc_auc_score(all_labels, train_probabilities)

    print(f"Epoch {epoch + 1}/{epochs} Loss: {avg_loss:.4f} Accuracy: {accuracy:.4f} F1: {f1:.4f}, AUC: {train_auc:.4f}")

    # Evaluation
    model.eval()
    test_predictions = []
    test_labels = []
    test_probabilities = []
    with torch.no_grad():
      for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        labels_one_hot = torch.nn.functional.one_hot(batch['label']).to(torch.float32).to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs, dim=1)
        test_predictions.extend(predictions.cpu().numpy())

        labels_binary = [label[1] for label in labels_one_hot.cpu().numpy()]
        test_labels.extend(labels_binary)
        probs = torch.nn.functional.softmax(outputs, dim=1)[:, 1].cpu().numpy()
        test_probabilities.extend(probs)

      test_accuracy = accuracy_score(test_labels, test_predictions)
      test_f1 = f1_score(test_labels, test_predictions)
      # Compute ROC AUC score for the positive class for the test set
      test_auc = roc_auc_score(test_labels, test_probabilities)
      print(f"Test Accuracy: {test_accuracy:.4f} Test F1: {test_f1:.4f} Test AUC: {test_auc:.4f}")




Epoch 1/3 Loss: 0.6289 Accuracy: 0.6350 F1: 0.7282, AUC: 0.6675
Test Accuracy: 0.6849 Test F1: 0.7428 Test AUC: 0.7433
Epoch 2/3 Loss: 0.5220 Accuracy: 0.7418 F1: 0.7925, AUC: 0.8063
Test Accuracy: 0.6880 Test F1: 0.7237 Test AUC: 0.7624
Epoch 3/3 Loss: 0.3848 Accuracy: 0.8321 F1: 0.8629, AUC: 0.9027
Test Accuracy: 0.6866 Test F1: 0.7644 Test AUC: 0.7277


In [30]:
# Evaluation
model.eval()
test_predictions = []
test_labels = []
test_probabilities = []
with torch.no_grad():
  for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    labels_one_hot = torch.nn.functional.one_hot(batch['label']).to(torch.float32).to(device)

    outputs = model(input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs, dim=1)
    test_predictions.extend(predictions.cpu().numpy())

    labels_binary = [label[1] for label in labels_one_hot.cpu().numpy()]
    test_labels.extend(labels_binary)
    probs = torch.nn.functional.softmax(outputs, dim=1)[:, 1].cpu().numpy()
    test_probabilities.extend(probs)

  test_accuracy = accuracy_score(test_labels, test_predictions)
  test_f1 = f1_score(test_labels, test_predictions)
  # Compute ROC AUC score for the positive class for the test set
  test_auc = roc_auc_score(test_labels, test_probabilities)
  print(f"Test Accuracy: {test_accuracy:.4f} Test F1: {test_f1:.4f} Test AUC: {test_auc:.4f}")

Test Accuracy: 0.6866 Test F1: 0.7644 Test AUC: 0.7277


In [29]:
model_save_path = 'gdrive/My Drive/anlp_project/transformers/sarcasm_nonbert_model_weights.pth'
tokenizer_save_path = 'gdrive/My Drive/anlp_project/transformers/sarcasm_tokenizer.json'

# Save model weights
torch.save(model.state_dict(), model_save_path)
# Save tokenizer
tokenizer.save(tokenizer_save_path)


In [None]:
#Loading code
from tokenizers import Tokenizer

# Load model
model = TweetClassifier(config)
model.load_state_dict(torch.load(model_save_path))
model.eval()

# Load tokenizer
tokenizer = Tokenizer.from_file(tokenizer_save_path)