In [None]:
"""Irony classification using Bi directional encoder transformers architecture"""

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive. force_remount=True is passed so the
# call is safe to re-run when the cell is executed more than once.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training split; the file is tab-separated. The path is relative, so
# it resolves against the current working directory.
# NOTE(review): default CSV quoting is in effect, so a tweet containing an
# unbalanced double quote could swallow the rows that follow it — confirm the
# row count, or pass quoting=csv.QUOTE_NONE.
train_df = pd.read_csv('gdrive/My Drive/anlp_project/SemEval2018-T3-train-taskA_emoji.txt', sep='\t')

In [None]:
# Load the test split (tab-separated). Later cells read the same
# 'Tweet text' and 'Label' columns from it as from the training frame.
test_df = pd.read_csv('gdrive/My Drive/anlp_project/SemEval2018-T3_gold_test_taskA_emoji.txt', sep='\t')

In [None]:
import re

#very basic preprocessing

def preprocess_tweet(tweet):
    """Apply light normalisation to a single tweet.

    Steps, applied in this order:
      1. Spaced-out ellipses (``. ..`` or ``. ...``) are replaced by the
         marker word `` possibility `` (surrounded by spaces).
      2. Every run of non-space characters starting with ``http`` or ``www``
         is replaced by ``URL``.
      3. Every stand-alone run of digits is replaced by ``num``.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalised tweet as a new string.
    """
    # re.sub returns a new string (str is immutable), so the result has to be
    # assigned back. The optional trailing dot lets a single greedy pass cover
    # both ". .." and ". ..." without leaving a stray "." behind.
    tweet = re.sub(r'\. \.\.\.?', ' possibility ', tweet)

    # Replace urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", 'URL', tweet, flags=re.MULTILINE)

    # Replace numbers. \b also sits next to punctuation, so "250,000" becomes
    # "num,num" rather than a single "num".
    tweet = re.sub(r'\b\d+\b', 'num', tweet)

    return tweet


In [None]:
# Store a cleaned copy of every training tweet next to the raw text.
train_df['text_prep'] = train_df['Tweet text'].apply(preprocess_tweet)

In [None]:
# Run the test tweets through the same cleaning function as the training tweets.
test_df['text_prep'] = test_df['Tweet text'].apply(preprocess_tweet)

In [None]:
# Plain Python list of the cleaned training tweets.
train_tweets = train_df['text_prep'].to_list()

In [None]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors

# Reserved tokens. Special tokens are added to the vocabulary first and in list
# order, so "[PAD]" receives id 0 — the id used for padding when sequences are
# brought to a fixed length — and can never collide with a learned BPE token.
SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]"]

# Initialize a BPE tokenizer. With unk_token set, characters never seen during
# training map to "[UNK]" instead of being dropped silently.
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Use the pre-tokenizer responsible for splitting the input into words
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the tokenizer on the cleaned training tweets only
trainer = trainers.BpeTrainer(
    vocab_size=50000,
    special_tokens=SPECIAL_TOKENS,
    show_progress=True,
)
tokenizer.train_from_iterator(train_tweets, trainer=trainer)  # train from iterator
assert tokenizer.token_to_id("[PAD]") == 0, "padding must own id 0"

# Prepend "[CLS]" to every encoded sequence so that position 0 holds a
# dedicated classification token rather than the first word piece.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A",
    pair="[CLS] $A $B:1",
    special_tokens=[("[CLS]", tokenizer.token_to_id("[CLS]"))],
)

# Once it's trained, save it
tokenizer.save('trained_tokenizer.json')


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class TweetDataset(Dataset):
    """Dataset that turns (tweet, label) pairs into fixed-length model inputs.

    Args:
        tweets: Sequence of tweet strings.
        labels: Sequence of integer class labels, aligned with ``tweets``.
        tokenizer: Object whose ``encode(text)`` returns something exposing
            the token ids as ``.ids``.
        max_len: Length every sequence is truncated or padded to.
        pad_id: Token id appended as padding (defaults to 0).
    """

    def __init__(self, tweets, labels, tokenizer, max_len, pad_id=0):
        self.tweets = tweets
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad_id = pad_id

    def __len__(self):
        """Return the number of tweets."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Encode one tweet.

        Returns:
            dict with
              * ``input_ids``: tensor of ``max_len`` token ids,
              * ``attention_mask``: tensor of ``max_len`` flags, 1 for real
                tokens and 0 for padding,
              * ``label``: scalar ``torch.long`` tensor.
        """
        # Copy before slicing so the encoder's own list is never modified.
        ids = list(self.tokenizer.encode(self.tweets[idx]).ids)[:self.max_len]
        n_real = len(ids)
        n_pad = self.max_len - n_real

        # The mask is built from the number of real tokens, not from the id
        # values: a genuine token may legitimately have the same id as the
        # padding value and must still be attended to.
        input_ids = ids + [self.pad_id] * n_pad
        attention_mask = [1] * n_real + [0] * n_pad

        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Training data: cleaned tweets paired with their labels
# (0 for non-ironic and 1 for ironic), served in shuffled mini-batches.
tweets, labels = train_tweets, train_df['Label'].tolist()
dataset = TweetDataset(tweets=tweets, labels=labels, tokenizer=tokenizer, max_len=230)
loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)


In [None]:
# Evaluation data, built with the same tokenizer and sequence length as the
# training set. The loader is not shuffled: batch order has no effect on the
# aggregated metrics, and a fixed order keeps evaluation runs repeatable.
test_tweets = test_df['text_prep'].tolist()
test_labels = test_df['Label'].tolist()
test_dataset = TweetDataset(test_tweets, test_labels, tokenizer, max_len=230)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
import torch
from transformers import BertModel
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
from torch.nn import Linear

class TweetClassifier(torch.nn.Module):
    """BERT encoder followed by a linear layer that scores two classes."""

    def __init__(self, config):
        """Create the encoder and the classification head.

        ``config`` is passed to ``BertModel`` unchanged; its ``hidden_size``
        sets the input width of the linear head.
        """
        super().__init__()

        # The encoder is constructed from the config alone; this class does
        # not load any pretrained weights.
        self.encoder = BertModel(config)

        # Two output units: one logit per class.
        self.classifier = Linear(config.hidden_size, 2)

    def forward(self, input_ids, attention_mask=None):
        """Return the two class logits for each sequence in the batch.

        ``input_ids`` and the optional ``attention_mask`` are forwarded to
        the encoder as given.
        """
        encoded = self.encoder(input_ids, attention_mask=attention_mask)

        # The hidden state at sequence position 0 summarises the whole input.
        first_position = encoded.last_hidden_state[:, 0]

        return self.classifier(first_position)


from transformers import BertConfig, BertModel

# Settings for a small encoder: the vocabulary size follows the trained
# tokenizer, everything else is fixed here.
encoder_settings = {
    "vocab_size": tokenizer.get_vocab_size(),
    "hidden_size": 256,
    "num_attention_heads": 4,
    "num_hidden_layers": 2,
    "intermediate_size": 1024,
}
config = BertConfig(**encoder_settings)

# Fresh classifier built from that configuration.
model = TweetClassifier(config)


In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
# Kept as the last expression so the model summary is shown as the cell output.
model.to(device)

TweetClassifier(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(25280, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Optimizer and loss. The classifier emits two logits per tweet and the labels
# are integer class ids (0 / 1), which is exactly the contract of
# CrossEntropyLoss. BCEWithLogitsLoss expects a float target with the same
# shape as the logits and raises on (batch, 2) logits vs. (batch,) labels.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
criterion = torch.nn.CrossEntropyLoss()


epochs = 3
for epoch in range(epochs):
    model.train()
    all_predictions = []
    all_labels = []
    train_probabilities = []
    total_loss = 0.0
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the full `labels` list used to build the
        # dataset is not overwritten in the notebook namespace.
        batch_labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        # Bookkeeping for the epoch metrics; detached so no graph is retained.
        logits = outputs.detach()
        predictions = torch.argmax(logits, dim=1)
        probs = torch.nn.functional.softmax(logits, dim=1)[:, 1]
        all_predictions.extend(predictions.cpu().tolist())
        all_labels.extend(batch_labels.cpu().tolist())
        train_probabilities.extend(probs.cpu().tolist())
        total_loss += loss.item()

    # Metrics are accumulated while the model is in train mode (dropout on),
    # so they describe the training batches as seen during optimisation.
    avg_loss = total_loss / len(loader)
    accuracy = accuracy_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    # Compute ROC AUC score for the positive class
    train_auc = roc_auc_score(all_labels, train_probabilities)

    print(f"Epoch {epoch + 1}/{epochs} Loss: {avg_loss:.4f} Accuracy: {accuracy:.4f} F1: {f1:.4f}, AUC: {train_auc:.4f}")


Epoch 1/3 Loss: 0.6887 Accuracy: 0.5520 F1: 0.5433, AUC: 0.5762
Epoch 2/3 Loss: 0.5934 Accuracy: 0.6717 F1: 0.6656, AUC: 0.7435
Epoch 3/3 Loss: 0.3519 Accuracy: 0.8522 F1: 0.8508, AUC: 0.9262


In [None]:
# Evaluation: score the test set with dropout disabled and gradients off.
model.eval()
test_predictions, test_labels, test_probabilities = [], [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = outputs.argmax(dim=1)
        # Probability assigned to class 1 (the positive class for ROC AUC).
        probs = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()

        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())
        test_probabilities.extend(probs)

# Aggregate metrics over the whole test set.
test_accuracy = accuracy_score(test_labels, test_predictions)
test_f1 = f1_score(test_labels, test_predictions)
test_auc = roc_auc_score(test_labels, test_probabilities)
print(f"Test Accuracy: {test_accuracy:.4f} Test F1: {test_f1:.4f} Test AUC: {test_auc:.4f}")

Test Accuracy: 0.6429 Test F1: 0.5018 Test AUC: 0.6619


In [None]:
import os

# Output locations. Both paths are relative, so they resolve against the
# current working directory.
model_save_path = 'gdrive/My Drive/anlp_project/transformers/nonbert_model_weights.pth'
tokenizer_save_path = 'gdrive/My Drive/anlp_project/transformers/tokenizer.json'

# Create the target folder first: neither save call creates missing parent
# directories, and failing here would lose the freshly trained weights.
for _artifact_path in (model_save_path, tokenizer_save_path):
    os.makedirs(os.path.dirname(_artifact_path), exist_ok=True)

# Save model weights (state dict only, not the full module)
torch.save(model.state_dict(), model_save_path)
# Save tokenizer
tokenizer.save(tokenizer_save_path)


In [None]:
#Loading code
from tokenizers import Tokenizer

# Load model: rebuild the architecture from the same config, then restore the
# saved weights. map_location lets a checkpoint written on a GPU be loaded on
# a CPU-only runtime (and vice versa).
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model = TweetClassifier(config)
state_dict = torch.load(model_save_path, map_location=device)
model.load_state_dict(state_dict)
# A freshly constructed module lives on the CPU; move it to the device the
# input batches are sent to before using it for inference.
model.to(device)
model.eval()

# Load tokenizer
tokenizer = Tokenizer.from_file(tokenizer_save_path)