In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
import pandas as pd

In [4]:
train_df = pd.read_csv('gdrive/My Drive/anlp_project/train_preprocessed_df_setfit.csv')
test_df = pd.read_csv('gdrive/My Drive/anlp_project/test_preprocessed_df_setfit.csv')

In [5]:
train_df.columns

Index(['Unnamed: 0', 'Tweet index', 'Label', 'Tweet text', 'text_prep',
       'tweet_tokens', 'tweet_prep'],
      dtype='object')

In [6]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m89.3 MB/s[0m eta [36m0:00:00[0m
Co

In [25]:
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from transformers import RobertaTokenizer, RobertaModel
from transformers import DebertaTokenizer, DebertaModel
import torch.nn as nn
import torch

# Define the ExponentialPositionalEncoding
class ExponentialPositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super(ExponentialPositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        reversed_position = max_len - position
        div_term = torch.exp(-0.01 * reversed_position)
        self.encoding = reversed_position * div_term
        self.encoding = self.encoding.unsqueeze(0)

    def forward(self, x):
        x = x + self.encoding[:, :x.size(1), :].to(x.device)
        return x

# Create a custom BERT model with Exponential Positional Encoding
class CustomBERTModel(nn.Module):
    def __init__(self, num_labels):
        super(CustomBERTModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.positional_encoding = ExponentialPositionalEncoding(d_model=768) # BERT base has 768 dims
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        last_hidden_state = outputs[0] # shape: (batch_size, seq_len, d_model)
        # Apply the positional encoding
        encoded_output = self.positional_encoding(last_hidden_state)
        # Take the [CLS] embedding for classification
        cls_output = encoded_output[:, 0, :]
        logits = self.classifier(cls_output)
        return logits

# Example usage
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = CustomBERTModel(num_labels=2) # Binary classification (irony or not)




In [26]:
# Create a custom RoBERTa model with Exponential Positional Encoding
class CustomRobertaModel(nn.Module):
    def __init__(self, num_labels):
        super(CustomRobertaModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.positional_encoding = ExponentialPositionalEncoding(d_model=768) # RoBERTa base has 768 dims
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0] # shape: (batch_size, seq_len, d_model)
        # Apply the positional encoding
        encoded_output = self.positional_encoding(last_hidden_state)
        # Take the [CLS] embedding for classification
        cls_output = encoded_output[:, 0, :]
        logits = self.classifier(cls_output)
        return logits

In [50]:
# Create a custom RoBERTa model with Exponential Positional Encoding
class CustomDebertaModel(nn.Module):
    def __init__(self, num_labels):
        super(CustomDebertaModel, self).__init__()
        self.roberta = DebertaModel.from_pretrained('microsoft/deberta-base')
        self.positional_encoding = ExponentialPositionalEncoding(d_model=768) # Deberta base has 768 dims
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0] # shape: (batch_size, seq_len, d_model)
        # Apply the positional encoding
        encoded_output = self.positional_encoding(last_hidden_state)
        # Take the [CLS] embedding for classification
        cls_output = encoded_output[:, 0, :]
        logits = self.classifier(cls_output)
        return logits

In [32]:
# Sample data
text = "This is an ironic statement!"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
print(outputs)

tensor([[1.7405, 0.6551]], grad_fn=<AddmmBackward0>)


In [8]:
from torch.utils.data import DataLoader, TensorDataset, random_split

In [27]:
# Tokenize the tweets
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(list(train_df['tweet_prep']), padding=True, truncation=True, return_tensors="pt", max_length=256)
input_ids = inputs['input_ids']
attention_masks = inputs['attention_mask']
labels = torch.tensor(train_df['Label'])

# Create a DataLoader
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [28]:
# loaded_bert = BertModel.from_pretrained("gdrive/My Drive/anlp_project/custom_bert_model")
# loaded_tokenizer = BertTokenizer.from_pretrained("gdrive/My Drive/anlp_project/custom_bert_tokenizer")
# loaded_bert.to('cuda') # Move to GPU if needed

In [29]:
test_inputs = tokenizer(list(test_df['tweet_prep']), padding=True, truncation=True, return_tensors="pt", max_length=256)
test_input_ids = test_inputs['input_ids']
test_attention_masks = test_inputs['attention_mask']
test_labels = torch.tensor(test_df['Label'])

In [30]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [31]:
from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup
# Initialize the custom BERT model
model = CustomBERTModel(num_labels=2).to('cuda')

optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss()



In [33]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch[0].to('cuda')
        attention_mask = batch[1].to('cuda')
        labels = batch[2].to('cuda')

        model.zero_grad()

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch: {epoch + 1}, Training Loss: {avg_train_loss:.4f}")

    # Validation loop
    model.eval()
    total_eval_accuracy = 0
    for batch in validation_dataloader:
        input_ids = batch[0].to('cuda')
        attention_mask = batch[1].to('cuda')
        labels = batch[2].to('cuda')

        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask)

        logits = logits.detach().cpu().numpy()
        labels = labels.to('cpu').numpy()
        total_eval_accuracy += (logits.argmax(axis=1) == labels).mean()

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print(f"Epoch: {epoch + 1}, Validation Accuracy: {avg_val_accuracy:.4f}")

Epoch: 1, Training Loss: 0.6378
Epoch: 1, Validation Accuracy: 0.6484
Epoch: 2, Training Loss: 0.4857
Epoch: 2, Validation Accuracy: 0.6905
Epoch: 3, Training Loss: 0.3010
Epoch: 3, Validation Accuracy: 0.7091
Epoch: 4, Training Loss: 0.1626
Epoch: 4, Validation Accuracy: 0.7091


In [34]:
from sklearn.metrics import roc_auc_score, accuracy_score

# Place the model in evaluation mode
model.eval()

# Lists to store model predictions and true labels
all_logits = []
true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to('cuda') for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        # Get model outputs
        outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])


        # Store outputs and true labels
        all_logits.append(outputs.cpu())
        true_labels.append(inputs['labels'].cpu())

# Convert the lists into tensors
all_logits = torch.cat(all_logits, dim=0)
true_labels = torch.cat(true_labels, dim=0)

# Compute softmax over logits to get probabilities of class 1
probs = torch.nn.functional.softmax(all_logits, dim=1)[:, 1]

# Compute AUC
auc = roc_auc_score(true_labels, probs)

# Compute accuracy
preds = torch.argmax(all_logits, dim=1)
accuracy = accuracy_score(true_labels, preds)

print(f"AUC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")


AUC: 0.7674
Accuracy: 0.6939


In [35]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
precision_score(true_labels, preds), recall_score(true_labels, preds), f1_score(true_labels, preds)

(0.5907928388746803, 0.7427652733118971, 0.6581196581196581)

In [None]:
# Bert uncased metrics
Accuracy - 0.7194
Precision - 0.60
Recall - 0.816
F1 - 0.69
AUC - 0.81

In [18]:
model_save_path = "gdrive/My Drive/anlp_project/custom_bert_model"
model.bert.save_pretrained(model_save_path)
tokenizer_save_path = "gdrive/My Drive/anlp_project/custom_bert_tokenizer"
tokenizer.save_pretrained(tokenizer_save_path)


('gdrive/My Drive/anlp_project/custom_bert_tokenizer/tokenizer_config.json',
 'gdrive/My Drive/anlp_project/custom_bert_tokenizer/special_tokens_map.json',
 'gdrive/My Drive/anlp_project/custom_bert_tokenizer/vocab.txt',
 'gdrive/My Drive/anlp_project/custom_bert_tokenizer/added_tokens.json')