In [2]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from sklearn.model_selection import train_test_split

In [80]:
class BERT_ExtraLayer(nn.Module):
    def __init__(self, model, num_labels=1):
        super(BERT_ExtraLayer, self).__init__()

        #Freezing the initial weights of the pre-trained model by setting requires_grad=False
        self.bert_model = model
        self.dropout = nn.Dropout(0.2)
        self.l1 = nn.Linear(768, 1024)
        self.out = nn.Linear(1024, 2)

    def forward(self,input_ids,mask,token_type_ids):
        out1 = self.bert_model(input_ids=input_ids, attention_mask=mask, token_type_ids=token_type_ids)
        out1 = out1.pooler_output
        out1 = self.dropout(out1)
        out2 = self.dropout(self.l1(out1))
        output = self.dropout(self.out(out2))

        return output

In [81]:
class BERT_Dataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        inputs = self.tokenizer.encode_plus(
            text, add_special_tokens=True, max_length=self.max_len,
            padding='max_length', truncation=True, return_attention_mask=True)

        return {'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
                'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
                'label': torch.tensor(self.labels.iloc[idx], dtype=torch.long)}

In [82]:
def load_sarcasm_data(data_path, sub=True, n=10000, ratio=0.5):
    '''
    This function serves to load the sarcastic data from the dataset "Sarcasm on Reddit"
    available on: https://arxiv.org/abs/1704.05579.

    data_path: path to the file

    sub: Whether you want to use the full dataset (False) or only a subset (True)

    n: number of comment you want to consider

    ratio: ratio of sarcastic comment

    In the end we decided to select a subset of the original dataset with 50000 data
    balanced between sarcastic and non sarcastic. So the parameter 'sub' must be false.
    '''

    print("Data loading...")

    df = pd.read_csv(data_path)[['comment', 'parent_comment', 'label']]

    #Data processing to ensure compatibility with the model
    df['comment'] = df['comment'].astype(str)
    df['parent_comment'] = df['parent_comment'].astype(str)
    df['label'] = df['label'].astype(int)
    df = df[df['label'].isin([0, 1])]
    df.dropna(inplace=True)

    #Combining the the parent_comment and the main comment using the [SEP] separator
    df['combined'] = df['parent_comment'] + " [SEP] " + df['comment']

    #Taking a random subset of the data
    if sub:
        label_0 = int((1-ratio)*n)
        label_1 = n - label_0
        df_label_0 = df[df['label'] == 0].head(label_0)
        df_label_1 = df[df['label'] == 1].head(label_1)
        df = pd.concat([df_label_0, df_label_1], ignore_index=True)
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    print("Data loaded succesfully!")

    return df

In [83]:
data_path = 'cleaned_reddit.csv'
data = load_sarcasm_data('cleaned_reddit.csv')

Data loading...
Data loaded succesfully!


In [84]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['combined'], data['label'], test_size=0.2, random_state=42)

In [85]:
#Loading the BERT Tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [86]:
train_dataset = BERT_Dataset(train_texts, train_labels, tokenizer)
val_dataset = BERT_Dataset(val_texts, val_labels, tokenizer)

#Loading BERT pretrained model
bert_model = transformers.BertModel.from_pretrained('bert-base-uncased')

#Creating an instance of our model with an extra layer
model = BERT_ExtraLayer(bert_model)

In [87]:
#Initializing loss function
loss_fn = nn.BCEWithLogitsLoss()

#Initializing optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

#Creating the DataLoader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [88]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BERT_ExtraLayer(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [89]:
def train(epochs, train_loader, val_loader, model, loss_fn, optimizer):

  print("Starting training process...")

  #Training
  for epoch in range(epochs):
    model.train()
    for batch in train_loader:
      batch = {k: v.to(device) for k,v in batch.items()}
      #print(batch)

      optimizer.zero_grad()

      output = model(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'])
      target = batch['label'].float()
      loss = loss_fn(output.squeeze(), target)
      loss.backward()

      optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

    #Evaluation
    model.eval()
    total_eval_accuracy = 0
    for batch in val_loader:
      batch = {k: v.to(device) for k, v in batch.items()}

      with torch.no_grad():
        output = model(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'])

        predictions = torch.sigmoid(output) >= 0.5
        total_eval_accuracy += (predictions == batch['label']).float().mean().item()

    print(f'Validation Accuracy: {total_eval_accuracy / len(val_loader)}')


In [92]:
def train_model(model, train_loader, val_loader, optimizer, loss_fn, epochs=5):
    for epoch in range(epochs):
        model.train()
        if epoch == 3:
          for param in model.bert_model.parameters():
            param.requires_grad = False

        total_loss = 0
        for batch in train_loader:
            inputs, masks, token_type_ids, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['token_type_ids'].to(device), batch['label'].to(device)
            optimizer.zero_grad()
            outputs = model(inputs, masks, token_type_ids)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")

        # Validation phase
        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        for batch in val_loader:
            inputs, masks, token_type_ids, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['token_type_ids'].to(device), batch['label'].to(device)
            with torch.no_grad():
                outputs = model(inputs, masks, token_type_ids)
                loss = loss_fn(outputs, labels)
                total_eval_loss += loss.item()
                preds = torch.argmax(outputs, dim=1)
                total_eval_accuracy += (preds == labels).sum().item()

        print(f'Validation Loss: {total_eval_loss / len(val_loader)}, ' +
              f'Accuracy: {total_eval_accuracy / len(val_loader.dataset)}')


In [93]:
loss = nn.CrossEntropyLoss()
train_model(model, train_loader, val_loader, optimizer, loss, epochs=10)

Epoch 1/10, Loss: 0.676859815299511
Validation Loss: 0.6730937595367432, Accuracy: 0.573
Epoch 2/10, Loss: 0.6800360552072525
Validation Loss: 0.6639993591308594, Accuracy: 0.596
Epoch 3/10, Loss: 0.6785348106622696
Validation Loss: 0.6625191617012024, Accuracy: 0.602
Epoch 4/10, Loss: 0.6780699687004089
Validation Loss: 0.6609219589233398, Accuracy: 0.6155
Epoch 5/10, Loss: 0.6757287653684616
Validation Loss: 0.662124285697937, Accuracy: 0.603
Epoch 6/10, Loss: 0.6772584973573684
Validation Loss: 0.6893376350402832, Accuracy: 0.545
Epoch 7/10, Loss: 0.6748990724086762
Validation Loss: 0.6603816494941711, Accuracy: 0.613
Epoch 8/10, Loss: 0.6730728220939636
Validation Loss: 0.6608927416801452, Accuracy: 0.601
Epoch 9/10, Loss: 0.6746509392261505
Validation Loss: 0.6596677360534668, Accuracy: 0.611


KeyboardInterrupt: 

In [297]:
import os
save_directory = 'checkpoint/model1'
os.makedirs(save_directory, exist_ok=True)
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

AttributeError: 'BERT_ExtraLayer' object has no attribute 'save_pretrained'

In [None]:
#Enter here the sentence you would like to try:
sentence = ("I really love Trump [SEP] You are intelligent, aren't you?")
encoded_dict = tokenizer.encode_plus(
    sentence,
    add_special_tokens=True,
    max_length=512,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_dict['input_ids'].to(device)
attention_mask = encoded_dict['attention_mask'].to(device)

#Get the prediction
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

logits = logits.detach().cpu().numpy()

#Get the result (0 or 1)
import numpy as np
prediction = np.argmax(logits, axis=1)[0]

if prediction == 0:
    print("The sentence is not sarcastic.")
else:
    print("The sentence is sarcastic.")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil

source_folder = '/content/checkpoint/'  # Default folder for Colab files
destination_folder = '/content/drive/MyDrive/DL - EE-559'

# Create destination folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Function to handle the copying of files and directories, excluding the 'drive' folder
def copy_item(src, dst):
    if os.path.basename(src) == 'drive':
        # Skip the 'drive' folder to prevent recursion or unnecessary copying
        return
    if os.path.isdir(src):
        # If the item is a directory, create a corresponding directory in the destination
        if not os.path.exists(dst):
            os.makedirs(dst)
        # Recursively copy each item in the directory
        for item in os.listdir(src):
            copy_item(os.path.join(src, item), os.path.join(dst, item))
    else:
        # If the item is a file, copy it directly
        shutil.copy(src, dst)

# Copy each file or directory from source to destination, excluding the 'drive' folder
for item in os.listdir(source_folder):
    copy_item(os.path.join(source_folder, item), os.path.join(destination_folder, item))