In [1]:
import pandas as pd
import json
import re
import string
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Read Data Files

In [2]:
# Colab-only step: prompts for a file upload; the captured output below shows the
# file being saved under its own name in the working directory.
from google.colab import files
uploaded = files.upload()

Saving Sarcasm_Headlines_Dataset.json to Sarcasm_Headlines_Dataset.json


In [3]:
# lines=True makes pandas read the file as JSON Lines (one JSON object per line).
file_name = "Sarcasm_Headlines_Dataset.json"
sarcasm_data = pd.read_json(file_name, lines = True)
# bare last expression so the notebook renders the frame
sarcasm_data

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


# Clean Text

In [4]:
def clean_headlines(text, remove_stopwords=False, stop_words=None):
  """Normalise one headline for counting / tokenization.

  Steps, in order: lower-case, delete every character in
  ``string.punctuation``, delete ASCII digits, and optionally drop stop words.
  Whitespace is left untouched unless stop words are removed, in which case the
  remaining words are re-joined with single spaces.

  Args:
    text: the raw headline (str).
    remove_stopwords: when True, drop stop words from the result.
    stop_words: optional iterable of stop words; defaults to NLTK's English
      list. Pass a pre-built collection to avoid reloading the NLTK corpus on
      every call (e.g. when used with ``Series.apply``).

  Returns:
    The cleaned headline as a str.
  """
  strip_punctuation = str.maketrans('', '', string.punctuation)

  text = text.lower()
  text = text.translate(strip_punctuation)  # delete punctuation
  text = re.sub(r'[0-9]', '', text)  # remove numbers

  # Remove stop words
  if remove_stopwords:
    if stop_words is None:
      stop_words = stopwords.words("english")
    # The text has already lost its punctuation, so stop words such as "don't"
    # must be normalised the same way or they can never match ("dont").
    stops = {w.lower().translate(strip_punctuation) for w in stop_words}
    text = " ".join(w for w in text.split() if w not in stops)

  return text

In [5]:
# Store the cleaned text in a new column; clean_headlines is called with its
# defaults here, so stop words are kept.
sarcasm_data['headline_new'] = sarcasm_data['headline'].apply(clean_headlines)
sarcasm_data

Unnamed: 0,is_sarcastic,headline,article_link,headline_new
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...,dem rep totally nails why congress is falling ...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...,eat your veggies deliciously different recipes
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...,mother comes pretty close to using word stream...
...,...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...,jews to celebrate rosh hashasha or something
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...,internal affairs investigator disappointed con...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...,the most beautiful acceptance speech this week...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...,mars probe destroyed by orbiting spielberggate...


In [6]:
# word -> number of occurrences across all cleaned headlines
dict_of_words = Counter(" ".join(sarcasm_data['headline_new']).split())
dict_of_words

Counter({'thirtysomething': 1,
         'scientists': 103,
         'unveil': 13,
         'doomsday': 5,
         'clock': 18,
         'of': 6264,
         'hair': 59,
         'loss': 38,
         'dem': 4,
         'rep': 25,
         'totally': 55,
         'nails': 13,
         'why': 519,
         'congress': 140,
         'is': 1715,
         'falling': 23,
         'short': 40,
         'on': 2627,
         'gender': 26,
         'racial': 20,
         'equality': 25,
         'eat': 46,
         'your': 668,
         'veggies': 2,
         'deliciously': 1,
         'different': 64,
         'recipes': 30,
         'inclement': 2,
         'weather': 27,
         'prevents': 5,
         'liar': 8,
         'from': 1329,
         'getting': 211,
         'to': 9062,
         'work': 202,
         'mother': 84,
         'comes': 79,
         'pretty': 119,
         'close': 42,
         'using': 69,
         'word': 64,
         'streaming': 9,
         'correctly': 5,
        

In [7]:
# (number of rows, number of columns)
sarcasm_data.shape

(28619, 4)

In [8]:
# class balance as fractions (normalize=True) instead of raw counts
sarcasm_data['is_sarcastic'].value_counts(normalize = True)

0    0.523603
1    0.476397
Name: is_sarcastic, dtype: float64

In [9]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 7.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 51.2 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 68.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 65.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [10]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train Test Split

In [11]:
# Stratified 70/30 split: 70% becomes the training set, the remaining 30%
# ("temp") is held back to be divided into validation and test sets.
train_headlines, temp_headlines, train_labels, temp_labels = train_test_split(
    sarcasm_data['headline_new'],
    sarcasm_data['is_sarcastic'],
    test_size=0.3,
    stratify=sarcasm_data['is_sarcastic'],
    random_state=42,
)

In [12]:
# check that stratification kept the class proportions in the training split
train_labels.value_counts(normalize = True)

0    0.523586
1    0.476414
Name: is_sarcastic, dtype: float64

In [13]:
# Divide the held-back "temp" portion again, stratified on its labels:
# 60% of it becomes the test set, the other 40% the validation set.
val_headlines, test_headlines, val_labels, test_labels = train_test_split(
    temp_headlines,
    temp_labels,
    test_size=0.6,
    stratify=temp_labels,
    random_state=42,
)

In [14]:
# check that stratification kept the class proportions in the validation split
val_labels.value_counts(normalize = True)

0    0.523588
1    0.476412
Name: is_sarcastic, dtype: float64

In [15]:
# sizes, in order: full data, train, held-back temp, validation, test
print(len(sarcasm_data),len(train_headlines),len(temp_headlines),len(val_headlines),len(test_headlines))

28619 20033 8586 3434 5152


# Import BERT Model

In [16]:
# one checkpoint name shared by the encoder and its tokenizer
BERT_CHECKPOINT = 'bert-base-uncased'

# pretrained BERT-base encoder
bert = AutoModel.from_pretrained(BERT_CHECKPOINT)

# fast tokenizer for the same checkpoint
tokenizer = BertTokenizerFast.from_pretrained(BERT_CHECKPOINT)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [17]:
# sample data
text = ["this is a bert model tutorial", "we will fine-tune a bert model"]

# encode text
# padding=True pads the shorter sentence up to the longest one in the batch
# (visible as the trailing 0 in the first input_ids / attention_mask row below).
sent_id = tokenizer.batch_encode_plus(text, padding=True, return_token_type_ids=False)
sent_id

{'input_ids': [[101, 2023, 2003, 1037, 14324, 2944, 14924, 4818, 102, 0], [101, 2057, 2097, 2986, 1011, 8694, 1037, 14324, 2944, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

# Get Statistical Features of Input Text

In [18]:
def create_lengths(text):
    """Count whitespace-separated words per sentence.

    Takes an iterable of strings and returns a DataFrame with a single
    'counts' column holding one word count per input sentence, in order.
    """
    word_counts = [len(sentence.split()) for sentence in text]
    return pd.DataFrame(word_counts, columns=['counts'])

In [19]:
# word-count statistics computed on the training headlines only
lengths_title = create_lengths(train_headlines)

print(lengths_title.describe())

             counts
count  20033.000000
mean       9.890880
std        3.280527
min        1.000000
25%        8.000000
50%       10.000000
75%       12.000000
max       38.000000


In [20]:
# Get 95th percentile of length of training headlines --> maximum length of sequence.
# The tokenizer's max_length counts WordPiece tokens *including* [CLS] and [SEP],
# so the cut-off is measured on tokenized lengths; a whitespace word count
# under-estimates it and truncates more than the intended 5% of headlines.
token_counts = pd.Series([len(ids) for ids in tokenizer(train_headlines.tolist())['input_ids']])
max_seq_len = int(token_counts.quantile(0.95))

# Tokenization

In [21]:
def encode_headlines(headlines):
    """Tokenize and encode a pandas Series of headlines.

    Every sequence is padded or truncated to exactly `max_seq_len` tokens so the
    result can be turned into a rectangular tensor. Token type ids are omitted.
    """
    return tokenizer.batch_encode_plus(
        headlines.tolist(),
        max_length=max_seq_len,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
        return_token_type_ids=False
    )


# tokenize and encode sequences in the training, validation and test sets
tokens_train = encode_headlines(train_headlines)
tokens_val = encode_headlines(val_headlines)
tokens_test = encode_headlines(test_headlines)



In [22]:
def _as_tensors(encoded, labels):
    """Return (input ids, attention mask, labels) as tensors for one split."""
    return (
        torch.tensor(encoded['input_ids']),
        torch.tensor(encoded['attention_mask']),
        torch.tensor(labels.tolist()),
    )


# one (ids, mask, labels) triple per split
train_seq, train_mask, train_y = _as_tensors(tokens_train, train_labels)
val_seq, val_mask, val_y = _as_tensors(tokens_val, val_labels)
test_seq, test_mask, test_y = _as_tensors(tokens_test, test_labels)

# Model Definition

In [23]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# mini-batch size shared by the training and validation loaders
batch_size = 32

# training split: batches are drawn in random order
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# validation split: batches are read sequentially, in their stored order
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

In [24]:
# turn off gradient tracking for every encoder weight so that only layers
# added on top of BERT are trained
for frozen_param in bert.parameters():
    frozen_param.requires_grad_(False)

In [25]:
class BERT_Arch(nn.Module):
    """Small classification head on top of a BERT-style encoder.

    The encoder's second output (for Hugging Face ``BertModel`` called with
    ``return_dict=False`` this is the pooled output) is passed through
    Linear -> ReLU -> Dropout -> Linear -> LogSoftmax.

    Args:
        bert: the encoder module; called as
            ``bert(sent_id, attention_mask=mask, return_dict=False)``.
        hidden_dim: width of the intermediate dense layer (default 512).
        num_classes: number of output classes (default 2).
        dropout: dropout probability applied after the ReLU (default 0.1).
    """

    def __init__(self, bert, hidden_dim=512, num_classes=2, dropout=0.1):

        super(BERT_Arch, self).__init__()

        self.bert = bert

        # Width of the encoder output. Read from the encoder's config when it
        # has one so other checkpoints work; 768 (the previous hard-coded
        # value) is kept as the fallback.
        bert_dim = getattr(getattr(bert, 'config', None), 'hidden_size', 768)

        # dropout layer
        self.dropout = nn.Dropout(dropout)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(bert_dim, hidden_dim)

        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

        # log-softmax over the class dimension (pairs with nn.NLLLoss)
        self.softmax = nn.LogSoftmax(dim=1)

    # define the forward pass
    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: token id tensor passed to the encoder.
            mask: attention mask tensor passed to the encoder.

        Returns:
            The log-softmax (over dim 1) of the output layer.
        """
        # pass the inputs to the encoder; only its second output is used
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # apply log-softmax activation
        x = self.softmax(x)

        return x

In [26]:
# wrap the pre-trained BERT in our architecture and move it to the target device
model = BERT_Arch(bert).to(device)

In [27]:
# define the optimizer
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values transformers.AdamW used by
# default, so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, eps=1e-6, weight_decay=0.0)

In [28]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

#compute the class weights
# `classes` and `y` are passed by keyword: recent scikit-learn releases
# reject them as positional arguments.
class_wts = compute_class_weight(class_weight='balanced',
                                 classes=np.unique(train_labels),
                                 y=train_labels)

print(class_wts)

[0.95495281 1.04950754]


In [29]:
# per-class loss weights as a float tensor on the training device
weights = torch.tensor(class_wts, dtype=torch.float).to(device)

# weighted negative log-likelihood loss
cross_entropy = nn.NLLLoss(weight=weights)

# how many passes to make over the training set
epochs = 10

In [30]:
# function to train the model
def train():
  """Run one training epoch over `train_dataloader`.

  Uses the notebook-level `model`, `optimizer`, `cross_entropy` and `device`.

  Returns:
    (avg_loss, total_preds): the mean per-batch loss of the epoch and a numpy
    array with the model outputs for every batch, concatenated along axis 0.
  """
  model.train()

  total_loss = 0

  # model outputs of every batch, collected as numpy arrays
  total_preds = []

  n_batches = len(train_dataloader)

  # iterate over batches
  for step, batch in enumerate(train_dataloader):

    # progress update after every 100 batches.
    if step % 100 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

    # push the batch to the device and unpack it
    sent_id, mask, labels = [r.to(device) for r in batch]

    # clear previously calculated gradients
    model.zero_grad()

    # get model predictions for the current batch
    preds = model(sent_id, mask)

    # compute the loss between actual and predicted values
    loss = cross_entropy(preds, labels)

    # add on to the total loss
    total_loss = total_loss + loss.item()

    # backward pass to calculate the gradients
    loss.backward()

    # clip the gradient norm to 1.0. It helps in preventing the exploding gradient problem
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # update parameters
    optimizer.step()

    # detach from the graph and move to CPU before converting to numpy
    total_preds.append(preds.detach().cpu().numpy())

  # compute the training loss of the epoch
  avg_loss = total_loss / n_batches

  # total_preds is a list with one array per batch;
  # join them into a single array along axis 0
  total_preds = np.concatenate(total_preds, axis=0)

  # returns the loss and predictions
  return avg_loss, total_preds

In [31]:
# function for evaluating the model
def evaluate():
  """Compute loss and model outputs over `val_dataloader` without training.

  Uses the notebook-level `model`, `cross_entropy` and `device`.

  Returns:
    (avg_loss, total_preds): the mean per-batch validation loss and a numpy
    array with the model outputs for every batch, concatenated along axis 0.
  """
  print("\nEvaluating...")

  # deactivate dropout layers
  model.eval()

  total_loss = 0

  # model outputs of every batch, collected as numpy arrays
  total_preds = []

  n_batches = len(val_dataloader)

  # deactivate autograd for the whole pass
  with torch.no_grad():

    # iterate over batches
    for step, batch in enumerate(val_dataloader):

      # Progress update every 100 batches.
      if step % 100 == 0 and not step == 0:
        print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

      # push the batch to the device and unpack it
      sent_id, mask, labels = [t.to(device) for t in batch]

      # model predictions
      preds = model(sent_id, mask)

      # compute the validation loss between actual and predicted values
      loss = cross_entropy(preds, labels)
      total_loss = total_loss + loss.item()

      total_preds.append(preds.cpu().numpy())

  # compute the validation loss of the epoch
  avg_loss = total_loss / n_batches

  # join the per-batch arrays into a single array along axis 0
  total_preds = np.concatenate(total_preds, axis=0)

  return avg_loss, total_preds

# Training Steps

In [32]:
# lowest validation loss seen so far; starts at infinity so that any finite
# loss counts as an improvement
best_valid_loss = float('inf')

# per-epoch loss history
train_losses, valid_losses = [], []

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # one pass over the training data, then one over the validation data
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # record this epoch's losses
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    # checkpoint whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights_v2.pt')

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10
  Batch   100  of    627.
  Batch   200  of    627.
  Batch   300  of    627.
  Batch   400  of    627.
  Batch   500  of    627.
  Batch   600  of    627.

Evaluating...
  Batch   100  of    108.

Training Loss: 0.597
Validation Loss: 0.524

 Epoch 2 / 10
  Batch   100  of    627.
  Batch   200  of    627.
  Batch   300  of    627.
  Batch   400  of    627.
  Batch   500  of    627.
  Batch   600  of    627.

Evaluating...
  Batch   100  of    108.

Training Loss: 0.540
Validation Loss: 0.485

 Epoch 3 / 10
  Batch   100  of    627.
  Batch   200  of    627.
  Batch   300  of    627.
  Batch   400  of    627.
  Batch   500  of    627.
  Batch   600  of    627.

Evaluating...
  Batch   100  of    108.

Training Loss: 0.523
Validation Loss: 0.503

 Epoch 4 / 10
  Batch   100  of    627.
  Batch   200  of    627.
  Batch   300  of    627.
  Batch   400  of    627.
  Batch   500  of    627.
  Batch   600  of    627.

Evaluating...
  Batch   100  of    108.

Training Loss: 0

# Load Saved Model and Perform Testing

In [None]:
#load weights of best model
path = 'saved_weights_v2.pt'
# map_location places the tensors on the current device, so a checkpoint saved
# on a GPU can also be loaded on a CPU-only machine
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [None]:
# get predictions for test data
# Make eval mode (dropout off) explicit instead of relying on an earlier cell,
# and run the test set in mini-batches: one forward pass over the whole set
# can exhaust device memory.
model.eval()
test_batches = []
with torch.no_grad():
  for start in range(0, len(test_seq), batch_size):
    stop = start + batch_size
    batch_out = model(test_seq[start:stop].to(device), test_mask[start:stop].to(device))
    test_batches.append(batch_out.cpu())
preds = torch.cat(test_batches).numpy()

In [None]:
# model's performance: pick the highest-scoring class for each sample, then
# compare against the true test labels
preds = preds.argmax(axis=1)
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.83      0.77      0.80      2698
           1       0.77      0.83      0.80      2454

    accuracy                           0.80      5152
   macro avg       0.80      0.80      0.80      5152
weighted avg       0.80      0.80      0.80      5152



In [None]:
# confusion matrix: rows are the true labels, columns the predicted labels
pd.crosstab(test_y.numpy(), preds, rownames=['actual'], colnames=['predicted'])

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2081,617
1,412,2042
