In [None]:
!pip install transformers
!pip install torch
! pip install bert_score
! pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
from transformers import DebertaTokenizer, DebertaForQuestionAnswering
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from bert_score import score
import nltk
nltk.download('wordnet')
from evaluate import load
bertscore = load("bertscore")
import bert_score
import pandas as pd
import logging
import warnings

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Data Preparation

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
def load_dataset(file_name):
    import pandas as pd
    import json
    
    df = []
    with open('/content/' + file_name,encoding="utf-8") as f:
        for i in f:
            i = json.loads(i)
            tweet = i['postText']
            article_title = i['targetTitle']
            article = ' '.join(i['targetParagraphs'])
            target = i['spoiler']
            label = i['tags']

            tweet = tweet[0]
            target = target[0]
            label = label[0]
            
            if label == 'multi':
              continue
            
            df += [{'clickbait_tweet': tweet,
                    'target_paragraphs':label + article_title + article, 
                    'spoiler': target,
                    'label': label}]

            data = pd.DataFrame(df)

        data['ans_start_idx'] = data.apply(lambda row: row['target_paragraphs'].index(row['spoiler']), axis=1)
        data['ans_end_idx'] = data.apply(lambda row : row['ans_start_idx'] + len(row['spoiler']), axis=1)


    return data

In [None]:
def data_preprocess():

  train_dataset = load_dataset('train.jsonl')
  validation_dataset = load_dataset('validation.jsonl') 

  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  train_dataset['ans_start_token_idx'] = train_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_start_idx']], add_special_tokens=False)), axis=1)
  train_dataset['ans_end_token_idx'] = train_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_end_idx']], add_special_tokens=False)), axis=1)

  validation_dataset['ans_start_token_idx'] = validation_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_start_idx']], add_special_tokens=False)), axis=1)
  validation_dataset['ans_end_token_idx'] = validation_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_end_idx']], add_special_tokens=False)), axis=1)

  #Sequence Pruning
  train_dataset = train_dataset[train_dataset['ans_end_token_idx']<513]
  train_dataset.reset_index(inplace=True)
  validation_dataset = validation_dataset[validation_dataset['ans_end_token_idx']<513]
  validation_dataset.reset_index(inplace=True)

  return train_dataset, validation_dataset


In [None]:
# Load data
train_dataset, validation_dataset = data_preprocess()

Token indices sequence length is longer than the specified maximum sequence length for this model (582 > 512). Running this sequence through the model will result in indexing errors


In [None]:
validation_dataset["label"].value_counts()

phrase     325
passage    285
Name: label, dtype: int64

In [None]:
class ClickbaitSpoilerDataset(Dataset):
    def __init__(self, clickbait_tweets, target_paragraphs, answer_start_indices, answer_end_indices, tokenizer):
        self.clickbait_tweets = clickbait_tweets
        self.target_paragraphs = target_paragraphs
        self.answer_start_indices = answer_start_indices
        self.answer_end_indices = answer_end_indices
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.clickbait_tweets)

    def __getitem__(self, idx):
        clickbait_tweet = self.clickbait_tweets[idx]
        target_paragraph = self.target_paragraphs[idx]
        answer_start_idx = self.answer_start_indices[idx]
        answer_end_idx = self.answer_end_indices[idx]

        inputs = self.tokenizer.encode_plus(
            clickbait_tweet,
            target_paragraph,
            add_special_tokens=True,
            return_tensors='pt',
            max_length=512,
            truncation=True        )

        input_ids = inputs['input_ids'].squeeze()
        token_type_ids = inputs['token_type_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()

        start_positions = torch.tensor(answer_start_idx)
        end_positions = torch.tensor(answer_end_idx)

        return input_ids, token_type_ids, attention_mask, start_positions, end_positions

In [None]:
def train(model, train_dataloader, optimizer, device, epoch):
    model.train()
    train_loss = 0
    total_correct = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader):
        input_ids, token_type_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )

        loss = outputs[0]
        train_loss += loss.item()

        start_preds, end_preds = outputs[1], outputs[2]
        _, start_preds = start_preds.max(dim=1)
        _, end_preds = end_preds.max(dim=1)
        total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
        total_samples += start_positions.size(0)

        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            acc = 100.0 * total_correct / total_samples
            avg_loss = train_loss / (step + 1)
            print(f"Step [{step}/{len(train_dataloader)}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    avg_loss = train_loss / len(train_dataloader)
    acc = 100.0 * total_correct / total_samples
    print(f"Epoch [{epoch}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    return avg_loss, acc

In [None]:
def eval(model, dataloader, optimizer, tokenizer, device, epoch):
    logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
    warnings.filterwarnings('ignore')
    model.eval()
    eval_loss = 0
    total_correct = 0
    total_samples = 0
    blue_score = 0
    meteor_score = 0
    f1_score = 0

    results = pd.DataFrame(columns=['epoch','target', 'predicted','original','start_pred','start_ori','end_preds','end_ori'])

    with torch.no_grad():
        for step,batch in enumerate(dataloader):
            print(step,len(batch))
            input_ids, token_type_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]

            outputs = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask
            )

            start_logits, end_logits = outputs.start_logits, outputs.end_logits
            start_preds, end_preds = start_logits.argmax(dim=1), end_logits.argmax(dim=1)

            for i in range(len(input_ids)):
                input_id = input_ids[i].tolist()
                start_pred = start_preds[i].item()
                end_pred = end_preds[i].item()

                answer = tokenizer.decode(input_id[start_pred:end_pred+1], skip_special_tokens=True)
                answer = answer.strip()

                reference = tokenizer.decode(input_id[start_positions[i]:end_positions[i]+1], skip_special_tokens=True)
                reference = reference.strip()

                new_row = {'epoch': epoch,
                           'target': tokenizer.decode(input_id,skip_special_tokens=True),
                           'predicted': answer,
                           'original' : reference,
                           'start_pred' : start_pred,
                           'start_ori' : start_positions[i].item(),
                           'end_preds' : end_pred,
                           'end_ori' : end_positions[i].item()}

                results = results.append(new_row, ignore_index = True)
                
                results.to_csv('/content/gdrive/My Drive/Bert_baseline_results_eval.csv', mode='a', header=False, index=False)

                blue_score += nltk.translate.bleu_score.sentence_bleu([tokenizer.tokenize(reference)], tokenizer.tokenize(answer))
                meteor_score += nltk.translate.meteor_score.meteor_score([tokenizer.tokenize(reference)], tokenizer.tokenize(answer))
                precision, recall, f1 = bert_score.score([answer], [reference], lang="en",model_type='bert-base-uncased')
                f1_score += f1

            total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
            total_samples += start_positions.size(0)

    avg_loss = eval_loss / len(dataloader)
    acc = 100.0 * total_correct / total_samples
    blue_score /= total_samples
    meteor_score /= total_samples
    f1_score = f1_score.item()/ total_samples

    print(f"Eval Loss: {avg_loss:.4f}, Eval Acc: {acc:.2f}%, BLUE Score: {blue_score:.4f}, METEOR Score: {meteor_score:.4f}, F1 Score: {f1_score:.4f}")

    return avg_loss, acc, blue_score, meteor_score

In [None]:
train_dataset["label"].value_counts()

phrase     1315
passage    1128
Name: label, dtype: int64

In [None]:
clickbait_tweets = train_dataset['clickbait_tweet']
target_paragraphs = train_dataset['target_paragraphs']
answer_start_indices = train_dataset['ans_start_token_idx']
answer_end_indices = train_dataset['ans_end_token_idx']

val_clickbait_tweets = validation_dataset['clickbait_tweet']
val_target_paragraphs = validation_dataset['target_paragraphs']
val_answer_start_indices = validation_dataset['ans_start_token_idx']
val_answer_end_indices = validation_dataset['ans_end_token_idx']

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',truncation = True)
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Create dataset and dataloader
train_dataset_object = ClickbaitSpoilerDataset(clickbait_tweets, target_paragraphs, answer_start_indices, answer_end_indices, tokenizer)
val_dataset_object = ClickbaitSpoilerDataset(val_clickbait_tweets, val_target_paragraphs, val_answer_start_indices, val_answer_end_indices, tokenizer)

def collate_fn(batch):
  input_ids = [item[0] for item in batch]
  token_type_ids = [item[1] for item in batch]
  attention_mask = [item[2] for item in batch]
  start_positions = [item[3] for item in batch]
  end_positions = [item[4] for item in batch]

  input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
  token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
  attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=tokenizer.pad_token_id)
  #start_positions = pad_sequence(start_positions, batch_first=True, padding_value=tokenizer.pad_token_id)
  #end_positions = pad_sequence(end_positions, batch_first=True, padding_value=tokenizer.pad_token_id)

  return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), torch.tensor(start_positions), torch.tensor(end_positions)

train_dataloader = DataLoader(train_dataset_object, batch_size=8, shuffle=True,collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset_object, batch_size=8, shuffle=True,collate_fn=collate_fn)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [None]:
# Train model
for epoch in range(5):
    logging.getLogger("transformers").setLevel(logging.ERROR)
    train(model, train_dataloader, optimizer, device,epoch)

eval(model, val_dataloader, optimizer, tokenizer, device,epoch)


  return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), torch.tensor(start_positions), torch.tensor(end_positions)


Step [0/306], Train Loss: 6.4188, Train Acc: 0.00%
Step [100/306], Train Loss: 5.9148, Train Acc: 0.00%
Step [200/306], Train Loss: 5.8183, Train Acc: 0.06%
Step [300/306], Train Loss: 5.7587, Train Acc: 0.29%
Epoch [0], Train Loss: 5.7568, Train Acc: 0.29%
Step [0/306], Train Loss: 5.3329, Train Acc: 0.00%
Step [100/306], Train Loss: 5.4320, Train Acc: 0.50%
Step [200/306], Train Loss: 5.3961, Train Acc: 0.56%
Step [300/306], Train Loss: 5.3662, Train Acc: 0.50%
Epoch [1], Train Loss: 5.3657, Train Acc: 0.49%
Step [0/306], Train Loss: 4.7002, Train Acc: 0.00%
Step [100/306], Train Loss: 4.7988, Train Acc: 1.61%
Step [200/306], Train Loss: 4.7234, Train Acc: 1.37%
Step [300/306], Train Loss: 4.6792, Train Acc: 1.37%
Epoch [2], Train Loss: 4.6780, Train Acc: 1.35%
Step [0/306], Train Loss: 3.9511, Train Acc: 0.00%
Step [100/306], Train Loss: 3.4522, Train Acc: 6.93%
Step [200/306], Train Loss: 3.5258, Train Acc: 6.09%
Step [300/306], Train Loss: 3.5387, Train Acc: 5.69%
Epoch [3], Train



1 5




2 5
3 5
4 5




5 5




6 5
7 5




8 5
9 5
10 5




11 5




12 5




13 5




14 5




15 5




16 5




17 5




18 5




19 5
20 5




21 5




22 5




23 5




24 5
25 5




26 5
27 5




28 5




29 5




30 5




31 5




32 5




33 5




34 5
35 5




36 5




37 5




38 5




39 5
40 5




41 5




42 5
43 5
44 5
45 5




46 5




47 5




48 5




49 5




50 5




51 5




52 5




53 5
54 5




55 5




56 5




57 5




58 5
59 5




60 5




61 5




62 5




63 5




64 5




65 5




66 5




67 5




68 5




69 5




70 5
71 5




72 5
73 5




74 5




75 5
76 5




Eval Loss: 0.0000, Eval Acc: 0.66%, BLUE Score: 0.0755, METEOR Score: 0.2073, F1 Score: 0.3703


In [None]:
# Save model
model.save_pretrained('/content/gdrive/My Drive/Bert_baseline_model')