In [1]:
#Library Installations
!pip install transformers
!pip install torch
! pip install bert_score
! pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1
Looking in i

In [2]:
#Imports
import torch
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from bert_score import score
import nltk
nltk.download('wordnet')
from evaluate import load
bertscore = load("bertscore")
import bert_score
import pandas as pd
import logging
import warnings

[nltk_data] Downloading package wordnet to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

#### Data Preparation

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
def load_dataset(file_name):
    import pandas as pd
    import json
    
    df = []
    with open('/content/' + file_name,encoding="utf-8") as f:
        for i in f:
            i = json.loads(i)
            tweet = i['postText']
            article_title = i['targetTitle']
            article = ' '.join(i['targetParagraphs'])
            target = i['spoiler']
            label = i['tags']

            tweet = tweet[0]
            target = target[0]
            label = label[0]
            
            if label == 'multi':
              continue
            
            df += [{'clickbait_tweet': tweet,
                    'target_paragraphs':label + article_title + article, 
                    'spoiler': target,
                    'label': label}]

            data = pd.DataFrame(df)

        data['ans_start_idx'] = data.apply(lambda row: row['target_paragraphs'].index(row['spoiler']), axis=1)
        data['ans_end_idx'] = data.apply(lambda row : row['ans_start_idx'] + len(row['spoiler']), axis=1)


    return data

In [5]:
def data_preprocess():

  train_dataset = load_dataset('train.jsonl')
  validation_dataset = load_dataset('validation.jsonl') 

  tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
  train_dataset['ans_start_token_idx'] = train_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_start_idx']], add_special_tokens=False)), axis=1)
  train_dataset['ans_end_token_idx'] = train_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_end_idx']], add_special_tokens=False)), axis=1)

  validation_dataset['ans_start_token_idx'] = validation_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_start_idx']], add_special_tokens=False)), axis=1)
  validation_dataset['ans_end_token_idx'] = validation_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_end_idx']], add_special_tokens=False)), axis=1)

  #Sequence Pruning
  train_dataset = train_dataset[train_dataset['ans_end_token_idx']<513]
  train_dataset.reset_index(inplace=True)
  validation_dataset = validation_dataset[validation_dataset['ans_end_token_idx']<513]
  validation_dataset.reset_index(inplace=True)

  return train_dataset, validation_dataset


In [6]:
# Load data
train_dataset, validation_dataset = data_preprocess()

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


In [8]:
class ClickbaitSpoilerDataset(Dataset):
    def __init__(self, clickbait_tweets, target_paragraphs, answer_start_indices, answer_end_indices, tokenizer):
        self.clickbait_tweets = clickbait_tweets
        self.target_paragraphs = target_paragraphs
        self.answer_start_indices = answer_start_indices
        self.answer_end_indices = answer_end_indices
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.clickbait_tweets)

    def __getitem__(self, idx):
        clickbait_tweet = self.clickbait_tweets[idx]
        target_paragraph = self.target_paragraphs[idx]
        answer_start_idx = self.answer_start_indices[idx]
        answer_end_idx = self.answer_end_indices[idx]

        inputs = self.tokenizer.encode_plus(
            clickbait_tweet,
            target_paragraph,
            add_special_tokens=True,
            return_tensors='pt',
            max_length=512,
            truncation=True        )

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()

        start_positions = torch.tensor(answer_start_idx)
        end_positions = torch.tensor(answer_end_idx)

        return input_ids, attention_mask, start_positions, end_positions

In [9]:
def train(model, train_dataloader, optimizer, device, epoch):
    model.train()
    train_loss = 0
    total_correct = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )

        loss = outputs[0]
        train_loss += loss.item()

        start_preds, end_preds = outputs[1], outputs[2]
        _, start_preds = start_preds.max(dim=1)
        _, end_preds = end_preds.max(dim=1)
        total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
        total_samples += start_positions.size(0)

        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            acc = 100.0 * total_correct / total_samples
            avg_loss = train_loss / (step + 1)
            print(f"Step [{step}/{len(train_dataloader)}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    avg_loss = train_loss / len(train_dataloader)
    acc = 100.0 * total_correct / total_samples
    print(f"Epoch [{epoch}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    return avg_loss, acc

In [10]:
def eval(model, dataloader, optimizer, tokenizer, device, epoch):
    logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
    warnings.filterwarnings('ignore')
    model.eval()
    eval_loss = 0
    total_correct = 0
    total_samples = 0
    blue_score = 0
    meteor_score = 0
    f1_score = 0

    results = pd.DataFrame(columns=['epoch','target', 'predicted','original','start_pred','start_ori','end_preds','end_ori'])

    with torch.no_grad():
        for step,batch in enumerate(dataloader):
            print(step,len(batch))
            input_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            start_logits, end_logits = outputs.start_logits, outputs.end_logits
            start_preds, end_preds = start_logits.argmax(dim=1), end_logits.argmax(dim=1)

            for i in range(len(input_ids)):
                input_id = input_ids[i].tolist()
                start_pred = start_preds[i].item()
                end_pred = end_preds[i].item()

                answer = tokenizer.decode(input_id[start_pred:end_pred+1], skip_special_tokens=True)
                answer = answer.strip()

                reference = tokenizer.decode(input_id[start_positions[i]:end_positions[i]+1], skip_special_tokens=True)
                reference = reference.strip()

                new_row = {'epoch': epoch,
                           'target': tokenizer.decode(input_id,skip_special_tokens=True),
                           'predicted': answer,
                           'original' : reference,
                           'start_pred' : start_pred,
                           'start_ori' : start_positions[i].item(),
                           'end_preds' : end_pred,
                           'end_ori' : end_positions[i].item()}

                results = results.append(new_row, ignore_index = True)
                
                results.to_csv('/content/gdrive/My Drive/RoBERTa_results_eval.csv', mode='a', header=False, index=False)

                blue_score += nltk.translate.bleu_score.sentence_bleu([tokenizer.tokenize(reference)], tokenizer.tokenize(answer))
                meteor_score += nltk.translate.meteor_score.meteor_score([tokenizer.tokenize(reference)], tokenizer.tokenize(answer))
                precision, recall, f1 = bert_score.score([answer], [reference], lang="en",model_type='bert-base-uncased')
                f1_score += f1

            total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
            total_samples += start_positions.size(0)

    avg_loss = eval_loss / len(dataloader)
    acc = 100.0 * total_correct / total_samples
    blue_score /= total_samples
    meteor_score /= total_samples
    f1_score = f1_score.item()/ total_samples

    print(f"Eval Loss: {avg_loss:.4f}, Eval Acc: {acc:.2f}%, BLUE Score: {blue_score:.4f}, METEOR Score: {meteor_score:.4f}, F1 Score: {f1_score:.4f}")

    return avg_loss, acc, blue_score, meteor_score

In [12]:
clickbait_tweets = train_dataset['clickbait_tweet']
target_paragraphs = train_dataset['target_paragraphs']
answer_start_indices = train_dataset['ans_start_token_idx']
answer_end_indices = train_dataset['ans_end_token_idx']

val_clickbait_tweets = validation_dataset['clickbait_tweet']
val_target_paragraphs = validation_dataset['target_paragraphs']
val_answer_start_indices = validation_dataset['ans_start_token_idx']
val_answer_end_indices = validation_dataset['ans_end_token_idx']

# Initialize tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base',truncation = True)
model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

# Create dataset and dataloader
train_dataset_object = ClickbaitSpoilerDataset(clickbait_tweets, target_paragraphs, answer_start_indices, answer_end_indices, tokenizer)
val_dataset_object = ClickbaitSpoilerDataset(val_clickbait_tweets, val_target_paragraphs, val_answer_start_indices, val_answer_end_indices, tokenizer)

def collate_fn(batch):
  input_ids = [item[0] for item in batch]
  attention_mask = [item[1] for item in batch]
  start_positions = [item[2] for item in batch]
  end_positions = [item[3] for item in batch]

  input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
  attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=tokenizer.pad_token_id)

  return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(start_positions), torch.tensor(end_positions)

train_dataloader = DataLoader(train_dataset_object, batch_size=8, shuffle=True,collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset_object, batch_size=8, shuffle=True,collate_fn=collate_fn)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use 

In [13]:
# Train model
for epoch in range(5):
    logging.getLogger("transformers").setLevel(logging.ERROR)
    train(model, train_dataloader, optimizer, device,epoch)

eval(model, val_dataloader, optimizer, tokenizer, device,epoch)


  return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(start_positions), torch.tensor(end_positions)


Step [0/307], Train Loss: 6.2475, Train Acc: 0.00%
Step [100/307], Train Loss: 5.7936, Train Acc: 0.00%
Step [200/307], Train Loss: 5.6164, Train Acc: 0.12%
Step [300/307], Train Loss: 5.5019, Train Acc: 0.25%
Epoch [0], Train Loss: 5.4856, Train Acc: 0.24%
Step [0/307], Train Loss: 4.1427, Train Acc: 0.00%
Step [100/307], Train Loss: 5.0251, Train Acc: 0.25%
Step [200/307], Train Loss: 4.9029, Train Acc: 0.25%
Step [300/307], Train Loss: 4.8490, Train Acc: 0.54%
Epoch [1], Train Loss: 4.8493, Train Acc: 0.57%
Step [0/307], Train Loss: 4.7955, Train Acc: 0.00%
Step [100/307], Train Loss: 4.1795, Train Acc: 1.36%
Step [200/307], Train Loss: 4.1528, Train Acc: 1.12%
Step [300/307], Train Loss: 4.1421, Train Acc: 1.04%
Epoch [2], Train Loss: 4.1508, Train Acc: 1.02%
Step [0/307], Train Loss: 3.1095, Train Acc: 25.00%
Step [100/307], Train Loss: 3.5032, Train Acc: 2.48%
Step [200/307], Train Loss: 3.4953, Train Acc: 2.61%
Step [300/307], Train Loss: 3.4886, Train Acc: 2.28%
Epoch [3], Trai

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]



1 4
2 4
3 4
4 4




5 4
6 4




7 4




8 4
9 4




10 4




11 4
12 4




13 4




14 4




15 4
16 4




17 4




18 4




19 4
20 4




21 4
22 4
23 4
24 4




25 4




26 4




27 4
28 4




29 4
30 4




31 4
32 4




33 4




34 4
35 4
36 4




37 4
38 4




39 4
40 4




41 4
42 4




43 4




44 4




45 4
46 4




47 4
48 4




49 4




50 4
51 4




52 4




53 4




54 4
55 4




56 4




57 4




58 4




59 4




60 4
61 4




62 4




63 4




64 4
65 4
66 4
67 4




68 4
69 4
70 4




71 4
72 4
73 4




74 4
75 4




76 4
Eval Loss: 0.0000, Eval Acc: 1.64%, BLUE Score: 0.0843, METEOR Score: 0.2137, F1 Score: 0.4373


In [None]:

# Save model
model.save_pretrained('/content/gdrive/My Drive/Roberta_baseline_model')