In [None]:

!pip install transformers
!pip install torch
! pip install bert_score
! pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in in

In [None]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
from transformers import DebertaTokenizer, DebertaForQuestionAnswering
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from bert_score import score
import nltk
nltk.download('wordnet')
from evaluate import load
bertscore = load("bertscore")
import bert_score
import pandas as pd
import logging
import warnings

[nltk_data] Downloading package wordnet to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

#### Data Preparation

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
def load_dataset(file_name, base_dir='/content/'):
    """Read a clickbait-spoiling JSONL file into a DataFrame of QA examples.

    Every non-blank line is parsed as a JSON object whose ``postText``,
    ``targetTitle``, ``targetParagraphs``, ``spoiler`` and ``tags`` fields are
    read. Only the first element of ``postText``, ``spoiler`` and ``tags`` is
    used, and records whose first tag is ``'multi'`` are skipped.

    Args:
        file_name: Name of the JSONL file, resolved relative to ``base_dir``.
        base_dir: Directory that contains the file (defaults to ``'/content/'``,
            the location the notebook originally hard-coded).

    Returns:
        pandas.DataFrame with the columns ``clickbait_tweet``,
        ``target_paragraphs`` (tag + title + space-joined paragraphs,
        concatenated without separators), ``spoiler``, ``label``,
        ``ans_start_idx`` and ``ans_end_idx``. The two index columns are the
        character offsets of the first occurrence of the spoiler inside
        ``target_paragraphs``; the end offset is exclusive. The frame is empty
        (but has all columns) when the file holds no usable record.

    Raises:
        ValueError: if a spoiler does not occur in its ``target_paragraphs``.
    """
    import json
    import os

    import pandas as pd

    columns = ['clickbait_tweet', 'target_paragraphs', 'spoiler', 'label',
               'ans_start_idx', 'ans_end_idx']
    records = []

    with open(os.path.join(base_dir, file_name), encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            entry = json.loads(line)
            label = entry['tags'][0]
            if label == 'multi':
                continue

            tweet = entry['postText'][0]
            spoiler = entry['spoiler'][0]
            article = ' '.join(entry['targetParagraphs'])
            text = label + entry['targetTitle'] + article

            # str.index raises ValueError when the spoiler is absent, which
            # surfaces malformed records instead of silently mislabeling them.
            start = text.index(spoiler)

            records.append({'clickbait_tweet': tweet,
                            'target_paragraphs': text,
                            'spoiler': spoiler,
                            'label': label,
                            'ans_start_idx': start,
                            'ans_end_idx': start + len(spoiler)})

    # Build the frame once; re-creating it inside the loop is quadratic and
    # leaves the result undefined for files without any usable row.
    return pd.DataFrame(records, columns=columns)

In [None]:
def _prefix_token_count(tokenizer, text, char_idx):
  """Return how many tokens ``tokenizer`` yields for ``text[:char_idx]`` (no special tokens)."""
  # Trailing whitespace is stripped before counting.
  # NOTE(review): this assumes a byte-level BPE tokenizer may emit a dangling
  # trailing space as its own token, which would shift the count by one;
  # spot-check a few rows against the tokenized full text.
  return len(tokenizer.encode(text[:char_idx].rstrip(), add_special_tokens=False))


def _add_answer_token_indices(dataset, tokenizer, max_length):
  """Attach token-level answer bounds and drop rows whose answer ends at or past ``max_length``."""
  texts = dataset['target_paragraphs']
  start_tokens = [_prefix_token_count(tokenizer, text, idx)
                  for text, idx in zip(texts, dataset['ans_start_idx'])]
  # The prefix that ends at ans_end_idx contains the answer's last token, so
  # "token count - 1" is that token's index. An inclusive end matches how
  # eval() slices [start:end + 1]. The max() keeps the span non-empty.
  end_tokens = [max(_prefix_token_count(tokenizer, text, idx) - 1, start)
                for text, idx, start in zip(texts, dataset['ans_end_idx'], start_tokens)]

  dataset = dataset.assign(ans_start_token_idx=start_tokens,
                           ans_end_token_idx=end_tokens)
  # Inclusive end < max_length keeps exactly the rows the former
  # "exclusive end < 513" filter kept when max_length is 512.
  # NOTE(review): indices are relative to the paragraph only; the tweet and
  # special tokens that precede it in the model input are not counted here.
  kept = dataset[dataset['ans_end_token_idx'] < max_length]
  # reset_index() (without drop) keeps the old row labels in an 'index' column,
  # as the previous in-place call did, but returns a fresh frame instead of
  # mutating a filtered slice.
  return kept.reset_index()


def data_preprocess(model_name='microsoft/deberta-base', max_length=512):
  """Load the train/validation splits and add token-level answer indices.

  Reads ``train.jsonl`` and ``validation.jsonl`` through ``load_dataset`` and
  adds the columns ``ans_start_token_idx`` and ``ans_end_token_idx`` (both
  inclusive token indices within ``target_paragraphs``), then drops rows whose
  answer ends at or beyond ``max_length`` tokens.

  Args:
    model_name: Checkpoint name passed to ``DebertaTokenizer.from_pretrained``.
    max_length: Maximum number of paragraph tokens an answer may end within.

  Returns:
    Tuple ``(train_dataset, validation_dataset)`` of pandas DataFrames.
  """
  tokenizer = DebertaTokenizer.from_pretrained(model_name)

  train_dataset = _add_answer_token_indices(load_dataset('train.jsonl'), tokenizer, max_length)
  validation_dataset = _add_answer_token_indices(load_dataset('validation.jsonl'), tokenizer, max_length)

  return train_dataset, validation_dataset


In [None]:
# Load data
train_dataset, validation_dataset = data_preprocess()

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


In [None]:
validation_dataset["label"].value_counts()

phrase     324
passage    286
Name: label, dtype: int64

In [None]:
class ClickbaitSpoilerDataset(Dataset):
    """Torch dataset that turns (tweet, paragraph, answer span) rows into QA model inputs.

    Args:
        clickbait_tweets: Indexable collection of tweet strings (first sequence).
        target_paragraphs: Indexable collection of paragraph strings (second sequence).
        answer_start_indices: Token index of the answer start, counted from the
            first token of the paragraph alone.
        answer_end_indices: Token index of the answer end, counted the same way.
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` tensors of shape (1, seq_len).
        max_length: Maximum encoded length; longer pairs are truncated.
    """

    def __init__(self, clickbait_tweets, target_paragraphs, answer_start_indices, answer_end_indices, tokenizer,
                 max_length=512):
        self.clickbait_tweets = clickbait_tweets
        self.target_paragraphs = target_paragraphs
        self.answer_start_indices = answer_start_indices
        self.answer_end_indices = answer_end_indices
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples (one per tweet)."""
        return len(self.clickbait_tweets)

    def __getitem__(self, idx):
        """Encode example ``idx``.

        Returns:
            Tuple ``(input_ids, token_type_ids, attention_mask, start_positions,
            end_positions)``. The first three are 1-D tensors; the positions are
            0-D tensors holding indices into ``input_ids``.
        """
        clickbait_tweet = self.clickbait_tweets[idx]
        target_paragraph = self.target_paragraphs[idx]
        answer_start_idx = self.answer_start_indices[idx]
        answer_end_idx = self.answer_end_indices[idx]

        inputs = self.tokenizer.encode_plus(
            clickbait_tweet,
            target_paragraph,
            add_special_tokens=True,
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
        )

        # squeeze(0) removes only the batch axis, so a length-1 sequence stays 1-D.
        input_ids = inputs['input_ids'].squeeze(0)
        token_type_ids = inputs['token_type_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        # The stored answer indices are relative to the paragraph, but the
        # encoded input starts with the special tokens and the tweet. Shift the
        # labels by the position where the second sequence begins.
        # Assumes the tokenizer marks second-sequence tokens with
        # token_type_id 1 (BERT/DeBERTa style) -- TODO confirm for other tokenizers.
        second_sequence = torch.nonzero(token_type_ids == 1).flatten()
        paragraph_offset = int(second_sequence[0]) if second_sequence.numel() > 0 else 0

        start_positions = torch.tensor(int(answer_start_idx) + paragraph_offset)
        end_positions = torch.tensor(int(answer_end_idx) + paragraph_offset)

        return input_ids, token_type_ids, attention_mask, start_positions, end_positions

In [None]:
def train(model, train_dataloader, optimizer, device, epoch):
    """Run one training epoch and report running loss / exact-match accuracy.

    Each batch is moved to ``device`` and fed to ``model`` together with the
    gold start/end positions. ``outputs[0]`` is used as the loss and
    ``outputs[1]`` / ``outputs[2]`` as the start / end logits. A prediction is
    counted as correct only when both the start and the end index match.

    Args:
        model: Model called with ``input_ids``, ``token_type_ids``,
            ``attention_mask``, ``start_positions`` and ``end_positions``.
        train_dataloader: Iterable of 5-tuples of tensors that supports ``len``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, used only in the summary line.

    Returns:
        Tuple ``(avg_loss, acc)``: mean batch loss and accuracy in percent.
    """
    model.train()
    running_loss = 0
    exact_matches = 0
    seen = 0

    for step, batch in enumerate(train_dataloader):
        input_ids, token_type_ids, attention_mask, start_positions, end_positions = (
            tensor.to(device) for tensor in batch
        )

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )

        loss = outputs[0]
        running_loss += loss.item()

        # Highest-scoring token index per example for each end of the span.
        predicted_start = torch.max(outputs[1], dim=1).indices
        predicted_end = torch.max(outputs[2], dim=1).indices
        span_is_right = (predicted_start == start_positions) & (predicted_end == end_positions)
        exact_matches += span_is_right.sum().item()
        seen += start_positions.size(0)

        loss.backward()
        optimizer.step()

        # Progress line every 100 batches (including the very first one).
        if step % 100 == 0:
            acc = 100.0 * exact_matches / seen
            avg_loss = running_loss / (step + 1)
            print(f"Step [{step}/{len(train_dataloader)}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    avg_loss = running_loss / len(train_dataloader)
    acc = 100.0 * exact_matches / seen
    print(f"Epoch [{epoch}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    return avg_loss, acc

In [None]:
def eval(model, dataloader, optimizer, tokenizer, device, epoch,
         results_path='/content/gdrive/My Drive/DeBERTa_baseline_results_eval.csv'):
    """Evaluate a question-answering model and log per-example predictions.

    For every example the predicted and the gold span are decoded from
    ``input_ids[start:end + 1]`` and compared with sentence BLEU, METEOR and
    BERTScore F1. Exact-match accuracy requires both span ends to be correct.
    One row per example is appended to ``results_path`` (no header), written
    once after the loop.

    Args:
        model: Model returning ``loss``, ``start_logits`` and ``end_logits``
            when called with inputs and gold positions.
        dataloader: Iterable of 5-tuples of tensors that supports ``len``.
        optimizer: Unused; kept so existing calls keep working.
        tokenizer: Tokenizer used to decode spans and to tokenize them for
            BLEU / METEOR.
        device: Device the batch tensors are moved to.
        epoch: Epoch number stored in every logged row.
        results_path: CSV file the per-example rows are appended to.

    Returns:
        Tuple ``(avg_loss, acc, blue_score, meteor_score)``; accuracy is in
        percent, the other scores are averaged over all examples.
    """
    logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
    warnings.filterwarnings('ignore')
    model.eval()

    result_columns = ['epoch', 'target', 'predicted', 'original',
                      'start_pred', 'start_ori', 'end_preds', 'end_ori']
    rows = []
    answers = []
    references = []
    eval_loss = 0.0
    total_correct = 0
    total_samples = 0
    blue_score = 0.0
    meteor_score = 0.0

    with torch.no_grad():
        for batch in dataloader:
            input_ids, token_type_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]

            # Passing the gold positions makes the model return the loss, so
            # the reported eval loss is no longer a constant 0.
            outputs = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )
            eval_loss += outputs.loss.item()

            start_preds = outputs.start_logits.argmax(dim=1)
            end_preds = outputs.end_logits.argmax(dim=1)

            per_example = zip(input_ids.tolist(), start_preds.tolist(), end_preds.tolist(),
                              start_positions.tolist(), end_positions.tolist())
            for token_ids, start_pred, end_pred, start_ori, end_ori in per_example:
                answer = tokenizer.decode(token_ids[start_pred:end_pred + 1], skip_special_tokens=True).strip()
                reference = tokenizer.decode(token_ids[start_ori:end_ori + 1], skip_special_tokens=True).strip()

                rows.append({'epoch': epoch,
                             'target': tokenizer.decode(token_ids, skip_special_tokens=True),
                             'predicted': answer,
                             'original': reference,
                             'start_pred': start_pred,
                             'start_ori': start_ori,
                             'end_preds': end_pred,
                             'end_ori': end_ori})
                answers.append(answer)
                references.append(reference)

                reference_tokens = tokenizer.tokenize(reference)
                answer_tokens = tokenizer.tokenize(answer)
                blue_score += nltk.translate.bleu_score.sentence_bleu([reference_tokens], answer_tokens)
                meteor_score += nltk.translate.meteor_score.meteor_score([reference_tokens], answer_tokens)

            total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
            total_samples += start_positions.size(0)

    # Append this run's rows once. Writing the growing frame after every
    # example duplicated earlier rows in the file.
    if rows:
        pd.DataFrame(rows, columns=result_columns).to_csv(results_path, mode='a', header=False, index=False)

    num_batches = len(dataloader)
    avg_loss = eval_loss / num_batches if num_batches else 0.0

    if total_samples:
        acc = 100.0 * total_correct / total_samples
        blue_score /= total_samples
        meteor_score /= total_samples
        # One batched BERTScore call; the mean F1 equals the former average of
        # per-example F1 values without re-running the scorer for every example.
        _, _, f1 = bert_score.score(answers, references, lang="en", model_type='bert-base-uncased')
        f1_score = f1.mean().item()
    else:
        acc = 0.0
        f1_score = 0.0

    print(f"Eval Loss: {avg_loss:.4f}, Eval Acc: {acc:.2f}%, BLUE Score: {blue_score:.4f}, METEOR Score: {meteor_score:.4f}, F1 Score: {f1_score:.4f}")

    return avg_loss, acc, blue_score, meteor_score

In [None]:
train_dataset["label"].value_counts()

phrase     1317
passage    1132
Name: label, dtype: int64

In [None]:
# Pull the columns the Dataset needs out of the preprocessed training frame.
clickbait_tweets = train_dataset['clickbait_tweet']
target_paragraphs = train_dataset['target_paragraphs']
answer_start_indices = train_dataset['ans_start_token_idx']
answer_end_indices = train_dataset['ans_end_token_idx']

# Same columns for the validation frame.
val_clickbait_tweets = validation_dataset['clickbait_tweet']
val_target_paragraphs = validation_dataset['target_paragraphs']
val_answer_start_indices = validation_dataset['ans_start_token_idx']
val_answer_end_indices = validation_dataset['ans_end_token_idx']

# Initialize tokenizer and model
# NOTE(review): `truncation` is normally passed when encoding text, not to
# from_pretrained; confirm that this keyword has any effect here.
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base',truncation = True)
model = DebertaForQuestionAnswering.from_pretrained('microsoft/deberta-base')

# Create dataset and dataloader
# Both Dataset objects share the module-level tokenizer defined above.
train_dataset_object = ClickbaitSpoilerDataset(clickbait_tweets, target_paragraphs, answer_start_indices, answer_end_indices, tokenizer)
val_dataset_object = ClickbaitSpoilerDataset(val_clickbait_tweets, val_target_paragraphs, val_answer_start_indices, val_answer_end_indices, tokenizer)

def collate_fn(batch, pad_token_id=None):
  """Pad a list of dataset items into batch tensors.

  Args:
    batch: List of 5-tuples ``(input_ids, token_type_ids, attention_mask,
      start_position, end_position)`` as produced by ClickbaitSpoilerDataset.
    pad_token_id: Id used to pad ``input_ids``. Defaults to the module-level
      ``tokenizer.pad_token_id``.

  Returns:
    Tuple of five tensors: the first three are padded to the longest sequence
    in the batch (batch first), the last two are 1-D position tensors.
  """
  if pad_token_id is None:
    pad_token_id = tokenizer.pad_token_id

  input_ids = pad_sequence([item[0] for item in batch], batch_first=True, padding_value=pad_token_id)
  # Segment ids and the attention mask must be 0 on padding regardless of the
  # tokenizer's pad id; a non-zero pad id would otherwise mark padding as
  # attended tokens.
  token_type_ids = pad_sequence([item[1] for item in batch], batch_first=True, padding_value=0)
  attention_mask = pad_sequence([item[2] for item in batch], batch_first=True, padding_value=0)

  # Stack instead of torch.tensor(...) to avoid re-wrapping existing tensors
  # (which triggers a copy-construct UserWarning).
  start_positions = torch.stack([torch.as_tensor(item[3]) for item in batch])
  end_positions = torch.stack([torch.as_tensor(item[4]) for item in batch])

  return input_ids, token_type_ids, attention_mask, start_positions, end_positions

# Shuffle only the training batches. Validation keeps dataset order so the
# per-example rows written during evaluation are reproducible between runs.
train_dataloader = DataLoader(train_dataset_object, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset_object, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

Downloading pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForQuestionAnswering: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['

In [None]:
# Fine-tune for five epochs, then evaluate once on the validation set.
# The logger level only needs to be set once; it stays in effect for every epoch.
logging.getLogger("transformers").setLevel(logging.ERROR)
for epoch in range(5):
    train(model, train_dataloader, optimizer, device, epoch)

# `epoch` still holds the index of the last training epoch here.
eval(model, val_dataloader, optimizer, tokenizer, device, epoch)


  return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), torch.tensor(start_positions), torch.tensor(end_positions)


Step [0/307], Train Loss: 6.0807, Train Acc: 0.00%
Step [100/307], Train Loss: 5.6531, Train Acc: 0.12%
Step [200/307], Train Loss: 5.4248, Train Acc: 0.68%
Step [300/307], Train Loss: 5.2538, Train Acc: 0.71%
Epoch [0], Train Loss: 5.2514, Train Acc: 0.73%
Step [0/307], Train Loss: 4.2917, Train Acc: 0.00%
Step [100/307], Train Loss: 4.4078, Train Acc: 1.61%
Step [200/307], Train Loss: 4.3624, Train Acc: 1.68%
Step [300/307], Train Loss: 4.3438, Train Acc: 1.70%
Epoch [1], Train Loss: 4.3372, Train Acc: 1.67%
Step [0/307], Train Loss: 3.2518, Train Acc: 0.00%
Step [100/307], Train Loss: 3.4757, Train Acc: 2.60%
Step [200/307], Train Loss: 3.4795, Train Acc: 2.49%
Step [300/307], Train Loss: 3.4668, Train Acc: 2.66%
Epoch [2], Train Loss: 3.4724, Train Acc: 2.61%
Step [0/307], Train Loss: 3.2099, Train Acc: 0.00%
Step [100/307], Train Loss: 2.8274, Train Acc: 5.69%
Step [200/307], Train Loss: 2.7999, Train Acc: 6.53%
Step [300/307], Train Loss: 2.7803, Train Acc: 6.15%
Epoch [3], Train

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]



1 5
2 5




3 5
4 5




5 5




6 5




7 5
8 5




9 5




10 5




11 5




12 5




13 5
14 5
15 5




16 5




17 5




18 5




19 5
20 5




21 5




22 5




23 5




24 5




25 5




26 5
27 5
28 5




29 5




30 5




31 5




32 5




33 5
34 5
35 5




36 5




37 5
38 5




39 5




40 5
41 5
42 5
43 5




44 5




45 5




46 5




47 5




48 5
49 5
50 5
51 5
52 5
53 5




54 5




55 5




56 5
57 5




58 5
59 5
60 5




61 5
62 5




63 5
64 5




65 5




66 5
67 5
68 5




69 5




70 5
71 5
72 5




73 5
74 5




75 5
76 5
Eval Loss: 0.0000, Eval Acc: 2.79%, BLUE Score: 0.1392, METEOR Score: 0.2718, F1 Score: 0.4772


In [None]:
# Save model
model.save_pretrained('/content/gdrive/My Drive/DeBERTa_baseline_spoiler_model')

In [None]:
##TESTING

# Define the clickbait tweet and target paragraph
clickbait_tweet = "Long-Term Marijuana Use Has One Crazy Side Effect, New Study Says"
target_paragraph = ['Marijuana has long been touted for being virtually side-effect free. Now, according to one new study, long-term marijuana use may have one negative caveat: gum disease.',
 'After analyzing about 1,000 cannabis users in New Zealand, researchers found that those who smoked pot for 20 or more years had few health problems — with the exception of gum disease.',
 'Lead researcher Madeline Meier, an assistant professor of psychology at Arizona State University, said of the findings,',
 'Unlike tobacco smoking, cannabis smoking is associated with few physical health problems in midlife, with the exception of periodontal disease... Our analyses show that this association was not explained by tobacco smoking, alcohol abuse or less tooth brushing and flossing.',
 'While the study doesn’t prove cannabis can cause gum disease, it does shed some light on the pros and cons that come with regular use, specifically smoking. A co-author of the study and professor of psychology at Duke University, Avshalom Caspi, said,',
 'What we’re seeing is that cannabis may be harmful in some respects, but possibly not in every way. We need to recognize that heavy recreational cannabis use does have some adverse consequences, but overall damage to physical health is not apparent in this study.',
 'While smoking weed every day for multiple decades surely has some negative side effects, using alternative methods like vaporizers and edibles could potentially mitigate some of those consequences. So don’t panic just yet, bud buddies. When it comes to cannabis, there’s a big, wide world out there with countless options still left to explore.']

target_paragraph = " ".join(target_paragraph)

# Tokenize the input text
clickbait_tokens = tokenizer.tokenize(clickbait_tweet)
target_tokens = tokenizer.tokenize(target_paragraph)

# Find the corresponding token indexes for the answer span
# (reference values only; they are not fed to the model below)
answer_start_char = 42
answer_end_char = 58
answer_start_token = len(tokenizer.encode(target_paragraph[:answer_start_char], add_special_tokens=False))
answer_end_token = len(tokenizer.encode(target_paragraph[:answer_end_char], add_special_tokens=False)) - 1

# Truncate the paragraph so the full sequence, including the three special
# tokens added below, never exceeds the maximum length.
max_length = 512
num_special_tokens = 3
target_tokens = target_tokens[:max(0, max_length - len(clickbait_tokens) - num_special_tokens)]

# Concatenate the clickbait and target tokens, and add special tokens
# (taken from the tokenizer instead of hard-coded literals)
input_tokens = [tokenizer.cls_token] + clickbait_tokens + [tokenizer.sep_token] + target_tokens + [tokenizer.sep_token]
segment_ids = [0] * (len(clickbait_tokens) + 2) + [1] * (len(target_tokens) + 1)

# Convert the tokens to their corresponding IDs
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)

# Create an attention mask with "1" for real tokens and "0" for padding tokens
attention_mask = [1] * len(input_ids)

# Pad the input if its length is less than the maximum length
padding_length = max_length - len(input_ids)
if padding_length > 0:
    input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
    attention_mask = attention_mask + [0] * padding_length
    segment_ids = segment_ids + [0] * padding_length

# Convert the input to tensors on the same device as the model
model_device = next(model.parameters()).device
input_ids = torch.tensor(input_ids).unsqueeze(0).to(model_device)
attention_mask = torch.tensor(attention_mask).unsqueeze(0).to(model_device)
segment_ids = torch.tensor(segment_ids).unsqueeze(0).to(model_device)

# Make the forward pass to get the start and end logits.
# The model returns an output object; unpacking it directly would yield its
# field names rather than the tensors, so read the fields by attribute.
model.eval()
with torch.no_grad():
    outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=attention_mask)
start_logits, end_logits = outputs.start_logits, outputs.end_logits

# Get the predicted answer
# NOTE(review): training paragraphs are prefixed with label + title in
# load_dataset; this example passes the bare paragraphs -- confirm intended.
start_idx = int(torch.argmax(start_logits))
end_idx = int(torch.argmax(end_logits))
answer_tokens = input_ids[0][start_idx:end_idx+1]
answer = tokenizer.decode(answer_tokens)

print(f"Predicted answer: {answer}")