In [None]:
!pip install transformers
!pip install torch
! pip install bert_score
! pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

In [None]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
from transformers import DebertaTokenizer, DebertaForQuestionAnswering

from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from bert_score import score
import nltk
nltk.download('wordnet')
from evaluate import load
bertscore = load("bertscore")
import bert_score
import pandas as pd
import logging
import warnings
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.corpus import wordnet
import json

[nltk_data] Downloading package wordnet to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [None]:
# Mount Google Drive at /content/gdrive; the final cell writes the results CSV
# and the fine-tuned model under '/content/gdrive/My Drive/'.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
def _load_split_with_answer_spans(csv_path, tokenizer, max_answer_end_token_idx):
  """Read one CSV split and annotate every row with its answer span.

  Adds the columns `ans_start_idx` / `ans_end_idx` (character offsets of the
  spoiler inside `target_paragraphs`, end exclusive) and `ans_start_token_idx`
  / `ans_end_token_idx` (number of tokens in the paragraph prefix that ends at
  the respective character offset). Rows whose `ans_end_token_idx` exceeds
  `max_answer_end_token_idx` are dropped, and the index is reset (the previous
  index is kept as an `index` column).

  Raises:
    ValueError: if a spoiler does not occur verbatim in its paragraph.
  """
  frame = pd.read_csv(csv_path)
  paragraphs = frame['target_paragraphs']
  spoilers = frame['spoiler']

  # Character span of the first verbatim occurrence of the spoiler.
  start_chars = []
  for paragraph, spoiler in zip(paragraphs, spoilers):
    try:
      start_chars.append(paragraph.index(spoiler))
    except ValueError as err:
      raise ValueError(
          f"{csv_path}: spoiler {spoiler!r} does not occur verbatim in its target paragraph"
      ) from err
  frame['ans_start_idx'] = start_chars
  frame['ans_end_idx'] = [start + len(spoiler) for start, spoiler in zip(start_chars, spoilers)]

  def count_prefix_tokens(paragraph, char_end):
    return len(tokenizer.encode(paragraph[:char_end], add_special_tokens=False))

  # NOTE(review): these counts are taken WITHOUT special tokens, whereas
  # ClickbaitSpoilerDataset encodes paragraphs with add_special_tokens=True.
  # Confirm the resulting positions line up with the intended answer tokens.
  frame['ans_start_token_idx'] = [
      count_prefix_tokens(paragraph, char_end)
      for paragraph, char_end in zip(paragraphs, frame['ans_start_idx'])
  ]
  frame['ans_end_token_idx'] = [
      count_prefix_tokens(paragraph, char_end)
      for paragraph, char_end in zip(paragraphs, frame['ans_end_idx'])
  ]

  kept = frame[frame['ans_end_token_idx'] <= max_answer_end_token_idx]
  # Non-inplace reset: `kept` is a filtered view of `frame`.
  return kept.reset_index()


def data_preprocess(
    train_path="/content/processed_training_dataset.csv",
    validation_path="/content/processed_validation_dataset.csv",
    tokenizer_name='csarron/roberta-base-squad-v1',
    max_answer_end_token_idx=512,
):
  """Load the training and validation CSVs and compute answer-span labels.

  Both files must provide the columns `target_paragraphs` and `spoiler`.
  The same processing is applied to both splits (see
  `_load_split_with_answer_spans`).

  Args:
    train_path: CSV file with the training split.
    validation_path: CSV file with the validation split.
    tokenizer_name: checkpoint passed to `RobertaTokenizer.from_pretrained`.
    max_answer_end_token_idx: rows whose `ans_end_token_idx` is larger than
      this value are discarded.

  Returns:
    (train_dataset, validation_dataset) as pandas DataFrames.
  """
  tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)

  train_dataset = _load_split_with_answer_spans(train_path, tokenizer, max_answer_end_token_idx)
  validation_dataset = _load_split_with_answer_spans(validation_path, tokenizer, max_answer_end_token_idx)

  return train_dataset, validation_dataset


In [None]:
class ClickbaitSpoilerDataset(Dataset):
    """Dataset of paragraphs paired with the token span of their spoiler.

    Each item is the tuple
    `(input_ids, attention_mask, start_positions, end_positions)` where the
    first two are 1-D tensors produced by the tokenizer (truncated to 512
    tokens, special tokens included) and the last two are 0-dim tensors holding
    the answer start / end token indices supplied at construction time.
    """

    def __init__(self, target_paragraphs, answer_start_indices, answer_end_indices, tokenizer):
        """Store the samples.

        Args:
            target_paragraphs: iterable of paragraph strings.
            answer_start_indices: iterable of answer start token indices.
            answer_end_indices: iterable of answer end token indices.
            tokenizer: object exposing `encode_plus(...)`.
        """
        # Materialise as lists so that __getitem__ is always positional.
        # A pandas Series is indexed by label, which raises KeyError for any
        # Series whose index is not exactly 0..n-1 (e.g. after filtering rows).
        self.target_paragraphs = list(target_paragraphs)
        self.answer_start_indices = list(answer_start_indices)
        self.answer_end_indices = list(answer_end_indices)
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of samples."""
        return len(self.target_paragraphs)

    def __getitem__(self, idx):
        """Tokenize sample `idx` and return it together with its span labels."""
        target_paragraph = self.target_paragraphs[idx]
        answer_start_idx = self.answer_start_indices[idx]
        answer_end_idx = self.answer_end_indices[idx]

        inputs = self.tokenizer.encode_plus(
            target_paragraph,
            add_special_tokens=True,
            return_tensors='pt',
            max_length=512,
            truncation=True,
        )

        # Drop only the leading batch dimension added by return_tensors='pt';
        # a bare squeeze() would also collapse a length-1 sequence to 0-dim.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        start_positions = torch.tensor(answer_start_idx)
        end_positions = torch.tensor(answer_end_idx)

        return input_ids, attention_mask, start_positions, end_positions

In [None]:
def train(model, train_dataloader, optimizer, device, epoch):
    """Run one training epoch and report loss / exact-span accuracy.

    Args:
        model: question-answering model. It is called with `input_ids`,
            `attention_mask`, `start_positions` and `end_positions`, and its
            output must be indexable as `(loss, start_logits, end_logits)`.
        train_dataloader: sized iterable of batches; each batch is a sequence
            `(input_ids, attention_mask, start_positions, end_positions)`.
        optimizer: optimizer updating `model`'s parameters.
        device: device every batch tensor is moved to.
        epoch: epoch number, used only in the printed summary.

    Returns:
        `(avg_loss, acc)`: mean batch loss and the percentage of samples whose
        predicted start AND end index both equal the labels. Both are 0.0 when
        the dataloader is empty.
    """
    model.train()
    train_loss = 0.0
    total_correct = 0
    total_samples = 0
    num_batches = len(train_dataloader)

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )
        loss = outputs[0]
        train_loss += loss.item()

        # An earlier version added a constant 2 to `loss` for every sample
        # predicted with start > end. A constant has zero gradient, so it never
        # influenced the update (and it was added after the loss was logged);
        # it only cost two host syncs per sample. It has therefore been removed.
        # A real penalty would have to be a differentiable function of the logits.
        loss.backward()
        optimizer.step()

        start_preds = outputs[1].argmax(dim=1)
        end_preds = outputs[2].argmax(dim=1)

        total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
        total_samples += start_positions.size(0)

        if step % 100 == 0:
            acc = 100.0 * total_correct / total_samples
            avg_loss = train_loss / (step + 1)
            print(f"Step [{step}/{len(train_dataloader)}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    # Guard the averages so an empty dataloader reports zeros instead of raising.
    avg_loss = train_loss / num_batches if num_batches else 0.0
    acc = 100.0 * total_correct / total_samples if total_samples else 0.0
    print(f"Epoch [{epoch}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    return avg_loss, acc

In [None]:

def eval(model, dataloader, optimizer, tokenizer, device, epoch, results):
    """Evaluate `model` on `dataloader` and append one row per sample to `results`.

    For every sample the predicted span (argmax of the start / end logits) and
    the labelled span are decoded back to text and compared with BLEU, METEOR
    and BERTScore F1. The summary line printed at the end reports the mean
    batch loss, exact-span accuracy and the mean of each text metric.

    Args:
        model: question-answering model; called with the span labels so that
            its output carries `loss`, `start_logits` and `end_logits`.
        dataloader: sized iterable of batches
            `(input_ids, attention_mask, start_positions, end_positions)`.
        optimizer: unused; kept so existing call sites keep working.
        tokenizer: tokenizer used to decode token ids and to tokenize the
            strings for METEOR.
        device: device every batch tensor is moved to.
        epoch: value stored in the `epoch` column of every new row.
        results: DataFrame the new rows are appended to (not modified in place).

    Returns:
        A new DataFrame: `results` followed by one row per evaluated sample.
        The `f1_score` column holds plain floats.
    """
    logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
    warnings.filterwarnings('ignore')
    model.eval()

    eval_loss = 0.0
    total_correct = 0
    total_samples = 0
    rows = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]

            # Pass the labels so the model also returns the loss; without them
            # the reported eval loss would always be 0.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )
            eval_loss += outputs.loss.item()

            start_preds = outputs.start_logits.argmax(dim=1)
            end_preds = outputs.end_logits.argmax(dim=1)

            # One host transfer per tensor instead of several .item() calls per sample.
            per_sample = zip(
                input_ids.tolist(),
                start_preds.tolist(),
                end_preds.tolist(),
                start_positions.tolist(),
                end_positions.tolist(),
            )
            for input_id, start_pred, end_pred, start_ori, end_ori in per_sample:
                answer = tokenizer.decode(input_id[start_pred:end_pred + 1], skip_special_tokens=True).strip()
                reference = tokenizer.decode(input_id[start_ori:end_ori + 1], skip_special_tokens=True).strip()

                # NOTE(review): both arguments are plain strings here, so the
                # n-grams are built over characters rather than words - confirm
                # that this is the intended BLEU variant.
                current_statement_bleu = nltk.translate.bleu_score.sentence_bleu([reference], answer)
                current_statement_meteor = nltk.translate.meteor_score.meteor_score(
                    [tokenizer.tokenize(reference)], tokenizer.tokenize(answer)
                )

                rows.append({'epoch': epoch,
                             'target': tokenizer.decode(input_id, skip_special_tokens=True),
                             'predicted': answer,
                             'original': reference,
                             'start_pred': start_pred,
                             'start_ori': start_ori,
                             'end_preds': end_pred,
                             'end_ori': end_ori,
                             'bleu_score': current_statement_bleu,
                             'meteor_score': current_statement_meteor})

            total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
            total_samples += start_positions.size(0)

    if rows:
        # Score every (prediction, reference) pair in ONE call instead of
        # invoking bert_score.score once per sample.
        _, _, f1_values = bert_score.score(
            [row['predicted'] for row in rows],
            [row['original'] for row in rows],
            lang="en", model_type='bert-base-uncased'
        )
        for row, f1 in zip(rows, f1_values.tolist()):
            row['f1_score'] = f1

        # DataFrame.append was removed in pandas 2.0 and copied the whole frame
        # on every call; build the new rows once and concatenate.
        results = pd.concat([results, pd.DataFrame(rows)], ignore_index=True)

    # Guard the averages so an empty dataloader reports zeros instead of raising.
    num_batches = len(dataloader)
    sample_count = max(total_samples, 1)
    avg_loss = eval_loss / num_batches if num_batches else 0.0
    acc = 100.0 * total_correct / sample_count
    bleu_score = sum(row['bleu_score'] for row in rows) / sample_count
    meteor_score = sum(row['meteor_score'] for row in rows) / sample_count
    f1_score = sum(row['f1_score'] for row in rows) / sample_count

    print(f"Eval Loss: {avg_loss:.4f}, Eval Acc: {acc:.2f}%, BLEU Score: {bleu_score:.4f}, METEOR Score: {meteor_score:.4f}, F1 Score: {f1_score:.4f}")

    return results

In [None]:
# Build the labelled training / validation frames.
train_dataset, validation_dataset = data_preprocess()

# Paragraph text plus the token-level answer span, per split.
target_paragraphs, answer_start_indices, answer_end_indices = (
    train_dataset[column]
    for column in ('target_paragraphs', 'ans_start_token_idx', 'ans_end_token_idx')
)
val_target_paragraphs, val_answer_start_indices, val_answer_end_indices = (
    validation_dataset[column]
    for column in ('target_paragraphs', 'ans_start_token_idx', 'ans_end_token_idx')
)

# Pretrained model and its matching tokenizer (same checkpoint for both).
model = RobertaForQuestionAnswering.from_pretrained('csarron/roberta-base-squad-v1')
tokenizer = RobertaTokenizer.from_pretrained('csarron/roberta-base-squad-v1',truncation = True)

# Wrap each split in a torch Dataset.
# Note: `train_dataset` is rebound here from the DataFrame to the Dataset.
val_dataset = ClickbaitSpoilerDataset(
    val_target_paragraphs,
    val_answer_start_indices,
    val_answer_end_indices,
    tokenizer,
)
train_dataset = ClickbaitSpoilerDataset(
    target_paragraphs,
    answer_start_indices,
    answer_end_indices,
    tokenizer,
)

def collate_fn(batch):
  """Pad a list of samples to a common length and stack them into a batch.

  Args:
    batch: list of `(input_ids, attention_mask, start_position, end_position)`
      tuples; the first two are 1-D tensors, the last two 0-dim tensors.

  Returns:
    `(input_ids, attention_mask, start_positions, end_positions)` where the
    first two have shape (batch, longest sequence) and the last two (batch,).
  """
  input_ids = [item[0] for item in batch]
  attention_mask = [item[1] for item in batch]
  start_positions = [item[2] for item in batch]
  end_positions = [item[3] for item in batch]

  input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
  # Padded positions must be masked OUT, i.e. 0. Using pad_token_id here is
  # wrong whenever that id is non-zero, because the padding would then be
  # marked as real input.
  attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)

  # pad_sequence already returns tensors; re-wrapping them in torch.tensor()
  # copies the data and emits a UserWarning. Stack the scalar labels instead.
  return input_ids, attention_mask, torch.stack(start_positions), torch.stack(end_positions)

# Both loaders share the same batching configuration.
loader_options = {'batch_size': 8, 'shuffle': True, 'collate_fn': collate_fn}
train_dataloader = DataLoader(train_dataset, **loader_options)
val_dataloader = DataLoader(val_dataset, **loader_options)

# Use the GPU when one is available, otherwise the CPU, and place the model on it.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [None]:
# Fine-tune for five epochs with transformers logging limited to errors.
transformers_logger = logging.getLogger("transformers")
for epoch in range(5):
    transformers_logger.setLevel(logging.ERROR)
    train(model, train_dataloader, optimizer, device, epoch)


# Evaluate once on the validation split, collecting one row per sample.
result_columns = [
    'epoch', 'target', 'predicted', 'original',
    'start_pred', 'start_ori', 'end_preds', 'end_ori',
    'bleu_score', 'meteor_score', 'f1_score',
]
results = pd.DataFrame(columns=result_columns)
results = eval(model, val_dataloader, optimizer, tokenizer, device, 1, results)


  return torch.tensor(input_ids),torch.tensor(attention_mask), torch.tensor(start_positions), torch.tensor(end_positions)


Step [0/164], Train Loss: 6.9501, Train Acc: 0.00%
Step [100/164], Train Loss: 2.4715, Train Acc: 33.17%
Epoch [0], Train Loss: 2.3041, Train Acc: 35.57%
Step [0/164], Train Loss: 1.0566, Train Acc: 50.00%
Step [100/164], Train Loss: 1.1585, Train Acc: 57.18%
Epoch [1], Train Loss: 1.1697, Train Acc: 57.02%
Step [0/164], Train Loss: 0.7023, Train Acc: 62.50%
Step [100/164], Train Loss: 0.5959, Train Acc: 72.15%
Epoch [2], Train Loss: 0.6023, Train Acc: 71.68%
Step [0/164], Train Loss: 0.2230, Train Acc: 100.00%
Step [100/164], Train Loss: 0.3792, Train Acc: 79.83%
Epoch [3], Train Loss: 0.3617, Train Acc: 80.53%
Step [0/164], Train Loss: 0.2110, Train Acc: 87.50%
Step [100/164], Train Loss: 0.2070, Train Acc: 90.59%
Epoch [4], Train Loss: 0.2073, Train Acc: 90.46%
0 4


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

1 4
2 4
3 4
4 4
5 4
6 4
7 4
8 4




9 4




10 4
11 4
12 4
13 4
14 4
15 4




16 4
17 4




18 4




19 4
20 4




21 4
22 4
23 4
24 4
25 4




26 4
27 4




28 4
29 4




30 4
31 4
32 4
33 4
34 4
35 4
36 4
37 4




38 4
39 4
40 4
Eval Loss: 0.0000, Eval Acc: 48.92%, BLUE Score: 0.6204, METEOR Score: 0.6019, F1 Score: 0.7643


In [None]:
import os

# Append this run's rows to the shared results file. The header is written only
# when the file is new (or empty); with an unconditional header=False a freshly
# created CSV would never get column names.
results_path = '/content/gdrive/My Drive/RoBerta_novel_phrase.csv'
write_header = not os.path.exists(results_path) or os.path.getsize(results_path) == 0
results.to_csv(results_path, mode='a', header=write_header, index=False)


# Save model
model.save_pretrained('/content/gdrive/My Drive/RoBerta_novel_phrase_model')

