In [1]:
!pip install transformers
!pip install torch
! pip install bert_score
! pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
from transformers import DebertaTokenizer, DebertaForQuestionAnswering

from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from bert_score import score
import nltk
nltk.download('wordnet')
from evaluate import load
bertscore = load("bertscore")
import bert_score
import pandas as pd
import logging
import warnings
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.corpus import wordnet
import json

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [14]:
def data_preprocess():

  train_dataset = pd.read_csv("/content/processed_training_dataset.csv")
  validation_dataset = pd.read_csv("/content/processed_validation_dataset.csv")

  train_dataset = train_dataset[train_dataset["label"]=='phrase']
  validation_dataset = validation_dataset[validation_dataset["label"]=='phrase']
  
  train_dataset['ans_start_idx'] = train_dataset.apply(lambda row: row['target_paragraphs'].index(row['spoiler']), axis=1)
  train_dataset['ans_end_idx'] = train_dataset.apply(lambda row : row['ans_start_idx'] + len(row['spoiler']), axis=1)
  
  validation_dataset['ans_start_idx'] = validation_dataset.apply(lambda row: row['target_paragraphs'].index(row['spoiler']), axis=1)
  validation_dataset['ans_end_idx'] = validation_dataset.apply(lambda row : row['ans_start_idx'] + len(row['spoiler']), axis=1)

  tokenizer = DebertaTokenizer.from_pretrained('Palak/microsoft_deberta-base_squad')
  train_dataset['ans_start_token_idx'] = train_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_start_idx']], add_special_tokens=False)), axis=1)
  train_dataset['ans_end_token_idx'] = train_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_end_idx']], add_special_tokens=False)), axis=1)

  validation_dataset['ans_start_token_idx'] = validation_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_start_idx']], add_special_tokens=False)), axis=1)
  validation_dataset['ans_end_token_idx'] = validation_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_end_idx']], add_special_tokens=False)), axis=1)


  train_dataset = train_dataset[train_dataset['ans_end_token_idx']<513]
  train_dataset.reset_index(inplace=True)
  validation_dataset = validation_dataset[validation_dataset['ans_end_token_idx']<513]
  validation_dataset.reset_index(inplace=True)

  return train_dataset, validation_dataset


In [7]:
class ClickbaitSpoilerDataset(Dataset):
    def __init__(self, target_paragraphs, answer_start_indices, answer_end_indices, tokenizer):
        self.target_paragraphs = target_paragraphs
        self.answer_start_indices = answer_start_indices
        self.answer_end_indices = answer_end_indices
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.target_paragraphs)

    def __getitem__(self, idx):
        target_paragraph = self.target_paragraphs[idx]
        answer_start_idx = self.answer_start_indices[idx]
        answer_end_idx = self.answer_end_indices[idx]

        inputs = self.tokenizer.encode_plus(
            target_paragraph,
            add_special_tokens=True,
            return_tensors='pt',
            max_length=512,
            truncation=True        )

        input_ids = inputs['input_ids'].squeeze()
        token_type_ids = inputs['token_type_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()

        start_positions = torch.tensor(answer_start_idx)
        end_positions = torch.tensor(answer_end_idx)

        return input_ids,token_type_ids, attention_mask, start_positions, end_positions

In [8]:
def train(model, train_dataloader, optimizer, device, epoch):
    model.train()
    train_loss = 0
    total_correct = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader):
        input_ids,token_type_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )
        loss = outputs[0]
        train_loss += loss.item()

        start_preds, end_preds = outputs[1], outputs[2]
        _, start_preds = start_preds.max(dim=1)
        _, end_preds = end_preds.max(dim=1)

        for i in range(len(input_ids)):
          start_pred = start_preds[i].item()
          end_pred = end_preds[i].item()
          if start_pred > end_pred:
            # Penalize the model with a higher loss if the start index is greater than the end index
            additional_loss = torch.tensor(2).to(device)
            loss += additional_loss

        total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
        total_samples += start_positions.size(0)

        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            acc = 100.0 * total_correct / total_samples
            avg_loss = train_loss / (step + 1)
            print(f"Step [{step}/{len(train_dataloader)}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    avg_loss = train_loss / len(train_dataloader)
    acc = 100.0 * total_correct / total_samples
    print(f"Epoch [{epoch}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    return avg_loss, acc

In [9]:

def eval(model, dataloader, tokenizer, device, epoch, results):
    logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
    warnings.filterwarnings('ignore')
    model.eval()
    eval_loss = 0
    total_correct = 0
    total_samples = 0
    bleu_score = 0
    meteor_score = 0
    f1_score = 0

    with torch.no_grad():
        for step,batch in enumerate(dataloader):
            input_ids, token_type_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]
            print(step, len(batch))
            print(input_ids, input_ids.shape)
            outputs = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask
            )

            start_logits, end_logits = outputs.start_logits, outputs.end_logits
            start_preds, end_preds = start_logits.argmax(dim=1), end_logits.argmax(dim=1)

            for i in range(len(input_ids)):
                input_id = input_ids[i].tolist()
                start_pred = start_preds[i].item()
                end_pred = end_preds[i].item()

                answer = tokenizer.decode(input_id[start_pred:end_pred+1], skip_special_tokens=True)
                answer = str(answer.strip())

                reference = tokenizer.decode(input_id[start_positions[i]:end_positions[i]+1], skip_special_tokens=True)
                reference = str(reference.strip())

                current_statement_bleu = nltk.translate.bleu_score.sentence_bleu([reference], answer)
                bleu_score += current_statement_bleu

                current_statement_meteor = nltk.translate.meteor_score.meteor_score([tokenizer.tokenize(reference)], tokenizer.tokenize(answer))
                meteor_score += current_statement_meteor

                precision, recall, f1 = bert_score.score([answer], [reference], lang="en",model_type='bert-base-uncased')
                f1_score += f1


                new_row = {'epoch': epoch,
                           'target': tokenizer.decode(input_id,skip_special_tokens=True),
                           'predicted': answer,
                           'original' : reference,
                           'start_pred' : start_pred,
                           'start_ori' : start_positions[i].item(),
                           'end_preds' : end_pred,
                           'end_ori' : end_positions[i].item(),
                           'bleu_score' : current_statement_bleu,
                           'meteor_score' : current_statement_meteor,
                           'f1_score' : f1}

                results = results.append(new_row, ignore_index = True)


            total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
            total_samples += start_positions.size(0)

    avg_loss = eval_loss / len(dataloader)
    acc = 100.0 * total_correct / total_samples
    bleu_score /= total_samples
    meteor_score /= total_samples
    f1_score = f1_score.item()/ total_samples

    print(f"Eval Loss: {avg_loss:.4f}, Eval Acc: {acc:.2f}%, bleu Score: {bleu_score:.4f}, METEOR Score: {meteor_score:.4f}, F1 Score: {f1_score:.4f}")

    return results

In [15]:
train_dataset, validation_dataset = data_preprocess()


Token indices sequence length is longer than the specified maximum sequence length for this model (1568 > 512). Running this sequence through the model will result in indexing errors


In [10]:
# Load data
train_dataset, validation_dataset = data_preprocess()

target_paragraphs = train_dataset['target_paragraphs']
answer_start_indices = train_dataset['ans_start_token_idx']
answer_end_indices = train_dataset['ans_end_token_idx']

val_target_paragraphs = validation_dataset['target_paragraphs']
val_answer_start_indices = validation_dataset['ans_start_token_idx']
val_answer_end_indices = validation_dataset['ans_end_token_idx']

# Initialize tokenizer and model
tokenizer = DebertaTokenizer.from_pretrained('Palak/microsoft/deberta-base',truncation = True)
model = DebertaForQuestionAnswering.from_pretrained('Palak/mmicrosoft/deberta-base')

# Create dataset and dataloader
train_dataset = ClickbaitSpoilerDataset(target_paragraphs, answer_start_indices, answer_end_indices, tokenizer)
val_dataset = ClickbaitSpoilerDataset(val_target_paragraphs, val_answer_start_indices, val_answer_end_indices, tokenizer)
print(val_dataset)

def collate_fn(batch):
  input_ids = [item[0] for item in batch]
  token_type_ids = [item[1] for item in batch]
  attention_mask = [item[2] for item in batch]
  start_positions = [item[3] for item in batch]
  end_positions = [item[4] for item in batch]

  input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
  token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
  attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=tokenizer.pad_token_id)

  return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), torch.tensor(start_positions), torch.tensor(end_positions)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True,collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True,collate_fn=collate_fn)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

Token indices sequence length is longer than the specified maximum sequence length for this model (1568 > 512). Running this sequence through the model will result in indexing errors


<__main__.ClickbaitSpoilerDataset object at 0x7fc61421bf40>


In [18]:
train_dataset["target_paragraphs"].iloc[0]

'Question - nasa sets date for full recovery of ozone hole. National_Aeronautics_and_Space_Administration sets date for full_moon recovery of ozone hole\nQuestion_Sentiment - Positive\nArticle_Keyword - \nArticle_Title - hole in ozone layer expected to make full recovery by 2070: nasa\nArticle - 2070 is shaping up to be a great year for mother earth thats when nasa scientists are predicting the hole in the ozone layer might finally make a full recovery researchers announced their conclusion, in addition to other findings, in a presentation wednesday during the annual american geophysical union meeting in san francisco the team of scientists specifically looked at the chemical composition of the ozone hole, which has shifted in both size and depth since the passing of the montreal protocol in 1987 the agreement banned its 197 signatory countries from using chemicals, like chlorofluorocarbons (cfcs), that break down into chlorine in the upper atmosphere and harm the ozone layer they foun

In [None]:
# Train model
for epoch in range(5):
    logging.getLogger("transformers").setLevel(logging.ERROR)
    train(model, train_dataloader, optimizer, device,epoch)


results = pd.DataFrame(columns=['epoch','target', 'predicted','original','start_pred','start_ori','end_preds','end_ori','bleu_score','meteor_score','f1_score'])
results = eval(model, val_dataloader, tokenizer, device,1, results)

  return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), torch.tensor(start_positions), torch.tensor(end_positions)


Step [0/164], Train Loss: 6.0911, Train Acc: 0.00%
Step [100/164], Train Loss: 2.2725, Train Acc: 37.38%
Epoch [0], Train Loss: 2.0841, Train Acc: 40.08%
Step [0/164], Train Loss: 1.2874, Train Acc: 50.00%
Step [100/164], Train Loss: 0.8566, Train Acc: 66.71%
Epoch [1], Train Loss: 0.8932, Train Acc: 64.73%
Step [0/164], Train Loss: 0.3190, Train Acc: 87.50%
Step [100/164], Train Loss: 0.3984, Train Acc: 80.07%
Epoch [2], Train Loss: 0.4078, Train Acc: 78.63%
Step [0/164], Train Loss: 0.0414, Train Acc: 100.00%
Step [100/164], Train Loss: 0.1853, Train Acc: 89.73%
Epoch [3], Train Loss: 0.1984, Train Acc: 89.77%
Step [0/164], Train Loss: 0.1229, Train Acc: 87.50%
Step [100/164], Train Loss: 0.1221, Train Acc: 93.81%
Epoch [4], Train Loss: 0.1356, Train Acc: 93.74%
0 5


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

1 5
2 5




3 5
4 5
5 5




6 5
7 5




8 5
9 5
10 5
11 5
12 5
13 5
14 5
15 5
16 5
17 5
18 5
19 5
20 5
21 5
22 5
23 5




24 5
25 5
26 5
27 5
28 5




29 5
30 5




31 5




32 5
33 5
34 5




35 5
36 5
37 5




38 5
39 5
40 5
Eval Loss: 0.0000, Eval Acc: 52.94%, BLUE Score: 0.6484, METEOR Score: 0.6190, F1 Score: 0.7822


In [None]:
results.to_csv('/content/gdrive/My Drive/DeBerta_novel_phrase.csv', mode='a', header=False, index=False)


# Save model
model.save_pretrained('/content/gdrive/My Drive/DeBerta_novel_phrase_model')