In [1]:
!pip install transformers
!pip install torch
! pip install bert_score
! pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1
Looking in in

In [2]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import torch
from transformers import DebertaTokenizer, DebertaForQuestionAnswering

from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from bert_score import score
import nltk
nltk.download('wordnet')
from evaluate import load
bertscore = load("bertscore")
import bert_score
import pandas as pd
import logging
import warnings
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.corpus import wordnet
import json

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
def data_preprocess():

  train_dataset = pd.read_csv("/content/processed_training_dataset.csv")
  validation_dataset = pd.read_csv("/content/processed_validation_dataset.csv")
  
  train_dataset = train_dataset[train_dataset["label"]=='passage']
  validation_dataset = validation_dataset[validation_dataset["label"]=='passage']

  train_dataset['ans_start_idx'] = train_dataset.apply(lambda row: row['target_paragraphs'].index(row['spoiler']), axis=1)
  train_dataset['ans_end_idx'] = train_dataset.apply(lambda row : row['ans_start_idx'] + len(row['spoiler']), axis=1)
  
  validation_dataset['ans_start_idx'] = validation_dataset.apply(lambda row: row['target_paragraphs'].index(row['spoiler']), axis=1)
  validation_dataset['ans_end_idx'] = validation_dataset.apply(lambda row : row['ans_start_idx'] + len(row['spoiler']), axis=1)

  tokenizer = DebertaTokenizer.from_pretrained('Palak/microsoft_deberta-base_squad')
  train_dataset['ans_start_token_idx'] = train_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_start_idx']], add_special_tokens=False)), axis=1)
  train_dataset['ans_end_token_idx'] = train_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_end_idx']], add_special_tokens=False)), axis=1)

  validation_dataset['ans_start_token_idx'] = validation_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_start_idx']], add_special_tokens=False)), axis=1)
  validation_dataset['ans_end_token_idx'] = validation_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_end_idx']], add_special_tokens=False)), axis=1)


  train_dataset = train_dataset[train_dataset['ans_end_token_idx']<513]
  train_dataset.reset_index(inplace=True)
  validation_dataset = validation_dataset[validation_dataset['ans_end_token_idx']<513]
  validation_dataset.reset_index(inplace=True)

  return train_dataset, validation_dataset


In [6]:
class ClickbaitSpoilerDataset(Dataset):
    def __init__(self, target_paragraphs, answer_start_indices, answer_end_indices, tokenizer):
        #self.clickbait_tweets = clickbait_tweets
        self.target_paragraphs = target_paragraphs
        self.answer_start_indices = answer_start_indices
        self.answer_end_indices = answer_end_indices
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.target_paragraphs)

    def __getitem__(self, idx):
        target_paragraph = self.target_paragraphs[idx]
        answer_start_idx = self.answer_start_indices[idx]
        answer_end_idx = self.answer_end_indices[idx]

        inputs = self.tokenizer.encode_plus(
            target_paragraph,
            add_special_tokens=True,
            return_tensors='pt',
            max_length=512,
            truncation=True        )

        input_ids = inputs['input_ids'].squeeze()
        token_type_ids = inputs['token_type_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()

        start_positions = torch.tensor(answer_start_idx)
        end_positions = torch.tensor(answer_end_idx)

        return input_ids,token_type_ids, attention_mask, start_positions, end_positions

In [7]:
def train(model, train_dataloader, optimizer, device, epoch):
    model.train()
    train_loss = 0
    total_correct = 0
    total_samples = 0

    for step, batch in enumerate(train_dataloader):
        input_ids,token_type_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )
        loss = outputs[0]
        train_loss += loss.item()

        start_preds, end_preds = outputs[1], outputs[2]
        _, start_preds = start_preds.max(dim=1)
        _, end_preds = end_preds.max(dim=1)

        for i in range(len(input_ids)):
          start_pred = start_preds[i].item()
          end_pred = end_preds[i].item()
          if start_pred > end_pred:
            # Penalize the model with a higher loss if the start index is greater than the end index
            additional_loss = torch.tensor(4).to(device)
            loss += additional_loss

        total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
        total_samples += start_positions.size(0)

        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            acc = 100.0 * total_correct / total_samples
            avg_loss = train_loss / (step + 1)
            print(f"Step [{step}/{len(train_dataloader)}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    avg_loss = train_loss / len(train_dataloader)
    acc = 100.0 * total_correct / total_samples
    print(f"Epoch [{epoch}], Train Loss: {avg_loss:.4f}, Train Acc: {acc:.2f}%")

    return avg_loss, acc

In [8]:

def eval(model, dataloader, tokenizer, device, epoch, results):
    logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
    warnings.filterwarnings('ignore')
    model.eval()
    eval_loss = 0
    total_correct = 0
    total_samples = 0
    blue_score = 0
    meteor_score = 0
    f1_score = 0

    with torch.no_grad():
        for step,batch in enumerate(dataloader):
            input_ids, token_type_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]
            print(step, len(batch))
            outputs = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask
            )

            start_logits, end_logits = outputs.start_logits, outputs.end_logits
            start_preds, end_preds = start_logits.argmax(dim=1), end_logits.argmax(dim=1)

            for i in range(len(input_ids)):
                input_id = input_ids[i].tolist()
                start_pred = start_preds[i].item()
                end_pred = end_preds[i].item()

                answer = tokenizer.decode(input_id[start_pred:end_pred+1], skip_special_tokens=True)
                answer = str(answer.strip())

                reference = tokenizer.decode(input_id[start_positions[i]:end_positions[i]+1], skip_special_tokens=True)
                reference = str(reference.strip())

                current_statement_blue = nltk.translate.bleu_score.sentence_bleu([reference], answer)
                blue_score += current_statement_blue

                current_statement_meteor = nltk.translate.meteor_score.meteor_score([tokenizer.tokenize(reference)], tokenizer.tokenize(answer))
                meteor_score += current_statement_meteor

                precision, recall, f1 = bert_score.score([answer], [reference], lang="en",model_type='bert-base-uncased')
                f1_score += f1


                new_row = {'epoch': epoch,
                           'target': tokenizer.decode(input_id,skip_special_tokens=True),
                           'predicted': answer,
                           'original' : reference,
                           'start_pred' : start_pred,
                           'start_ori' : start_positions[i].item(),
                           'end_preds' : end_pred,
                           'end_ori' : end_positions[i].item(),
                           'blue_score' : current_statement_blue,
                           'meteor_score' : current_statement_meteor,
                           'f1_score' : f1}

                results = results.append(new_row, ignore_index = True)


            total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
            total_samples += start_positions.size(0)

    avg_loss = eval_loss / len(dataloader)
    acc = 100.0 * total_correct / total_samples
    blue_score /= total_samples
    meteor_score /= total_samples
    f1_score = f1_score.item()/ total_samples

    print(f"Eval Loss: {avg_loss:.4f}, Eval Acc: {acc:.2f}%, BLUE Score: {blue_score:.4f}, METEOR Score: {meteor_score:.4f}, F1 Score: {f1_score:.4f}")

    return results

In [9]:
# Load data
train_dataset, validation_dataset = data_preprocess()

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/778 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (588 > 512). Running this sequence through the model will result in indexing errors


In [10]:
train_dataset.shape

(1120, 9)

In [11]:
validation_dataset.shape

(284, 9)

In [12]:


target_paragraphs = train_dataset['target_paragraphs']
answer_start_indices = train_dataset['ans_start_token_idx']
answer_end_indices = train_dataset['ans_end_token_idx']

val_target_paragraphs = validation_dataset['target_paragraphs']
val_answer_start_indices = validation_dataset['ans_start_token_idx']
val_answer_end_indices = validation_dataset['ans_end_token_idx']

# Initialize tokenizer and model
tokenizer = DebertaTokenizer.from_pretrained('Palak/microsoft_deberta-base_squad',truncation = True)
model = DebertaForQuestionAnswering.from_pretrained('Palak/microsoft_deberta-base_squad')

# Create dataset and dataloader
train_dataset = ClickbaitSpoilerDataset(target_paragraphs, answer_start_indices, answer_end_indices, tokenizer)
val_dataset = ClickbaitSpoilerDataset(val_target_paragraphs, val_answer_start_indices, val_answer_end_indices, tokenizer)

def collate_fn(batch):
  input_ids = [item[0] for item in batch]
  token_type_ids = [item[1] for item in batch]
  attention_mask = [item[2] for item in batch]
  start_positions = [item[3] for item in batch]
  end_positions = [item[4] for item in batch]

  input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
  token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
  attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=tokenizer.pad_token_id)

  return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), torch.tensor(start_positions), torch.tensor(end_positions)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True,collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True,collate_fn=collate_fn)

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

Downloading (…)lve/main/config.json:   0%|          | 0.00/782 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/554M [00:00<?, ?B/s]

In [13]:
# Train model
for epoch in range(5):
    logging.getLogger("transformers").setLevel(logging.ERROR)
    train(model, train_dataloader, optimizer, device,epoch)


results = pd.DataFrame(columns=['epoch','target', 'predicted','original','start_pred','start_ori','end_preds','end_ori','blue_score','meteor_score','f1_score'])
results = eval(model,val_dataloader, tokenizer, device,1, results)


  return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), torch.tensor(start_positions), torch.tensor(end_positions)


Step [0/140], Train Loss: 7.0022, Train Acc: 0.00%
Step [100/140], Train Loss: 3.2246, Train Acc: 9.03%
Epoch [0], Train Loss: 3.1136, Train Acc: 10.45%
Step [0/140], Train Loss: 2.0589, Train Acc: 25.00%
Step [100/140], Train Loss: 1.8564, Train Acc: 26.86%
Epoch [1], Train Loss: 1.8664, Train Acc: 27.68%
Step [0/140], Train Loss: 0.9763, Train Acc: 25.00%
Step [100/140], Train Loss: 0.8679, Train Acc: 54.58%
Epoch [2], Train Loss: 0.9386, Train Acc: 52.23%
Step [0/140], Train Loss: 0.2367, Train Acc: 87.50%
Step [100/140], Train Loss: 0.4521, Train Acc: 73.89%
Epoch [3], Train Loss: 0.4727, Train Acc: 72.77%
Step [0/140], Train Loss: 0.3756, Train Acc: 87.50%
Step [100/140], Train Loss: 0.2333, Train Acc: 87.13%
Epoch [4], Train Loss: 0.2492, Train Acc: 86.16%
0 5


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]



1 5
2 5
3 5
4 5
5 5




6 5




7 5




8 5




9 5




10 5
11 5
12 5
13 5




14 5




15 5




16 5
17 5




18 5




19 5




20 5
21 5
22 5
23 5




24 5
25 5




26 5




27 5




28 5
29 5




30 5




31 5




32 5
33 5




34 5




35 5




Eval Loss: 0.0000, Eval Acc: 11.97%, BLUE Score: 0.3718, METEOR Score: 0.4189, F1 Score: 0.5772


In [None]:
results.to_csv('/content/gdrive/My Drive/DeBerta_novel_passage.csv', mode='a', header=False, index=False)


# Save model
model.save_pretrained('/content/gdrive/My Drive/DeBerta_novel_passage_model')