In [1]:
!pip install transformers
!pip install torch
! pip install bert_score
! pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m97.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1
Looking in i

In [2]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
from transformers import DebertaTokenizer, DebertaForQuestionAnswering

from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from bert_score import score
import nltk
nltk.download('wordnet')
from evaluate import load
bertscore = load("bertscore")
import bert_score
import pandas as pd
import logging
import warnings
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.corpus import wordnet
import json

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


#### QUESTION - ANSWERING

In [11]:
def sentiment(text):
  sentiment_polarity = TextBlob(text).sentiment.polarity
  if sentiment_polarity > 0:
    sentiment = "Positive"
  elif sentiment_polarity < 0:
    sentiment = "Negative"
  else:
    sentiment = "Neutral"
  return sentiment

def generate_NER(paragraph, call_flag):

  words = word_tokenize(paragraph)

  stop_words = set(stopwords.words('english'))
  filtered_words = [word for word in words if word.lower() not in stop_words]

  lemmatizer = WordNetLemmatizer()
  lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

  pos_tags = pos_tag(lemmatized_words)

  ner_tags = ne_chunk(pos_tags)

  keywords = []
  entities = []

  for chunk in ner_tags:
      if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
          entities.append(' '.join(c[0] for c in chunk))
      elif hasattr(chunk, 'label') and chunk.label() == 'ORGANIZATION':
          entities.append(' '.join(c[0] for c in chunk))
      elif hasattr(chunk, 'label') and chunk.label() == 'LOCATION':
          entities.append(' '.join(c[0] for c in chunk))
      else:
          keywords.append(chunk[0])

  if call_flag == 'NER':
    return ", ".join(entities).lower()
  else:
    return keywords

def query_reformulation(query):
    new_query = []
    keywords = generate_NER(query, "Synset")
    for term in query.split():
        if term in keywords:
          synonyms = []
          for syn in wordnet.synsets(term):
              for lemma in syn.lemmas():
                  synonyms.append(lemma.name())
          if synonyms:
              new_query.append(synonyms[0])
          else:
              new_query.append(term)
        else:
          new_query.append(term)
    return " ".join(new_query)


def load_dataset(file_name):
    
    df = []
    i = 0
    with open("/content/" + file_name,encoding="utf-8") as f:
        for i in f:

          i = json.loads(i)
          uuid = i['uuid']
          post_text = i['postText']
          article_title = i['targetTitle'].replace("\'","").lower().replace(".","")
          article = '. '.join(i['targetParagraphs']).replace('\"','').lower().replace("\'","").replace(".","")
          
          article_description = i['targetDescription']
          if article_description is None:
            article_description = " "
          article_description = article_description.lower().replace("\'","").replace(".","")

          article_keywords = i['targetKeywords']
          if article_keywords is None or ',' or '&nbsp':
            article_keywords = generate_NER(article,"NER")
          article_keywords = article_keywords.lower().replace("\'","").replace(".","")

          spoiler = i['spoiler']
          spoiler = spoiler[0].replace('\"','').lower().replace("\'","").replace(".","")

          prediction = i['predicted_labels']

          if prediction == 1:
            label = 'phrase'
          elif prediction == 0:
            label = 'passage'

          post_text = post_text[0].lower().replace("\'","").replace(".","")
          post_text = post_text + ". " + query_reformulation(post_text)
          
          sentiment_val =  sentiment(post_text)
          
          if str(uuid).replace("-","").isalnum() == False:
            continue

          temp_article = ""
          
          temp_article = "Question - " + post_text + "\n" + \
                  "Question_Sentiment - " + sentiment_val + "\n" +\
                  "Article_Keyword - " + article_keywords + "\n"  + \
                  "Article_Title - " + article_title + "\n" + \
                  "Article - " + article + "\n" + \
                  "Label Type - " + label 

          t = temp_article.index(spoiler)

          df += [{
                  'target_paragraphs': 
                  "Question - " + post_text + "\n" + \
                  "Question_Sentiment - " + sentiment_val + "\n" +\
                  "Article_Keyword - " + article_keywords + "\n"  + \
                  "Article_Title - " + article_title + "\n" + \
                  "Article - " + article + "\n" + \
                  "Label Type - " + label ,
                  'spoiler': spoiler,
                  'label': label
                  }]

          data = pd.DataFrame(df)


    return data

In [6]:
def data_preprocess():

  validation_dataset = load_dataset("RoBERTa_classified_validation_dataset.jsonl")
  validation_dataset['ans_start_idx'] = validation_dataset.apply(lambda row: row['target_paragraphs'].index(row['spoiler']), axis=1)
  validation_dataset['ans_end_idx'] = validation_dataset.apply(lambda row : row['ans_start_idx'] + len(row['spoiler']), axis=1)

  tokenizer = DebertaTokenizer.from_pretrained('Palak/microsoft_deberta-base_squad')

  validation_dataset['ans_start_token_idx'] = validation_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_start_idx']], add_special_tokens=False)), axis=1)
  validation_dataset['ans_end_token_idx'] = validation_dataset.apply(lambda row: len(tokenizer.encode(row['target_paragraphs'][:row['ans_end_idx']], add_special_tokens=False)), axis=1)

  return validation_dataset


In [7]:
class ClickbaitSpoilerDataset(Dataset):
    def __init__(self, target_paragraphs, answer_start_indices, answer_end_indices, tokenizer):
        self.target_paragraphs = target_paragraphs
        self.answer_start_indices = answer_start_indices
        self.answer_end_indices = answer_end_indices
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.target_paragraphs)

    def __getitem__(self, idx):
        target_paragraph = self.target_paragraphs[idx]
        answer_start_idx = self.answer_start_indices[idx]
        answer_end_idx = self.answer_end_indices[idx]

        inputs = self.tokenizer.encode_plus(
            target_paragraph,
            add_special_tokens=True,
            return_tensors='pt',
            max_length=512,
            truncation=True        )

        input_ids = inputs['input_ids'].squeeze()
        token_type_ids = inputs['token_type_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()

        start_positions = torch.tensor(answer_start_idx)
        end_positions = torch.tensor(answer_end_idx)

        return input_ids,token_type_ids, attention_mask, start_positions, end_positions

In [8]:
def eval(model, dataloader, tokenizer, device, label, results):
    logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
    warnings.filterwarnings('ignore')
    model.eval()
    eval_loss = 0
    total_correct = 0
    total_samples = 0
    bleu_score = 0
    meteor_score = 0
    f1_score = 0
    with torch.no_grad():
        for step,batch in enumerate(dataloader):
            input_ids, token_type_ids, attention_mask, start_positions, end_positions = [x.to(device) for x in batch]
            print(step, len(batch))

            outputs = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask
            )
            start_logits, end_logits = outputs.start_logits, outputs.end_logits
            start_preds, end_preds = start_logits.argmax(dim=1), end_logits.argmax(dim=1)

            for i in range(len(input_ids)):
                input_id = input_ids[i].tolist()
                start_pred = start_preds[i].item()
                end_pred = end_preds[i].item()

                answer = tokenizer.decode(input_id[start_pred:end_pred+1], skip_special_tokens=True)
                answer = str(answer.strip())

                reference = tokenizer.decode(input_id[start_positions[i]:end_positions[i]+1], skip_special_tokens=True)
                reference = str(reference.strip())

                current_statement_bleu = nltk.translate.bleu_score.sentence_bleu([reference], answer)
                bleu_score += current_statement_bleu

                current_statement_meteor = nltk.translate.meteor_score.meteor_score([tokenizer.tokenize(reference)], tokenizer.tokenize(answer))
                meteor_score += current_statement_meteor

                precision, recall, f1 = bert_score.score([answer], [reference], lang="en",model_type='bert-base-uncased')
                f1_score += f1


                new_row = {'label': label,
                           'target': tokenizer.decode(input_id,skip_special_tokens=True),
                           'predicted': answer,
                           'original' : reference,
                           'start_pred' : start_pred,
                           'start_ori' : start_positions[i].item(),
                           'end_preds' : end_pred,
                           'end_ori' : end_positions[i].item(),
                           'bleu_score' : current_statement_bleu,
                           'meteor_score' : current_statement_meteor,
                           'f1_score' : f1}

                results = results.append(new_row, ignore_index = True)


            total_correct += ((start_preds == start_positions) & (end_preds == end_positions)).sum().item()
            total_samples += start_positions.size(0)

    avg_loss = eval_loss / len(dataloader)
    acc = 100.0 * total_correct / total_samples
    bleu_score /= total_samples
    meteor_score /= total_samples
    f1_score = f1_score.item()/ total_samples

    print(f"Eval Loss: {avg_loss:.4f}, Eval Acc: {acc:.2f}%, BLEU Score: {bleu_score:.4f}, METEOR Score: {meteor_score:.4f}, F1 Score: {f1_score:.4f}")

    return results, bleu_score, meteor_score, f1_score

In [9]:
# Initialize tokenizer and model
tokenizer = DebertaTokenizer.from_pretrained('Palak/microsoft_deberta-base_squad',truncation = True)
phrase_model = DebertaForQuestionAnswering.from_pretrained("/content/gdrive/My Drive/DeBerta_novel_phrase/")
passage_model = DebertaForQuestionAnswering.from_pretrained("/content/gdrive/My Drive/DeBerta_novel_passage/")

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
phrase_model.to(device)
passage_model.to(device)

def collate_fn(batch):
  input_ids = [item[0] for item in batch]
  token_type_ids = [item[1] for item in batch]
  attention_mask = [item[2] for item in batch]
  start_positions = [item[3] for item in batch]
  end_positions = [item[4] for item in batch]

  input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
  token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
  attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=tokenizer.pad_token_id)

  return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), torch.tensor(start_positions), torch.tensor(end_positions)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/778 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

In [14]:
# Load data
validation_dataset = data_preprocess()

#Phrase Data Loader
phrase_data = validation_dataset[validation_dataset["label"] == 'phrase']
phrase_data.reset_index(inplace=True)
phrase_data = phrase_data[['target_paragraphs', 'spoiler', 'label', 'ans_start_idx','ans_end_idx', 'ans_start_token_idx', 'ans_end_token_idx']]

phrase_target_paragraphs = phrase_data['target_paragraphs']
phrase_answer_start_indices = phrase_data['ans_start_token_idx']
phrase_answer_end_indices = phrase_data['ans_end_token_idx']

# Create phrase dataset and dataloader
phrase_dataset = ClickbaitSpoilerDataset(phrase_target_paragraphs, phrase_answer_start_indices, phrase_answer_end_indices, tokenizer)
phrase_dataloader = DataLoader(phrase_dataset, batch_size=8, shuffle=True,collate_fn=collate_fn)

#Passage Data Loader
passage_data = validation_dataset[validation_dataset["label"] == 'passage']
passage_data.reset_index(inplace=True)
passage_data = passage_data[['target_paragraphs', 'spoiler', 'label', 'ans_start_idx','ans_end_idx', 'ans_start_token_idx', 'ans_end_token_idx']]

passage_target_paragraphs = passage_data['target_paragraphs']
passage_answer_start_indices = passage_data['ans_start_token_idx']
passage_answer_end_indices = passage_data['ans_end_token_idx']

# Create passage dataset and dataloader 
passage_dataset = ClickbaitSpoilerDataset(passage_target_paragraphs, passage_answer_start_indices, passage_answer_end_indices, tokenizer)
passage_dataloader = DataLoader(passage_dataset, batch_size=8, shuffle=True,collate_fn=collate_fn)

Token indices sequence length is longer than the specified maximum sequence length for this model (747 > 512). Running this sequence through the model will result in indexing errors


In [15]:
phrase_data

Unnamed: 0,target_paragraphs,spoiler,label,ans_start_idx,ans_end_idx,ans_start_token_idx,ans_end_token_idx
0,Question - here’s how much you should be tippi...,20%,phrase,917,920,218,219
1,Question - this popular soda could cure your h...,sprite,phrase,211,217,48,48
2,Question - the anytime snack you wont feel gui...,smoky paprika-baked garbanzo beans,phrase,508,542,110,120
3,Question - guess who obama just dined with in ...,anthony bourdain,phrase,258,274,71,74
4,Question - the one morning work mistake you ca...,starts later,phrase,647,659,135,136
...,...,...,...,...,...,...,...
325,Question - both campaigns stunned after housto...,hillary clinton,phrase,554,569,101,104
326,Question - one guy decided to explore this aba...,"this was clearly a crack house, and he was sta...",phrase,3837,3990,774,804
327,Question - heres a controversial idea about ch...,china is too quick to rebalance its services s...,phrase,225,276,53,64
328,Question - this texas gop elector announces th...,christopher suprun,phrase,460,478,104,108


In [16]:
results = pd.DataFrame(columns=['Label','target', 'predicted','original','start_pred','start_ori','end_preds','end_ori','bleu_score','meteor_score','f1_score'])


logging.getLogger("transformers").setLevel(logging.ERROR)
results_phrase, bleu_score_phrase, meteor_score_phrase, f1_score_phrase = eval(phrase_model, phrase_dataloader, tokenizer, device,"Phrase", results)
results_passage, bleu_score_passage, meteor_score_passage, f1_score_passage = eval(passage_model, passage_dataloader, tokenizer, device,"Passage", results)

final_bleu = (bleu_score_phrase + bleu_score_passage)/2
final_meteor = (meteor_score_phrase + meteor_score_passage)/2
final_f1 = (f1_score_phrase + f1_score_passage)/2

print("Final Results")
print(f"BLEU Score: {final_bleu:.4f}, METEOR Score: {final_meteor:.4f}, F1 Score: {final_f1:.4f}")


0 5


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]



1 5




2 5
3 5




4 5
5 5




6 5




7 5
8 5
9 5
10 5
11 5
12 5




13 5




14 5
15 5




16 5




17 5




18 5




19 5
20 5




21 5




22 5




23 5
24 5




25 5
26 5
27 5




28 5




29 5




30 5
31 5
32 5
33 5




34 5




35 5
36 5
37 5




38 5
39 5
40 5




41 5
Eval Loss: 0.0000, Eval Acc: 46.67%, BLEU Score: 0.5634, METEOR Score: 0.5485, F1 Score: 0.7095
0 5




1 5




2 5
3 5




4 5
5 5




6 5




7 5
8 5




9 5




10 5




11 5




12 5




13 5




14 5




15 5




16 5




17 5




18 5




19 5




20 5




21 5




22 5




23 5
24 5




25 5




26 5




27 5
28 5




29 5




30 5




31 5




32 5




33 5




34 5




35 5




36 5




37 5




38 5




39 5




40 5
Eval Loss: 0.0000, Eval Acc: 9.17%, BLEU Score: 0.3037, METEOR Score: 0.3841, F1 Score: 0.5154
Final Results
BLEU Score: 0.4336, METEOR Score: 0.4663, F1 Score: 0.6124




In [17]:
results_phrase.to_csv('/content/gdrive/My Drive/DeBerta_Final_Phrase.csv', mode='a', header=False, index=False)
results_passage.to_csv('/content/gdrive/My Drive/DeBerta_Final_Passage.csv', mode='a', header=False, index=False)
