In [1]:
!pip install transformers
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m83.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

In [2]:
import nltk

# NLTK data packages fetched up front so the tokenizer, tagger, chunker,
# stop-word list and WordNet lookups used later in the notebook can run.
nltk_packages = (
    'maxent_ne_chunker',
    'words',
    'averaged_perceptron_tagger',
    'wordnet',
    'stopwords',
    'punkt',
)

# nltk.download returns a boolean status; collect the packages that did not
# install so a failed download is reported here instead of surfacing later
# as a LookupError in the middle of preprocessing.
failed_nltk_packages = [name for name in nltk_packages if not nltk.download(name)]
if failed_nltk_packages:
    raise RuntimeError(
        "Could not download NLTK packages: " + ", ".join(failed_nltk_packages)
    )

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
from transformers import DebertaTokenizer, DebertaForQuestionAnswering

from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import nltk
nltk.download('wordnet')
import pandas as pd
import logging
import warnings
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.corpus import wordnet


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Mount Google Drive under /content/gdrive so files can be exchanged with it.
# This import only exists inside the Google Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
def sentiment(text):
    """Label ``text`` as "Positive", "Negative" or "Neutral".

    The label is decided by the sign of the polarity score TextBlob assigns
    to the text: above zero is positive, below zero is negative, and
    anything else is neutral.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return "Positive"
    if polarity < 0:
        return "Negative"
    return "Neutral"

def generate_NER(paragraph, call_flag):
    """Extract named entities or plain keywords from ``paragraph``.

    The text is tokenized, English stop words are removed, the remaining
    tokens are lemmatized and POS-tagged, and the tagged tokens are run
    through NLTK's named-entity chunker.

    Args:
        paragraph: Text to analyse.
        call_flag: ``'NER'`` to get the entities; any other value returns
            the keywords instead.

    Returns:
        When ``call_flag == 'NER'``: a lower-cased string with every PERSON,
        ORGANIZATION and LOCATION entity joined by ``", "`` (empty string
        when none were found).
        Otherwise: a list of word strings, one for every token that is not
        part of one of those entities.
    """
    entity_labels = {'PERSON', 'ORGANIZATION', 'LOCATION'}

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(paragraph)
        if token.lower() not in stop_words
    ]

    keywords = []
    entities = []
    for chunk in ne_chunk(pos_tag(lemmatized_words)):
        if not hasattr(chunk, 'label'):
            # Plain (word, tag) pair: the word itself is the keyword.
            keywords.append(chunk[0])
        elif chunk.label() in entity_labels:
            entities.append(' '.join(leaf[0] for leaf in chunk))
        else:
            # Subtree carrying some other label (for example GPE). Indexing it
            # with [0] yields a (word, tag) tuple, not a word, so take the word
            # of every leaf to keep the keyword list made of strings only.
            keywords.extend(leaf[0] for leaf in chunk)

    if call_flag == 'NER':
        return ", ".join(entities).lower()
    return keywords

def query_reformulation(query):
    """Rewrite ``query`` by swapping keyword terms for a WordNet lemma name.

    The query is split on whitespace. Every term that appears in the keyword
    list returned by ``generate_NER(query, "Synset")`` is replaced by the
    first lemma name of its first WordNet synset; terms without any synset,
    and terms that are not keywords, are kept unchanged. The resulting terms
    are joined back together with single spaces.
    """
    replaceable_terms = generate_NER(query, "Synset")

    def first_lemma_name(word):
        # Lazily walk synsets -> lemmas and stop at the very first name;
        # fall back to the word itself when WordNet knows nothing about it.
        lemma_names = (
            lemma.name()
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
        )
        return next(lemma_names, word)

    return " ".join(
        first_lemma_name(word) if word in replaceable_terms else word
        for word in query.split()
    )

def _clean_text(text, strip_double_quotes=False):
    """Lower-case ``text`` and drop apostrophes, periods and, optionally, double quotes."""
    if strip_double_quotes:
        text = text.replace('"', '')
    return text.lower().replace("'", "").replace(".", "")


def _has_usable_keywords(raw_keywords):
    """Return True when the record's own keywords are a non-placeholder string."""
    if not isinstance(raw_keywords, str):
        return False
    if '&nbsp' in raw_keywords:
        return False
    # A value made only of commas/whitespace carries no keyword at all.
    return bool(raw_keywords.strip(' ,'))


def load_dataset(file_name, data_dir='/content/'):
    """Read a JSON-lines file and build the model-ready DataFrame.

    Each non-blank line must be a JSON object with the keys ``uuid``,
    ``postText``, ``targetTitle``, ``targetParagraphs``, ``targetKeywords``,
    ``spoiler`` and ``tags``. Records whose first tag is ``'multi'`` or whose
    uuid (dashes removed) is not alphanumeric are skipped before any text
    processing is done.

    For every kept record the post text, title, article, keywords and spoiler
    are lower-cased and stripped of apostrophes and periods (article and
    spoiler also lose double quotes). The post text is extended with its
    ``query_reformulation`` and labelled with ``sentiment``. Keywords supplied
    by the record are used when they are a real value; when they are missing,
    only commas/whitespace, or contain ``&nbsp``, they are generated from the
    article with ``generate_NER(article, "NER")``.

    If the cleaned spoiler does not occur verbatim in the assembled text, the
    text and the spoiler are printed for inspection; the record is still kept.

    Args:
        file_name: Name of the JSON-lines file.
        data_dir: Directory that contains ``file_name``.

    Returns:
        A pandas DataFrame with the columns ``target_paragraphs``, ``spoiler``
        and ``label`` (empty, but with those columns, when no record is kept).
    """
    import json
    import os

    import pandas as pd

    columns = ['target_paragraphs', 'spoiler', 'label']
    records = []

    with open(os.path.join(data_dir, file_name), encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)

            # Apply the cheap filters first so skipped records never reach
            # the tokenizer / chunker / sentiment calls below.
            label = record['tags'][0]
            if label == 'multi':
                continue
            if not str(record['uuid']).replace("-", "").isalnum():
                continue

            article_title = _clean_text(record['targetTitle'])
            # Periods are stripped afterwards, so paragraphs end up separated
            # by a single space.
            article = _clean_text(
                '. '.join(record['targetParagraphs']), strip_double_quotes=True
            )

            raw_keywords = record['targetKeywords']
            if _has_usable_keywords(raw_keywords):
                article_keywords = raw_keywords
            else:
                article_keywords = generate_NER(article, "NER")
            article_keywords = _clean_text(article_keywords)

            spoiler = _clean_text(record['spoiler'][0], strip_double_quotes=True)

            post_text = _clean_text(record['postText'][0])
            post_text = post_text + ". " + query_reformulation(post_text)

            sentiment_val = sentiment(post_text)

            target_text = "Question - " + post_text + "\n" + \
                "Question_Sentiment - " + sentiment_val + "\n" + \
                "Article_Keyword - " + article_keywords + "\n" + \
                "Article_Title - " + article_title + "\n" + \
                "Article - " + article + "\n" + \
                "Label Type - " + label

            if spoiler not in target_text:
                # Surface records whose answer span cannot be located.
                print(target_text)
                print(spoiler)

            records.append({
                'target_paragraphs': target_text,
                'spoiler': spoiler,
                'label': label,
            })

    # Build the frame once, after the loop; explicit columns keep the schema
    # stable even when every record was filtered out.
    return pd.DataFrame(records, columns=columns)

In [6]:
# Build the processed training and validation frames from the raw JSON-lines files.
train_dataset = load_dataset('train.jsonl')
validation_dataset = load_dataset('validation.jsonl')

# Write each processed frame to a CSV file in the current working directory.
for processed_frame, csv_name in (
    (train_dataset, 'processed_training_dataset.csv'),
    (validation_dataset, 'processed_validation_dataset.csv'),
):
    processed_frame.to_csv(csv_name)

In [11]:
# Copy both processed CSV files into the "My Drive" folder of the mounted Google Drive.
# NOTE(review): the saved output of this cell mentions 'processed_train_dataset.csv',
# a name neither command uses — it looks left over from an earlier run; re-run to refresh.
!cp processed_training_dataset.csv "/content/gdrive/My Drive/"
!cp processed_validation_dataset.csv "/content/gdrive/My Drive/"

cp: cannot stat 'processed_train_dataset.csv': No such file or directory
