In [10]:
import tensorflow_datasets as tfds 
import pandas as pd
import itertools


In [11]:
def _split_into_words(sentences):
    """Splits multiple sentences into words and flattens the result"""
    return list(itertools.chain(*[_.split(" ") for _ in sentences]))


def _get_word_ngrams(n, sentences, exclusive=True):
    """Calculates word n-grams for multiple sentences"""
    
    assert len(sentences) > 0
    assert n > 0

    words = _split_into_words(sentences)
    return _get_ngrams(n, words)

def _get_ngrams(n, text):
    """Calculates n-grams.
    Args:
      n: which n-grams to calculate
      text: A list of tokens
    Returns:
      A set of n-grams """

    n_grams = set()

    # Loop through words, creating n-grams
    for i in range((len(text)) - n + 1):
        ngram = " ".join(text[i:i+n])
        n_grams.add(ngram)

    return n_grams

In [12]:
def rouge_n(evaluated_sentences, reference_sentences,
            n=2, raw_results=False, exclusive=True):
    """
    Computes ROUGE-N of two text collections of sentences.
    Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/
    papers/rouge-working-note-v1.3.1.pdf
    Args:
      evaluated_sentences: The sentences that have been picked by the
                           summarizer
      reference_sentences: The sentences from the referene set
      n: Size of ngram.  Defaults to 2.
    Returns:
      A tuple (f1, precision, recall) for ROUGE-N
    Raises:
      ValueError: raises exception if a param has len <= 0
    """
    if len(evaluated_sentences) <= 0:
        raise ValueError("Hypothesis is empty.")
    if len(reference_sentences) <= 0:
        raise ValueError("Reference is empty.")

    evaluated_ngrams = _get_word_ngrams(
        n, evaluated_sentences, exclusive=exclusive)
    reference_ngrams = _get_word_ngrams(
        n, reference_sentences, exclusive=exclusive)
    reference_count = len(reference_ngrams)
    evaluated_count = len(evaluated_ngrams)

    # Gets the overlapping ngrams between evaluated and reference
    overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
    overlapping_count = len(overlapping_ngrams)

    if raw_results:
        o = {
            "hyp": evaluated_count,
            "ref": reference_count,
            "overlap": overlapping_count
        }
        return o
    else:
        return f_r_p_rouge_n(
            evaluated_count, reference_count, overlapping_count)

In [13]:
import numpy as np
from dataclasses import dataclass
from typing import Dict, List
from nltk.tokenize import sent_tokenize
from sklearn.cluster import KMeans

@dataclass(frozen=True)
class Clustering:
    """This class is used to cluster sentence embeddings in order to execute
    text summarization. 
    Args:
    - features (np.ndarray): sentence embeddings
    - random_state (int - optional): random state for random seed
    """

    features: np.ndarray
    random_state: int = 1

    def __define_model(self, k: int) -> None:
        """used to define KNN clustering model"""

        model = KMeans(n_clusters=k, random_state=self.random_state)
        object.__setattr__(self, 'model', model)

    def __find_closest_sents(self, centroids: np.ndarray) -> Dict:
        """
        Find the closest arguments to centroid.
        - centroids: Centroids to find closest.
        - return: Closest arguments.
        """

        centroid_min = 1e10
        cur_arg = -1
        args = {}
        used_idx = []

        for j, centroid in enumerate(centroids):

            for i, feature in enumerate(self.features):
                value = np.linalg.norm(feature - centroid)

                if value < centroid_min and i not in used_idx:
                    cur_arg = i
                    centroid_min = value

            used_idx.append(cur_arg)
            args[j] = cur_arg
            centroid_min = 1e10
            cur_arg = -1

        return args

    def cluster(self, ratio: float = 0.2,
                num_sentences: int = None) -> List[int]:
        """
        Clusters sentences based on the ratio.
        - ratio: Ratio to use for clustering.
        - num_sentences: Number of sentences. Overrides ratio.
        return: Sentences index that qualify for summary.
        """

        # set k value
        if num_sentences is not None:
            if num_sentences == 0:
                return []
            k = min(num_sentences, len(self.features))
        else:
            k = max(int(len(self.features) * ratio), 1)

        # define n train the model
        self.__define_model(k)
        self.model.fit(self.features)

        # find the closest embeddings to the center
        centroids = self.model.cluster_centers_
        cluster_args = self.__find_closest_sents(centroids)

        sorted_values = sorted(cluster_args.values())
        return sorted_values

In [14]:
import re
import string
from typing import Union

REGEX_URL = r'((http|https)\:\/\/)[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
clear_url = lambda text: re.sub(REGEX_URL, ' ', text)
DOT_REGEX = r"(?<!\w)(?:[A-Z][A-Za-z]{,3}|[a-z]{1,2})\."

@dataclass(frozen=True)
class Preprocessing:
    """Preprocessing class used to preprocess news text before Text
    Summarization is applied. """

    def _clear_content_head(self, content: str, site_name: str,
                           head_pattern: str=r"\s\-+\s") -> str:
        """used to clear any head in given news content"""

        match = re.search(head_pattern, content)
        if match:
            idx_end = match.end()
            site_name = site_name.split()[0]
            if site_name.lower() in content[:idx_end].lower():
                content = content[idx_end:]

        return content

    def _clear_abbreviation_dot(self, text: str) -> str:
        """used to rip off abbreviation dot in given text"""

        # replace any matched abbr with empty string
        text_list = list(text)
        for i, match in enumerate(re.finditer(DOT_REGEX, text)):
            no_dot = match.group().replace('.', '')
            idx = match.span()
            text_list[idx[0]-i: idx[1]-i] = no_dot

        # join list of texts and clear multiple whitespaces
        text = ''.join(text_list)
        text = re.sub(' +', ' ', text)

    def __call__(self, content: str, site_name: str) -> Union[str, bool]:

        """the method is used to:
        - clear any content head
        - clear any heading/tailing whitespace & punct
        - clear any abbreviation dot

        Args:
        - content (str): news content
        - site_name (str): news site name

        Return:
        preprocessed content
        """

        content = self._clear_content_head(content, site_name)
        content = clear_url(content)

        # clear leadding/trailing whitespaces & puncts
        content = content.strip(string.punctuation)
        content = content.strip()

        # change multiple whitespaces to single one
        content = re.sub(' +', ' ', content)

        # clear whitespace before dot
        content = re.sub(r'\s+([?,.!"])', r'\1', content)

        return content

In [15]:
from typing import List, Tuple
from dataclasses import dataclass
from nltk.tokenize import sent_tokenize
from gensim.models.word2vec import Word2Vec

@dataclass(frozen=True)
class Word2VecSummarizer:
    """The main class for Word2Vec Summarizer

    Args:
    - model: A gensim Word2Vec model (optional)
    - random_state: state for random seed (optional)
    """
    def __init__(self, model: Word2Vec, random_state: int=1):
        object.__setattr__(self, 'model', model)
        object.__setattr__(self, 'random_state', random_state)

    def __split_sentence(self, text: str) -> List[str]:
        """used to split given text into sentences"""
        sentences = sent_tokenize(text)
        return [sent for sent in sentences if len(sent) >= 5]

    def __set_embedder(self) -> None:
        """used to instantiate Embedder object"""
        embedder = Embedder(self.model)
        object.__setattr__(self, 'embedder', embedder)

    def __set_clusterer(self, features: np.ndarray,
                        random_state: int) -> None:
        """used to instantiate Clustering object"""
        clusterer = Clustering(features, random_state)
        object.__setattr__(self, 'clusterer', clusterer)

    def summarize(self, text: str,
                  use_first: bool = True,
                  num_sentences: int = None,
                  ratio: float = 0.2,
                  return_oov: bool = False) -> Tuple[List[str], np.ndarray]:
        
        """This method executes the summarization part. It returns a tuple of sentences and related embeddings (and OOV list if return_oov set to True)"""
        
        list_sentence = self.__split_sentence(text)
        self.__set_embedder()

        # set buffers
        sent_vecs = []
        oov_list = []

        # loop through each sentence to create each embeddings
        for sentence in list_sentence:
            if return_oov:
                vec, oov = self.embedder.embed(sentence, return_oov)
                oov_list.extend(oov)
            else:
                vec = self.embedder.embed(sentence, return_oov)

            # check if no OOV returned
            if isinstance(vec, np.ndarray):
                sent_vecs.append(vec)

        sent_vecs = np.array(sent_vecs) # create array of all embeddings

        # instantiate clustering & process
        self.__set_clusterer(sent_vecs, self.random_state)
        summary_idx = self.clusterer.cluster(ratio, num_sentences)

        if use_first:
            if not summary_idx:
                summary_idx.append(0)

            elif summary_idx[0] != 0:
                summary_idx.insert(0, 0)

        sentences = [list_sentence[idx] for idx in summary_idx]
        embeddings = np.asarray([sent_vecs[idx] for idx in summary_idx])

        if return_oov:
            return sentences, oov_list
        return sentences

In [16]:
@dataclass(frozen=True)
class Embedder:
    """This class is used to create word embeddings from given sentence. It converts each token of given sentence 
        to its representative vector then calculates the mean of all tokens in given sentence in order to get a
    sentence embedding.

    Arg:
    - model: a gensim Word2Vec model
    """

    model: Word2Vec

    def __get_vector(self, token: str) -> np.ndarray:
        """used to convert given token to its representative vector"""
        try:
            return self.model.wv.get_vector(token)
        except KeyError:
            return False

    def __averaging(self, token_matrix: np.ndarray) -> np.ndarray:
        """used to calculate mean of an array of vectors in order to get a
        sentence embedding"""
        return np.mean(token_matrix, axis=0)

    def embed(self, sentence: str, return_oov: bool=False) -> np.ndarray:
        """combine all other methods to execute the embedding process.
        
        Args:
        - sentence (str): a sentence to be process to get its embedding
        - return_oov(bool): indicate if you'd like to return the OOV
        (out-of-vocabulary) tokens
        
        Returns:
        If all tokens in given sentence are OOV tokens, return False (and with
        list of OOVs if 'return_oov' set to True).
        else, return the sentence embedding (and with list of OOVs if
        'return_oov' set to True).
        """

        # make the given sentence lowercase and collect only words
        list_tok = re.findall(r"\w+", sentence.lower())

        # buffers
        list_vec = []
        OOV_tokens = []

        # loop through each token of given sentence
        for token in list_tok:
            tokvec = self.__get_vector(token) # convert to vector

            # check if no OOV token produced
            if isinstance(tokvec, np.ndarray):
                list_vec.append(tokvec)
            else:
                OOV_tokens.append(token)

        # if all tokens in given sentence are OOV tokens
        if not list_vec:
            if return_oov:
                return False, OOV_tokens
            return False

        # if not
        list_vec = np.array(list_vec)
        if return_oov:
            return (self.__averaging(list_vec), OOV_tokens)
        return self.__averaging(list_vec)

In [17]:
def f_r_p_rouge_n(evaluated_count, reference_count, overlapping_count):
    # Handle edge case. This isn't mathematically correct, but it's good enough
    if evaluated_count == 0:
        precision = 0.0
    else:
        precision = overlapping_count / evaluated_count

    if reference_count == 0:
        recall = 0.0
    else:
        recall = overlapping_count / reference_count

    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))

    return {"f": f1_score, "p": precision, "r": recall}

In [20]:
from sklearn.model_selection import KFold
import tensorflow_datasets as tfds 
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')

summaries = []
vals_highs = []
fpr = tpr = []
preprocessor = Preprocessing()

data = list(tfds.load('cnn_dailymail', split='all', as_supervised=True)) #CNN Daily Mail dataset
data = data[:1000] #Past a certain size, the notebook stopped working so must experiment with different sizes
kfold = KFold(n_splits=10, shuffle=True)
    
for train, test in kfold.split(data):

  data_test = [data[i] for i in test]
  data_train = [data[i] for i in train]

  vals_highlights  = [data_test[i][1] for i in range(len(data_test))]



  for i in range(len(data_train)): 
    data_train[i] = preprocessor(data[i][0].numpy().decode('utf-8'), 'Daily Mail' )
    data_train[i] = word_tokenize(data[i][0].numpy().decode('utf-8')) #tokenizes each training article
    
    
  summarizes = Word2VecSummarizer(Word2Vec(data_train))



  for i in range(len(data_test)):
    clean_text = preprocessor(data_test[i][0].numpy().decode('utf-8'), 'Daily Mail' )
    summaries.append(summarizes.summarize(clean_text)[0])



  for i in range(len(vals_highlights)):
    vals_highlights[i] = vals_highlights[i].numpy().decode('utf-8')
    vals_highs.append(vals_highlights[i])

  scores = rouge_n(summaries, vals_highs)
  
  tpr.append(scores['r'])
  FP_Rate = (1-scores['p']) / ((1-scores['p']) + scores['r'] / scores['f'])
  fpr.append(FP_Rate)

  print(scores) #recall is true positive; FPR = (1 - precision) / ((1 - precision) + recall / F1)

  









[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'f': 0.08834916855171028, 'p': 0.1426947129050597, 'r': 0.06398164669895488}
{'f': 0.09701042058635606, 'p': 0.1645717626425022, 'r': 0.06877595895431224}
{'f': 0.10624169577534219, 'p': 0.18514791708593278, 'r': 0.07449392712550608}
{'f': 0.10916607611810407, 'p': 0.18906154783668494, 'r': 0.07673757111056147}
{'f': 0.1098807455115188, 'p': 0.19379068602904356, 'r': 0.07667921537547058}
{'f': 0.1102476890213645, 'p': 0.19674972914409533, 'r': 0.07657923589440836}
{'f': 0.11230129987770805, 'p': 0.1995113241236726, 'r': 0.07814340400471143}
{'f': 0.11665155227415015, 'p': 0.2036691289877425, 'r': 0.08173170890611767}
{'f': 0.11802977444653152, 'p': 0.20574336604870955, 'r': 0.08275095763034007}
{'f': 0.12152836452022045, 'p': 0.21342718640015937, 'r': 0.08495004493312894}
