In [20]:
!pip install bert-extractive-summarizer

Collecting bert-extractive-summarizer
  Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl (25 kB)
Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 7.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.8 MB/s 
Installing coll

In [21]:
!pip install transformers



In [2]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


# -----------------------------

In [3]:
import re
import string
import json

from typing import Union, Dict, List, Tuple
from dataclasses import dataclass

import numpy as np
import pandas as pd
from tqdm import tqdm

from transformers import AutoConfig, AutoTokenizer, AutoModel
from summarizer import Summarizer

from gensim.models.word2vec import Word2Vec

from nltk.tokenize import sent_tokenize
from sklearn.cluster import KMeans

from rouge import Rouge

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Set Colab Directory

In [4]:
# Mount Google Drive (Colab only); the notebook later changes into a project
# folder below this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
import os
# Work from the project folder on Drive: the relative 'datasets/' and 'models/'
# paths used further down are resolved against this directory.
os.chdir('/content/drive/My Drive/Colab Notebooks/IndoSum')
os.getcwd()

'/content/drive/My Drive/Colab Notebooks/IndoSum'

# Processing dataset

In [None]:
# Read the raw test split: one JSON document per line.
data = []
with open('datasets/test.01.jsonl', encoding='utf-8') as file:
    # Iterate the handle lazily instead of materialising every line first.
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing
        # inside json.loads.
        if line.strip():
            data.append(json.loads(line))

In [None]:
# create a function to flatten the tokens in the 'paragraphs' key
def flatten_paragraphs(list_paragraphs):
    """Turn nested paragraphs of tokenised sentences into a flat sentence list.

    Args:
    - list_paragraphs: iterable of paragraphs, where each paragraph is an
      iterable of sentences and each sentence an iterable of string tokens.
    Return:
    list with one space-joined string per sentence, in original order.
    """
    return [
        ' '.join(tokens)
        for paragraph in list_paragraphs
        for tokens in paragraph
    ]

In [None]:
def flatten(nested_list):
    """Flatten exactly one level of nesting.

    Args:
    - nested_list: iterable of iterables.
    Return:
    list holding the elements of every inner iterable, in original order.
    """
    return [item for inner in nested_list for item in inner]

In [None]:
def flatten_summaries(list_sentences):
    """Join every tokenised summary sentence into a single string.

    Args:
    - list_sentences: iterable of sentences, each an iterable of string tokens.
    Return:
    list with one space-joined string per sentence, in original order.
    """
    return list(map(' '.join, list_sentences))

In [None]:
# Inspect the 'gold_labels' of the first record after flattening one level.
flatten(data[0]['gold_labels'])

In [None]:
# Inspect the article sentences of the first record as plain strings.
flatten_paragraphs(data[0]['paragraphs'])

In [None]:
# Inspect the reference-summary sentences of the first record as plain strings.
flatten_summaries(data[0]['summary'])

['Eman Ahmed Abd El Aty memiliki berat badan mencapai 500 kilogram sebelum menjalankan operasi di Mumbai Maret lalu dimana ia mengurangi seperlima dari berat badannya .',
 'Abd El Aty diberi diet cairan khusus selama berada di India yang bertujuan menurunkan berat badan .',
 'Kini , berat badannya telah turun drastis sebanyak 323 kilogram dalam tiga bulan .',
 'Sekarang berat badannya tinggal 176,6 kilogram .']

In [None]:
# Attach the sentence-level string versions of article and summary to every
# record (each record is modified in place).
for datum in tqdm(data):
    datum.update(
        flatten_article=flatten_paragraphs(datum['paragraphs']),
        flatten_summary=flatten_summaries(datum['summary']),
    )

100%|██████████| 3762/3762 [00:00<00:00, 18027.22it/s]


In [None]:
# Persist the enriched records as one JSON array.
with open('datasets/test_01.json', 'w') as file:
    json.dump(data, file)

# Collect Data

In [6]:
# Load the processed test records (a single JSON array). No placeholder list is
# created beforehand, so a failed read cannot leave `data` silently empty for
# the cells below.
with open('datasets/test_01.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [7]:
# One string per record: the full article (model input) and the full
# reference summary (ROUGE target).
X_test = [' '.join(record['flatten_article']) for record in data]
y_test = [' '.join(record['flatten_summary']) for record in data]

# Define Text Preprocessing

In [8]:
# Matches http(s) URLs so they can be blanked out of the article body.
REGEX_URL = r'((http|https)\:\/\/)[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
clear_url = lambda text: re.sub(REGEX_URL, ' ', text)
# Matches a short token immediately followed by a dot: either a capitalised
# token of 1-4 letters or a lowercase token of 1-2 letters, not preceded by a
# word character (e.g. "Tbk.").
DOT_REGEX = r"(?<!\w)(?:[A-Z][A-Za-z]{,3}|[a-z]{1,2})\."

############################################################################

@dataclass(frozen=True)
class Preprocessing:
    """Preprocessing class used to preprocess news text before Text
    Summarization is applied.

    - Usage:
    ```
    >>> preprocessor = Preprocessing()
    >>> text = "any news text"
    >>> site_name = "media site"
    >>> clean_text = preprocessor(text, site_name)
    ```
    """

    def _clear_content_head(self, content: str, site_name: str,
                           head_pattern: str=r"\s\-+\s") -> str:
        """Drop the leading head (e.g. "Site , CITY - ") of a news content.

        The head is everything up to and including the first match of
        `head_pattern`. It is removed only when the first word of `site_name`
        occurs (case-insensitively) inside that head.

        Args:
        - content (str): news content
        - site_name (str): news site name
        - head_pattern (str): regex marking the end of the head
        Return:
        content without the head, or the unchanged content when no head is
        detected or `site_name` contains no word.
        """

        match = re.search(head_pattern, content)
        if not match:
            return content

        # an empty / whitespace-only site name cannot identify a head
        site_tokens = site_name.split()
        if not site_tokens:
            return content

        idx_end = match.end()
        if site_tokens[0].lower() in content[:idx_end].lower():
            content = content[idx_end:]

        return content

#################################

    def _clear_abbreviation_dot(self, text: str) -> str:
        """Remove the trailing dot of every token matched by DOT_REGEX.

        Args:
        - text (str): text to clean
        Return:
        the text without those dots and with runs of spaces collapsed.
        """

        # every match ends with exactly one dot, so dropping the last
        # character removes the dot and keeps the letters
        text = re.sub(DOT_REGEX, lambda match: match.group()[:-1], text)

        # clear multiple whitespaces
        return re.sub(' +', ' ', text)

#################################

    def __call__(self, content: str, site_name: str) -> str:

        """the method is used to:
        - clear any content head
        - clear any URL
        - clear any leading/trailing punct & whitespace
        - collapse multiple spaces and drop whitespace before ? , . ! "
        Note: `_clear_abbreviation_dot` is NOT applied here; call it
        explicitly when abbreviation dots have to be removed.
        Args:
        - content (str): news content
        - site_name (str): news site name
        Return:
        preprocessed content
        """

        content = self._clear_content_head(content, site_name)
        content = clear_url(content)

        # clear leading/trailing puncts first, then whitespaces
        content = content.strip(string.punctuation)
        content = content.strip()

        # change multiple whitespaces to single one
        content = re.sub(' +', ' ', content)

        # clear whitespace before punctuation marks
        content = re.sub(r'\s+([?,.!"])', r'\1', content)

        return content

In [9]:
sample_text = """Bisnis.com , JAKARTA - Emiten barang konsumen PT Unilever Indonesia Tbk. memutuskan untuk membagikan dividen interim kepada pemegang saham pada akhir tahun ini. Berdasarkan pengumuman perseroan di harian Bisnis Indonesia hari ini, Senin (22/11/2021), emiten dengan kode saham UNVR ini akan membagikan dividen interim senilai total Rp2,51 triliun. Keputusan pembagian dividen ini diambil dalam Rapat Direksi Unilever Indonesia pada 19 November 2021. Dividen interim itu akan diambil dari laba bersih perseroan untuk periode yang berakhir pada 30 Juni 2021. Dengan jumlah pemegang saham UNVR sebanyak 38,15 miliar saham, artinya satu saham UNVR akan mendapat dividen senilai Rp66. Berikut jadwal pelaksanaan dividen interim UNVR: Berdasarkan laporan keuangan per 30 Juni 2021, UNVR membukukan pendapatan senilai Rp20,17 triliun atau turun 7,32 persen dibandingkan periode yang sama tahun sebelumnya Rp21,77 triliun. Laba perseroan terkoreksi 15,85 persen menjadi Rp3,04 triliun dari sebelumnya Rp3,61 triliun. Laba sebelum bunga, pajak, penyusutan, dan amortisasi (EBITDA) turun 13,91 persen menjadi Rp4,55 triliun dari sebelumnya Rp5,29 triliun. Sebelumnya, UNVR tercatat membagikan dividen final tahun buku 2020 senilai Rp3,81 triliun atau Rp100 per saham pada Juni 2021. Dividen tersebut berasal dari laba bersih tahun penuh 2020. Dengan adanya dividen interim yang akan dibayar pada Desember 2021, maka UNVR membagikan total dividen Rp6,31 triliun sepanjang 2021."""
sample_text

'Bisnis.com , JAKARTA - Emiten barang konsumen PT Unilever Indonesia Tbk. memutuskan untuk membagikan dividen interim kepada pemegang saham pada akhir tahun ini. Berdasarkan pengumuman perseroan di harian Bisnis Indonesia hari ini, Senin (22/11/2021), emiten dengan kode saham UNVR ini akan membagikan dividen interim senilai total Rp2,51 triliun. Keputusan pembagian dividen ini diambil dalam Rapat Direksi Unilever Indonesia pada 19 November 2021. Dividen interim itu akan diambil dari laba bersih perseroan untuk periode yang berakhir pada 30 Juni 2021. Dengan jumlah pemegang saham UNVR sebanyak 38,15 miliar saham, artinya satu saham UNVR akan mendapat dividen senilai Rp66. Berikut jadwal pelaksanaan dividen interim UNVR: Berdasarkan laporan keuangan per 30 Juni 2021, UNVR membukukan pendapatan senilai Rp20,17 triliun atau turun 7,32 persen dibandingkan periode yang sama tahun sebelumnya Rp21,77 triliun. Laba perseroan terkoreksi 15,85 persen menjadi Rp3,04 triliun dari sebelumnya Rp3,61 

In [10]:
# Shared preprocessor instance (also used by the evaluation cells); try it on
# the sample article with "Bisnis" as the site name.
preprocessor = Preprocessing()
preprocessor(sample_text, "Bisnis")

'Emiten barang konsumen PT Unilever Indonesia Tbk. memutuskan untuk membagikan dividen interim kepada pemegang saham pada akhir tahun ini. Berdasarkan pengumuman perseroan di harian Bisnis Indonesia hari ini, Senin (22/11/2021), emiten dengan kode saham UNVR ini akan membagikan dividen interim senilai total Rp2,51 triliun. Keputusan pembagian dividen ini diambil dalam Rapat Direksi Unilever Indonesia pada 19 November 2021. Dividen interim itu akan diambil dari laba bersih perseroan untuk periode yang berakhir pada 30 Juni 2021. Dengan jumlah pemegang saham UNVR sebanyak 38,15 miliar saham, artinya satu saham UNVR akan mendapat dividen senilai Rp66. Berikut jadwal pelaksanaan dividen interim UNVR: Berdasarkan laporan keuangan per 30 Juni 2021, UNVR membukukan pendapatan senilai Rp20,17 triliun atau turun 7,32 persen dibandingkan periode yang sama tahun sebelumnya Rp21,77 triliun. Laba perseroan terkoreksi 15,85 persen menjadi Rp3,04 triliun dari sebelumnya Rp3,61 triliun. Laba sebelum b

# Define Summarizers

## BERT Extractive Summarizer

In [23]:
# Hugging Face checkpoint shared by the config, the tokenizer and the model, so
# the three loaders below cannot drift apart.
BERT_MODEL_NAME = "indobenchmark/indobert-base-p1"

# Load model, model config and tokenizer via Transformers
custom_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=BERT_MODEL_NAME)
# NOTE(review): presumably the Summarizer reads the hidden states to build
# sentence embeddings - confirm against the bert-extractive-summarizer docs.
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=BERT_MODEL_NAME)
custom_model = AutoModel.from_pretrained(BERT_MODEL_NAME, config=custom_config)

# instantiate model
model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)

Downloading:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/224k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

In [23]:
# Summarise the raw (not preprocessed) sample article with the BERT summarizer.
summary = model(sample_text)
summary

'Bisnis.com , JAKARTA - Emiten barang konsumen PT Unilever Indonesia Tbk. Dividen interim itu akan diambil dari laba bersih perseroan untuk periode yang berakhir pada 30 Juni 2021. Berikut jadwal pelaksanaan dividen interim UNVR: Berdasarkan laporan keuangan per 30 Juni 2021, UNVR membukukan pendapatan senilai Rp20,17 triliun atau turun 7,32 persen dibandingkan periode yang sama tahun sebelumnya Rp21,77 triliun.'

## Word2Vec Extractive Summarizer

- Define the `Embedder` class, which turns a sentence into a single embedding by averaging the Word2Vec vectors of its tokens.

In [11]:
@dataclass(frozen=True)
class Embedder:
    """Build a sentence embedding out of Word2Vec token vectors.

    Steps:
    - look up the vector of every word token of the sentence;
    - average the vectors that were found to obtain the sentence embedding.
    Arg:
    - model: a gensim Word2Vec model
    """

    model: Word2Vec

######################

    def __get_vector(self, token: str) -> np.ndarray:
        """return the model vector of `token`, or False when the lookup
        raises KeyError (token unknown to the model)"""
        try:
            vector = self.model.wv.get_vector(token)
        except KeyError:
            return False
        return vector

######################

    def __averaging(self, token_matrix: np.ndarray) -> np.ndarray:
        """mean over the first axis, i.e. across the stacked token vectors"""
        return np.mean(token_matrix, axis=0)

######################

    def embed(self, sentence: str, return_oov: bool=False) -> np.ndarray:
        """Compute the embedding of one sentence.

        Args:
        - sentence (str): sentence to embed; it is lower-cased and only its
        `\\w+` tokens are used
        - return_oov (bool): also return the list of OOV
        (out-of-vocabulary) tokens

        Returns:
        The mean token vector, or False when no token of the sentence has a
        vector. With 'return_oov' set to True the result is a tuple of that
        value and the list of OOV tokens.
        """

        found_vectors = []
        missing_tokens = []

        # sort every word token into "has a vector" / "out of vocabulary"
        for token in re.findall(r"\w+", sentence.lower()):
            vector = self.__get_vector(token)
            if isinstance(vector, np.ndarray):
                found_vectors.append(vector)
            else:
                missing_tokens.append(token)

        # False signals that not a single token could be embedded
        if found_vectors:
            embedding = self.__averaging(np.array(found_vectors))
        else:
            embedding = False

        if return_oov:
            return embedding, missing_tokens
        return embedding

- Define the `Clustering` class, which clusters the sentence embeddings with KMeans and picks the sentence closest to each cluster centre.

In [12]:
@dataclass(frozen=True)
class Clustering:
    """This class is used to cluster sentence embeddings in order to execute
    text summarization. The processes implemented are the following:
    - define a KMeans clustering model;
    - train the model;
    - find sentences closest to the cluster's center.
    Args:
    - features (np.ndarray): sentence embeddings
    - random_state (int - optional): random state for random seed
    """

    features: np.ndarray
    random_state: int = 1

######################

    def __define_model(self, k: int) -> None:
        """used to define the KMeans clustering model with `k` clusters"""

        model = KMeans(n_clusters=k, random_state=self.random_state)
        # the dataclass is frozen, so the model is attached via object.__setattr__
        object.__setattr__(self, 'model', model)

######################

    def __find_closest_sents(self, centroids: np.ndarray) -> Dict[int, int]:
        """
        Find, for every centroid, the closest feature not taken yet.
        - centroids: Centroids to find closest.
        - return: dict mapping centroid position to feature index; the index
        is -1 when no unused feature is left for a centroid.
        """

        args = {}
        used_idx = set()

        for j, centroid in enumerate(centroids):
            # infinity (not a large finite sentinel) so that arbitrarily
            # distant features can still be selected
            centroid_min = np.inf
            cur_arg = -1

            for i, feature in enumerate(self.features):
                # each feature may represent one centroid only
                if i in used_idx:
                    continue

                value = np.linalg.norm(feature - centroid)
                if value < centroid_min:
                    cur_arg = i
                    centroid_min = value

            used_idx.add(cur_arg)
            args[j] = cur_arg

        return args

######################

    def cluster(self, ratio: float = 0.2,
                num_sentences: int = None) -> List[int]:
        """
        Clusters sentences based on the ratio.
        - ratio: Ratio to use for clustering.
        - num_sentences: Number of sentences. Overrides ratio.
        return: Sorted sentence indices that qualify for summary; an empty
        list when there are no features or `num_sentences` is 0.
        """

        # nothing to cluster
        n_features = len(self.features)
        if n_features == 0:
            return []

        # set k value; it can never exceed the number of available features
        if num_sentences is not None:
            if num_sentences == 0:
                return []
            k = min(num_sentences, n_features)
        else:
            k = min(max(int(n_features * ratio), 1), n_features)

        # define n train the model
        self.__define_model(k)
        self.model.fit(self.features)

        # find the closest embeddings to the center
        centroids = self.model.cluster_centers_
        cluster_args = self.__find_closest_sents(centroids)

        return sorted(cluster_args.values())

- Define the `Word2VecSummarizer` class, the entry point of the Word2Vec extractive summarizer.

In [13]:
@dataclass(frozen=True)
class Word2VecSummarizer:
    """The main class for Word2Vec Summarizer
    Args:
    - model: A gensim Word2Vec model
    - random_state: state for random seed (optional)
    """
    def __init__(self, model: Word2Vec, random_state: int=1):
        # the dataclass is frozen, so attributes are set via object.__setattr__
        object.__setattr__(self, 'model', model)
        object.__setattr__(self, 'random_state', random_state)

######################

    def __split_sentence(self, text: str) -> List[str]:
        """used to split given text into sentences; sentences shorter than
        5 characters are dropped"""
        sentences = sent_tokenize(text)
        return [sent for sent in sentences if len(sent) >= 5]

######################

    def __set_embedder(self) -> None:
        """used to instantiate Embedder object"""
        embedder = Embedder(self.model)
        object.__setattr__(self, 'embedder', embedder)

######################

    def __set_clusterer(self, features: np.ndarray,
                        random_state: int) -> None:
        """used to instantiate Clustering object"""
        clusterer = Clustering(features, random_state)
        object.__setattr__(self, 'clusterer', clusterer)

######################

    def summarize(self, text: str,
                  use_first: bool = True,
                  num_sentences: int = None,
                  ratio: float = 0.2,
                  return_oov: bool = False
                  ) -> Union[List[str], Tuple[List[str], List[str]]]:
        """
        This method executes the summarization part.

        Args:
        - text (str): text to be processed
        - use_first (bool-default True): indicate if the first sentence of the text used
        - num_sentences (int): whether you'd like to return certain number of summarized sentences (optional)
        - ratio (float-default 0.2): ratio of sentences to use
        - return_oov(bool-default False): indicate if you'd like to return the OOV
        (out-of-vocabulary) tokens

        Returns: list of summary sentences in document order (as a tuple
        together with the OOV list if return_oov set to True)
        """
        list_sentence = self.__split_sentence(text)
        self.__set_embedder()

        # set buffers
        sent_vecs = []
        kept_idx = []  # position in list_sentence of every row of sent_vecs
        oov_list = []

        # loop through each sentence to create each embeddings
        for position, sentence in enumerate(list_sentence):
            if return_oov:
                vec, oov = self.embedder.embed(sentence, return_oov)
                oov_list.extend(oov)
            else:
                vec = self.embedder.embed(sentence, return_oov)

            # sentences made of OOV tokens only have no embedding; remember
            # where the embedded ones came from so that cluster indices can be
            # mapped back to the right sentence
            if isinstance(vec, np.ndarray):
                sent_vecs.append(vec)
                kept_idx.append(position)

        # instantiate clustering & process (skipped if nothing was embedded)
        summary_idx = []
        if sent_vecs:
            self.__set_clusterer(np.array(sent_vecs), self.random_state)
            cluster_idx = self.clusterer.cluster(ratio, num_sentences)
            # translate rows of sent_vecs back to positions in list_sentence
            summary_idx = sorted(kept_idx[idx] for idx in cluster_idx)

        # always lead with the first sentence when requested and available
        if use_first and list_sentence:
            if not summary_idx or summary_idx[0] != 0:
                summary_idx.insert(0, 0)

        sentences = [list_sentence[idx] for idx in summary_idx]

        if return_oov:
            return sentences, oov_list
        return sentences

In [14]:
# Location of the saved gensim Word2Vec model, relative to the working directory.
MODEL_PATH = "models/model_wv"
MODEL_WORD2VEC = Word2Vec.load(MODEL_PATH)

# Summarizer backed by the loaded model (default random_state).
word2vecsum = Word2VecSummarizer(MODEL_WORD2VEC)

In [51]:
# summarize() returns a list of sentences; join them into one summary string.
' '.join(word2vecsum.summarize(sample_text))

'Bisnis.com , JAKARTA - Emiten barang konsumen PT Unilever Indonesia Tbk. Berdasarkan pengumuman perseroan di harian Bisnis Indonesia hari ini, Senin (22/11/2021), emiten dengan kode saham UNVR ini akan membagikan dividen interim senilai total Rp2,51 triliun. Berikut jadwal pelaksanaan dividen interim UNVR: Berdasarkan laporan keuangan per 30 Juni 2021, UNVR membukukan pendapatan senilai Rp20,17 triliun atau turun 7,32 persen dibandingkan periode yang sama tahun sebelumnya Rp21,77 triliun.'

# Evaluation

In [15]:
# 'source' of every test record; passed to the preprocessor as the site name.
list_sitenames = [datum['source'] for datum in data]

In [16]:
# Summarise the first 500 test articles with both summarizers.
eval_articles = X_test[:500]
eval_sites = list_sitenames[:500]

summaries = []
# zip() has no length, so give tqdm the total explicitly to get a real
# progress bar (percentage and ETA) instead of a bare iteration counter.
for article, site in tqdm(zip(eval_articles, eval_sites),
                          total=min(len(eval_articles), len(eval_sites))):
    clean_article = preprocessor(article, site)

    sum_bert = model(clean_article)
    sum_word2vec = ' '.join(word2vecsum.summarize(clean_article))

    # one (BERT summary, Word2Vec summary) pair per article
    summaries.append((sum_bert, sum_word2vec))

500it [00:15, 32.72it/s]


## Evaluate ROUGE

In [17]:
# ROUGE scorer shared by the BERT and Word2Vec evaluations below.
rouge = Rouge()

In [77]:
# Average ROUGE of the BERT summaries (first element of every pair) against
# the first 500 reference summaries.
rouge_scores_bert = rouge.get_scores(
    hyps=[bert_summary for bert_summary, _ in summaries],
    refs=y_test[:500],
    avg=True,
)

In [18]:
# Average ROUGE of the Word2Vec summaries (second element of every pair)
# against the first 500 reference summaries.
rouge_scores_word2vec = rouge.get_scores(
    hyps=[word2vec_summary for _, word2vec_summary in summaries],
    refs=y_test[:500],
    avg=True,
)

In [79]:
# Raw averaged ROUGE scores of the BERT summarizer.
rouge_scores_bert

{'rouge-1': {'f': 0.45927942402628796,
  'p': 0.434197748039035,
  'r': 0.5081185596360001},
 'rouge-2': {'f': 0.31694698210005817,
  'p': 0.30725531612332807,
  'r': 0.3447222880614652},
 'rouge-l': {'f': 0.4461234855141844,
  'p': 0.42201833286251855,
  'r': 0.4932598414287366}}

In [19]:
# Raw averaged ROUGE scores of the Word2Vec summarizer.
rouge_scores_word2vec

{'rouge-1': {'f': 0.4567773819588521,
  'p': 0.43944199116899846,
  'r': 0.5005609963295804},
 'rouge-2': {'f': 0.31206062261051887,
  'p': 0.30998871683687257,
  'r': 0.33539796728475707},
 'rouge-l': {'f': 0.4434959507766516,
  'p': 0.427033276945876,
  'r': 0.4856715493195363}}

In [28]:
# ROUGE metrics for BERT as a table: one column per ROUGE variant, one row per
# f / p / r score.
pd.DataFrame(rouge_scores_bert)

Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.459279,0.316947,0.446123
p,0.434198,0.307255,0.422018
r,0.508119,0.344722,0.49326


In [29]:
# ROUGE metrics for Word2Vec as a table: one column per ROUGE variant, one row
# per f / p / r score.
pd.DataFrame(rouge_scores_word2vec)

Unnamed: 0,rouge-1,rouge-2,rouge-l
r,0.500561,0.335398,0.485672
p,0.439442,0.309989,0.427033
f,0.456777,0.312061,0.443496


## Evaluate Processing Speed

- test processing speed of BERT in CPU environment

In [90]:
%%timeit
# Time preprocessing + BERT summarisation of the first 5 test articles.
for article, site in zip(X_test[:5], list_sitenames[:5]):
    clean_article = preprocessor(article, site)
    sum_bert = model(clean_article)

1 loop, best of 5: 11 s per loop


- test processing speed of BERT in GPU environment

In [24]:
%%timeit
# Same workload as the CPU timing cell (first 5 test articles, BERT); only the
# runtime environment differs.
for article, site in zip(X_test[:5], list_sitenames[:5]):
    clean_article = preprocessor(article, site)
    sum_bert = model(clean_article)

1 loop, best of 5: 1.63 s per loop


- test processing speed of Word2Vec in CPU environment

In [91]:
%%timeit
# Time preprocessing + Word2Vec summarisation of the first 5 test articles.
for article, site in zip(X_test[:5], list_sitenames[:5]):
    clean_article = preprocessor(article, site)
    sum_word2vec = ' '.join(word2vecsum.summarize(clean_article))

10 loops, best of 5: 84.4 ms per loop


In [30]:
# Timing summary. The strings are copied by hand from the %%timeit outputs
# above and are NOT recomputed when the notebook is re-run.
pd.DataFrame({'Type': ['BERT-CPU', 'BERT-GPU', 'Word2Vec'],
              'Result': ['1 loop, best of 5: 11 s per loop',
                         '1 loop, best of 5: 1.63 s per loop',
                         '10 loops, best of 5: 84.4 ms per loop']})

Unnamed: 0,Type,Result
0,BERT-CPU,"1 loop, best of 5: 11 s per loop"
1,BERT-GPU,"1 loop, best of 5: 1.63 s per loop"
2,Word2Vec,"10 loops, best of 5: 84.4 ms per loop"


In [31]:
# Number of records in the loaded test split (the evaluation above uses only
# the first 500 of them).
len(data)

3762