In [None]:
import os
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.random import RandomSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.utils import get_stop_words
from sumy.nlp.stemmers import Stemmer

from helpers import *

In [None]:
# Number of sentences requested from a summarizer for each law summary
# (passed as the second argument of the summarizer call in summarize_extractively).
SENTENCES_COUNT = 7

In [None]:
# Language names; intended as the `language` argument of
# execute_extractive_summarization.
ENGLISH = "english"
UKRAINIAN = "ukrainian"

In [None]:
def summarize_extractively(summarizer, tokenizer, encoding="utf-8"):
    """Summarize every stored law with ``summarizer`` and save one file per law.

    Paragraphs are read from
    ``./laws_with_abstracts/<language>/laws/<law index>/<paragraph index>.txt``
    and each summary is written to
    ``./laws_with_abstracts/summarized/<language>/extractive/<SummarizerClass>/<index>.txt``,
    where ``<language>`` is ``tokenizer.language``.

    Args:
        summarizer: Callable invoked as ``summarizer(document, SENTENCES_COUNT)``
            that yields the selected sentences. Its class name is used as the
            name of the output folder.
        tokenizer: Tokenizer handed to ``PlaintextParser.from_string``. Its
            ``language`` attribute selects the input and output folders.
        encoding: Text encoding used both for reading paragraphs and for
            writing summaries. Defaults to UTF-8 so that non-ASCII text
            (e.g. Cyrillic) is handled identically on every platform instead
            of depending on the locale's default encoding. Pass another value
            (or ``None`` for the platform default) if the corpus is stored
            differently.
    """
    storage_folder = os.path.join(os.path.curdir, "laws_with_abstracts")
    summarized_folder = os.path.join(storage_folder, "summarized")

    # The output hierarchy is created one level at a time; whether the helper
    # also creates missing parents is not visible from here.
    language_folder = os.path.join(summarized_folder, tokenizer.language)
    create_folder_if_not_exists(language_folder)

    approach_folder = os.path.join(language_folder, "extractive")
    create_folder_if_not_exists(approach_folder)

    method_folder = os.path.join(approach_folder, summarizer.__class__.__name__)
    create_folder_if_not_exists(method_folder)

    laws_folder = os.path.join(storage_folder, tokenizer.language, "laws")
    law_count = len(os.listdir(laws_folder))

    for law_index in range(law_count):
        # Queried on every pass, exactly as before: presumably each summary
        # written below advances the value — TODO confirm against `helpers`.
        next_index = get_next_index(method_folder, split_by_dot=True)

        # Skip laws that lie before the next index reported for this method.
        if int(next_index) > law_index:
            continue

        law_folder = os.path.join(laws_folder, str(law_index))
        paragraph_count = len(os.listdir(law_folder))
        summarized_law_path = os.path.join(method_folder, f"{next_index}.txt")

        # Paragraph files are addressed by position (0.txt, 1.txt, ...), not
        # by the names returned from the directory listing. A dedicated loop
        # variable keeps the outer law index intact.
        paragraphs = []
        for paragraph_index in range(paragraph_count):
            paragraph_path = os.path.join(law_folder, f"{paragraph_index}.txt")
            with open(paragraph_path, 'r', encoding=encoding) as paragraph_file:
                paragraphs.append(paragraph_file.read())

        parser = PlaintextParser.from_string(" ".join(paragraphs), tokenizer)
        selected_sentences = summarizer(parser.document, SENTENCES_COUNT)
        summarized_text = ' '.join(str(sentence) for sentence in selected_sentences)

        with open(summarized_law_path, 'w', encoding=encoding) as summary_file:
            summary_file.write(summarized_text)

In [None]:
def execute_extractive_summarization(language):
    """Run every configured sumy summarizer over the stored laws of ``language``.

    A stemmer and a tokenizer are built for ``language``. Each summarizer is
    then given that language's stop words and handed, together with the
    tokenizer, to ``summarize_extractively``.

    Args:
        language: Language name forwarded to ``Stemmer``, ``Tokenizer`` and
            ``get_stop_words``.
    """
    language_stemmer = Stemmer(language)
    language_tokenizer = Tokenizer(language)

    # The Edmundson summarizer additionally gets its three word lists assigned;
    # 'foo' is the single entry used for each of them.
    edmundson = EdmundsonSummarizer(language_stemmer)
    for word_list_name in ("bonus_words", "stigma_words", "null_words"):
        setattr(edmundson, word_list_name, ('foo',))

    methods = [
        LuhnSummarizer(language_stemmer),
        # NOTE(review): the only summarizer constructed without the stemmer —
        # confirm whether that is intentional.
        SumBasicSummarizer(),
        RandomSummarizer(language_stemmer),
        ReductionSummarizer(language_stemmer),
        KLSummarizer(language_stemmer),
        LexRankSummarizer(language_stemmer),
        TextRankSummarizer(language_stemmer),
        LsaSummarizer(language_stemmer),
        edmundson,
    ]

    for method in methods:
        method.stop_words = get_stop_words(language)
        summarize_extractively(method, language_tokenizer)

In [None]:
# Entry points: uncomment the call for the language whose laws should be summarized.
# execute_extractive_summarization(UKRAINIAN)
# execute_extractive_summarization(ENGLISH)