In [89]:
## LexRankSummarizer

from sumy.summarizers.lex_rank import LexRankSummarizer #We're choosing Lexrank, other algorithms are also built in
from sumy.parsers.plaintext import PlaintextParser #We're choosing a plaintext parser here, other parsers available for HTML etc.
from sumy.nlp.tokenizers import Tokenizer 

text = (
    "Thomas A. Anderson is a man living two lives. By day he is an "
    "average computer programmer and by night a hacker known as "
    "Neo. Neo has always questioned his reality, but the truth is "
    "far beyond his imagination. Neo finds himself targeted by the "
    "police when he is contacted by Morpheus, a legendary computer "
    "hacker branded a terrorist by the government. Morpheus awakens "
    "Neo to the real world, a ravaged wasteland where most of "
    "humanity have been captured by a race of machines that live "
    "off of the humans' body heat and electrochemical energy and "
    "who imprison their minds within an artificial reality known as "
    "the Matrix. As a rebel against the machines, Neo must return to "
    "the Matrix and confront the agents: super-powerful computer "
    "programs devoted to snuffing out Neo and the entire human "
    "rebellion. "
)

#file = "plain_text.txt" #name of the plain-text file
#parser = PlaintextParser.from_file(file, Tokenizer("english"))

# def LexRank(text, max):
#     parser = PlaintextParser.from_string(text, Tokenizer("english"))
#     summarizer = LexRankSummarizer()
#     summary = summarizer(parser.document, 1) #Summarize the document with 5 sentences
#     #for sentence in summary:
#     #    result = sentence
#     #    #print (sentence)
#     result = str(summary[0]).split()
#     result = result[0:max]
#     result = ' '.join(result)
#     return (result)

def LexRank(text, max):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, 1)
    result = str(summary[0]).split()
    result = result[0:max]
    result = ' '.join(result)
    return (result)

print(LexRank(text,10))

Thomas A. Anderson is a man living two lives.


In [92]:
## Luhn Summarizer

from sumy.summarizers.luhn import LuhnSummarizer
summarizer_1 = LuhnSummarizer()
summary_1 =summarizer_1(parser.document,1)

for sentence in summary_1:
    print(sentence)

def Luhn(text, max):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer_1 = LuhnSummarizer()
    summary = summarizer_1(parser.document, 1)
    result = str(summary[0]).split()
    result = result[0:max]
    result = ' '.join(result)
    return (result)

print(Luhn(text,10))

By day he is an average computer programmer and by night a hacker known as Neo.
By day he is an average computer programmer and by


In [42]:
## LSA Summarizer

from sumy.summarizers.lsa import LsaSummarizer
summarizer_2 = LsaSummarizer()
summary_2 =summarizer_2(parser.document,1)

for sentence in summary_2:
    print(sentence)

Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.


In [43]:
## TextRank Summarizer

from sumy.summarizers.text_rank import TextRankSummarizer
summarizer_3 = TextRankSummarizer()
summary_3 =summarizer_3(parser.document,1)
for sentence in summary_3:
    print(sentence)

Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.


In [93]:
## Google T5 Summarizer

import torch
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)

inputs = tokenizer.encode("summarize: " + text,return_tensors='pt',max_length=512,truncation=True)

summary_ids = model.generate(inputs, max_length=10)

summary = tokenizer.decode(summary_ids[0])
print(summary)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


<pad> computer programmer and hacker is awaken


In [28]:
## BERT Summarizer

from summarizer import Summarizer

text = (
    "Thomas A. Anderson is a man living two lives. By day he is an "
    "average computer programmer and by night a hacker known as "
    "Neo. Neo has always questioned his reality, but the truth is "
    "far beyond his imagination. Neo finds himself targeted by the "
    "police when he is contacted by Morpheus, a legendary computer "
    "hacker branded a terrorist by the government. Morpheus awakens "
    "Neo to the real world, a ravaged wasteland where most of "
    "humanity have been captured by a race of machines that live "
    "off of the humans' body heat and electrochemical energy and "
    "who imprison their minds within an artificial reality known as "
    "the Matrix. As a rebel against the machines, Neo must return to "
    "the Matrix and confront the agents: super-powerful computer "
    "programs devoted to snuffing out Neo and the entire human "
    "rebellion. "
)

model = Summarizer()

result = model(text, num_sentences=2)
print(result)




Thomas A. Anderson is a man living two lives. As a rebel against the machines, Neo must return to the Matrix and confront the agents: super-powerful computer programs devoted to snuffing out Neo and the entire human rebellion.


In [31]:
## SBERT Summarizer

from summarizer.sbert import SBertSummarizer

text = (
    "Thomas A. Anderson is a man living two lives. By day he is an "
    "average computer programmer and by night a hacker known as "
    "Neo. Neo has always questioned his reality, but the truth is "
    "far beyond his imagination. Neo finds himself targeted by the "
    "police when he is contacted by Morpheus, a legendary computer "
    "hacker branded a terrorist by the government. Morpheus awakens "
    "Neo to the real world, a ravaged wasteland where most of "
    "humanity have been captured by a race of machines that live "
    "off of the humans' body heat and electrochemical energy and "
    "who imprison their minds within an artificial reality known as "
    "the Matrix. As a rebel against the machines, Neo must return to "
    "the Matrix and confront the agents: super-powerful computer "
    "programs devoted to snuffing out Neo and the entire human "
    "rebellion. "
)

model = SBertSummarizer('paraphrase-MiniLM-L6-v2')
result = model(text, num_sentences=1)
print(result)

Thomas A. Anderson is a man living two lives.


In [32]:
## Calculating Elbow

from summarizer import Summarizer

text = (
    "Thomas A. Anderson is a man living two lives. By day he is an "
    "average computer programmer and by night a hacker known as "
    "Neo. Neo has always questioned his reality, but the truth is "
    "far beyond his imagination. Neo finds himself targeted by the "
    "police when he is contacted by Morpheus, a legendary computer "
    "hacker branded a terrorist by the government. Morpheus awakens "
    "Neo to the real world, a ravaged wasteland where most of "
    "humanity have been captured by a race of machines that live "
    "off of the humans' body heat and electrochemical energy and "
    "who imprison their minds within an artificial reality known as "
    "the Matrix. As a rebel against the machines, Neo must return to "
    "the Matrix and confront the agents: super-powerful computer "
    "programs devoted to snuffing out Neo and the entire human "
    "rebellion. "
)
model = Summarizer()
res = model.calculate_elbow(text, k_max=10)
print(res)



[481.9498291015625, 295.1265563964844, 202.5259246826172, 124.87324523925781, 51.752540588378906]


In [33]:
## Calculate optimal number of sentences

from summarizer import Summarizer

text = (
    "Thomas A. Anderson is a man living two lives. By day he is an "
    "average computer programmer and by night a hacker known as "
    "Neo. Neo has always questioned his reality, but the truth is "
    "far beyond his imagination. Neo finds himself targeted by the "
    "police when he is contacted by Morpheus, a legendary computer "
    "hacker branded a terrorist by the government. Morpheus awakens "
    "Neo to the real world, a ravaged wasteland where most of "
    "humanity have been captured by a race of machines that live "
    "off of the humans' body heat and electrochemical energy and "
    "who imprison their minds within an artificial reality known as "
    "the Matrix. As a rebel against the machines, Neo must return to "
    "the Matrix and confront the agents: super-powerful computer "
    "programs devoted to snuffing out Neo and the entire human "
    "rebellion. "
)
model = Summarizer()
res = model.calculate_optimal_k(text, k_max=10)
print(res)



3


In [98]:
## LexRankSummarizer

from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.tokenizers import Tokenizer 

from transformers import AutoTokenizer, AutoModelWithLMHead
from summarizer import Summarizer
import torch

def LexRank(text, max):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, 1)
    result = str(summary[0]).split()
    result = result[0:max]
    result = ' '.join(result)
    return (result)

def Luhn(text, max):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, 1)
    result = str(summary[0]).split()
    result = result[0:max]
    result = ' '.join(result)
    return (result)

def LSA(text, max):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, 1)
    result = str(summary[0]).split()
    result = result[0:max]
    result = ' '.join(result)
    return (result)

def TextRank(text, max):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, 1)
    result = str(summary[0]).split()
    result = result[0:max]
    result = ' '.join(result)
    return (result)

def T5(text, max):
    tokenizer = AutoTokenizer.from_pretrained('t5-base')
    model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)

    inputs = tokenizer.encode("summarize: " + text,return_tensors='pt',max_length=512,truncation=True)
    summary_ids = model.generate(text, max_length=max)
    summary = tokenizer.decode(summary_ids[0])
    return (summary)

def Bert(text, max):
    model = Summarizer()
    result = model(text, num_sentences=1)
    print(result)

def SBert(text, max):
    ####

IndentationError: expected an indented block after function definition on line 58 (2274062423.py, line 61)

In [90]:
## Tests summarizers against the ground truth

import gensim.corpora as corpora
import pandas as pd
import gensim

csv = pd.read_csv("ground_truths/bgl_lines.txt_structured.csv")
content = csv["EventTemplate"]
num_topics = 10
dataset = "bgl"
line_file = []
line_set = []

# Converts sentences to words
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

for idx, line in enumerate(content):
    line_set.append(line + '\n')

    if (idx % 20 == 19):
        summary = LexRank(line_set,10)
        print(summary)

        #Appends summary to general line file
        for num in range(20):
            line_file.append(summary)

## Writes external file with created topics
with open ("ground_truths/" + dataset + "__lexrank.txt", "w") as f:
     for line in line_file:
          f.write(f"{line}\n")

['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FATAL ciod : Error creating node map from
['RAS APP FA