In [5]:
import re  # regular expressions
import nltk  # Natural Language Toolkit
# Download the tokenizer data and the stop-word lists that the NLTK calls in
# this notebook rely on (no-ops when the packages are already up to date).
nltk.download('punkt')
nltk.download('stopwords')
import string
import numpy as np
import heapq
import networkx as nx
from nltk.cluster.util import cosine_distance
from goose3 import Goose

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def preprocess(text):
    """Lower-case ``text`` and remove English stop words and punctuation tokens.

    Args:
        text: Raw text (a single sentence or a whole document).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.
    """
    # A set gives O(1) membership tests; the list returned by NLTK would be
    # scanned linearly for every token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    punctuation = set(string.punctuation)
    tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        # Drop tokens made up solely of punctuation characters. A plain
        # substring test against string.punctuation lets multi-character
        # tokens such as "``", "''" or "..." through, and those then get
        # counted as frequent "words" by the summarisers.
        if token not in stopwords and not all(ch in punctuation for ch in token)
    ]
    return ' '.join(tokens)

Frequency Algorithm

In [7]:
def summarize_freq(original_text,ratio):
    """Summarise text by scoring each sentence on normalised word frequency.

    Word counts are taken from the preprocessed text and divided by the
    highest count. A sentence's score is the sum of the normalised
    frequencies of its (lower-cased) tokens.

    Args:
        original_text: The raw text to summarise.
        ratio: Fraction of the sentences to keep; the count is
            ``int(len(sentences) * ratio)``.

    Returns:
        A tuple ``(sentence_list, best_sentences, score_sentence)``: every
        sentence of the text, the highest-scoring sentences (best first) and
        a dict mapping sentence -> score. Identical sentences share one dict
        entry.
    """
    sentence_list = nltk.sent_tokenize(original_text)
    word_frequency = nltk.FreqDist(nltk.word_tokenize(preprocess(original_text)))

    # Empty or stop-word-only text leaves no words to count; max() on an
    # empty sequence would raise ValueError, so normalisation is skipped.
    max_freq = max(word_frequency.values(), default=0)
    if max_freq:
        for word in word_frequency:
            word_frequency[word] /= max_freq

    score_sentence = {}
    for sentence in sentence_list:
        score_sentence[sentence] = sum(
            word_frequency[word]
            for word in nltk.word_tokenize(sentence.lower())
            if word in word_frequency
        )

    best_sentences = heapq.nlargest(
        int(len(sentence_list) * ratio), score_sentence, key=score_sentence.get
    )
    return sentence_list,best_sentences,score_sentence

Luhn Algorithm

In [8]:
def calculate_sentences_score(sentences,important_words,distance):
    """Score sentences by their densest cluster of important words (Luhn).

    For each sentence, the position of the first occurrence of every
    important word is collected. Neighbouring positions whose gap is smaller
    than ``distance`` form a cluster, scored as
    ``(important words in cluster) ** 2 / (cluster span in tokens)``.
    The sentence's score is its best cluster score.

    Args:
        sentences: Preprocessed sentence strings.
        important_words: Words regarded as significant.
        distance: A gap of this many token positions or more starts a new
            cluster.

    Returns:
        A list of ``(score, sentence_index)`` tuples, where ``sentence_index``
        is the position of the sentence in ``sentences``. Sentences that
        contain no important word are omitted.
    """
    scores = []
    # enumerate() ties the index to the sentence's real position. A manual
    # counter that is not advanced for skipped sentences would shift every
    # later index and make callers pick the wrong sentences.
    for sentence_index, sentence in enumerate(sentences):
        tokens = nltk.word_tokenize(sentence)
        word_index = sorted(
            tokens.index(word) for word in important_words if word in tokens
        )
        if not word_index:
            continue

        # Split the sorted positions into clusters of nearby important words.
        groups_list = [[word_index[0]]]
        for previous, current in zip(word_index, word_index[1:]):
            if current - previous < distance:
                groups_list[-1].append(current)
            else:
                groups_list.append([current])

        max_group_score = max(
            len(group) ** 2 / (group[-1] - group[0] + 1) for group in groups_list
        )
        scores.append((max_group_score, sentence_index))
    return scores

In [9]:
def summarize_luhn(text,distance,top_n_words,number_of_sentences,ratio):
    """Summarise ``text`` by ranking sentences with ``calculate_sentences_score``.

    Args:
        text: The raw text to summarise.
        distance: Cluster gap forwarded to ``calculate_sentences_score``.
        top_n_words: How many of the most frequent preprocessed words count
            as important.
        number_of_sentences: Summary length, used only when ``ratio`` is not
            positive.
        ratio: When positive, the summary holds
            ``int(len(sentences) * ratio)`` sentences.

    Returns:
        A tuple ``(original_sentences, best_sentences, sentences_score)``:
        all sentences of the text, the selected sentences (best first) and
        the raw ``(score, index)`` list.
    """
    original_sentences = list(nltk.sent_tokenize(text))
    formatted_sentences = list(map(preprocess, original_sentences))

    # Flatten the preprocessed sentences into one token stream for counting.
    words = []
    for cleaned in formatted_sentences:
        words.extend(nltk.word_tokenize(cleaned))
    freq_words = nltk.FreqDist(words)
    important_words = [word for word, _count in freq_words.most_common(top_n_words)]

    sentences_score = calculate_sentences_score(formatted_sentences, important_words, distance)

    how_many = int(len(formatted_sentences) * ratio) if ratio > 0 else number_of_sentences
    ranked = heapq.nlargest(how_many, sentences_score)
    best_sentences = [original_sentences[index] for _score, index in ranked]
    return original_sentences, best_sentences, sentences_score

Cosine Similarity

In [10]:
def calculate_sentence_similarity(sentence1, sentence2):
    """Compare two sentences through their bag-of-words count vectors.

    Args:
        sentence1: First sentence (already preprocessed by the callers here).
        sentence2: Second sentence.

    Returns:
        ``1 - cosine_distance(vector1, vector2)`` for the word-count vectors
        of the two sentences, or ``0.0`` when either sentence has no tokens.
    """
    words1 = nltk.word_tokenize(sentence1)
    words2 = nltk.word_tokenize(sentence2)
    if not words1 or not words2:
        # An empty sentence yields a zero vector, for which the cosine is
        # undefined (NaN or a division error); treat it as "no similarity".
        return 0.0

    # word -> vector position. The dict lookup replaces a linear list.index()
    # scan per word, and dict.fromkeys keeps the vocabulary order deterministic.
    position = {word: i for i, word in enumerate(dict.fromkeys(words1 + words2))}
    vector1 = [0] * len(position)
    vector2 = [0] * len(position)
    for word in words1:  # bag of words
        vector1[position[word]] += 1
    for word in words2:
        vector2[position[word]] += 1
    return 1 - cosine_distance(vector1, vector2)

In [11]:
def calculate_similarity_matrix(sentences):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: Sentence strings to compare with each other.

    Returns:
        A square NumPy array of shape ``(len(sentences), len(sentences))``
        whose entry ``[i][j]`` is
        ``calculate_sentence_similarity(sentences[i], sentences[j])``; the
        diagonal stays 0.
    """
    size = len(sentences)
    similarity_matrix = np.zeros((size, size))
    for i in range(size):
        # Each unordered pair is computed once and mirrored: this halves the
        # number of comparisons and keeps the matrix exactly symmetric.
        for j in range(i + 1, size):
            similarity = calculate_sentence_similarity(sentences[i], sentences[j])
            similarity_matrix[i][j] = similarity
            similarity_matrix[j][i] = similarity
    return similarity_matrix

In [12]:
def summarize_cosine(text, number_of_sentences, percentage = 0):
    """Summarise ``text`` by ranking sentences with PageRank over their similarities.

    Args:
        text: The raw text to summarise.
        number_of_sentences: Summary length, used when ``percentage`` is not
            positive. Values larger than the number of sentences return
            every sentence.
        percentage: When positive, overrides ``number_of_sentences`` with
            ``int(len(sentences) * percentage)``.

    Returns:
        A tuple ``(original_sentences, best_sentences, ordered_scores)``:
        all sentences of the text, the selected sentences (best first) and
        the full ``(score, sentence)`` list sorted by descending score.
    """
    original_sentences = list(nltk.sent_tokenize(text))
    formatted_sentences = [preprocess(sentence) for sentence in original_sentences]

    similarity_matrix = calculate_similarity_matrix(formatted_sentences)
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)

    ordered_scores = sorted(
        ((scores[i], sentence) for i, sentence in enumerate(original_sentences)),
        reverse=True,
    )
    if percentage > 0:
        number_of_sentences = int(len(formatted_sentences) * percentage)

    # Slicing clamps the request to the available sentences; indexing
    # position by position raised IndexError for short texts.
    selected = ordered_scores[:max(number_of_sentences, 0)]
    best_sentences = [sentence for _score, sentence in selected]
    return original_sentences, best_sentences, ordered_scores

Visualise

In [26]:
def visualse(title, sentence_list,best_sentences):
    """Render a summary as HTML in the notebook output.

    Shows a ``Summary - <title>`` heading followed by the sentences of
    ``sentence_list`` that also appear in ``best_sentences``, kept in
    document order and separated by spaces.

    Args:
        title: Heading text (e.g. the article title).
        sentence_list: All sentences of the document, in original order.
        best_sentences: The sentences selected for the summary.
    """
    from html import escape
    from IPython.display import HTML, display

    # Title and sentences come from a scraped web page: escape them so that
    # stray markup is shown literally instead of being interpreted.
    display(HTML(f'<h1>Summary - {escape(str(title))}</h1>'))

    selected = set(best_sentences)
    # Join with spaces; plain concatenation glued sentences together.
    text = ' '.join(escape(sentence) for sentence in sentence_list if sentence in selected)
    display(HTML(text))

In [None]:
def open_link(url):
    """Download the page at ``url`` with Goose and return its cleaned article text.

    Args:
        url: Address of the article to extract.

    Returns:
        The ``cleaned_text`` attribute of the extracted article.
    """
    # The context manager closes the extractor once the article has been
    # read, instead of leaving it open after the call.
    with Goose() as extractor:
        return extractor.extract(url).cleaned_text

In [14]:
# Shared Goose extractor, reused by the cells below to fetch articles.
g = Goose()

In [15]:
url = 'https://en.wikipedia.org/wiki/Malware'  # article to summarise
ratio = 0.20  # fraction of sentences kept by summarize_freq

In [27]:
# Frequency-based summary of the article at `url`.
article = g.extract(url)
sentence_list, best_sentences, score_sentence = summarize_freq(article.cleaned_text, ratio)
visualse(article.title, sentence_list, best_sentences)

In [28]:
# Parameters for the Luhn summariser run in the next cell.
top_n_words = 10  # how many of the most frequent words count as important
distance = 4  # position gap at which two important words start a new cluster
number_of_sentences = 5  # summary length, used by summarize_luhn only when ratio <= 0
ratio = 0.3  # fraction of sentences to keep; takes precedence when > 0
url = 'https://en.wikipedia.org/wiki/Malware'

In [29]:
# Luhn summary of the article at `url`.
article = g.extract(url)
sentence_list, best_sentences, score_sentence = summarize_luhn(
    article.cleaned_text, distance, top_n_words, number_of_sentences, ratio
)
visualse(article.title, sentence_list, best_sentences)

In [30]:
# Cosine-similarity / PageRank summary of the article at `url`.
article = g.extract(url)
# The sentence count (120) is ignored here because a positive percentage
# takes precedence inside summarize_cosine.
sentence_list, best_sentences, score_ = summarize_cosine(article.cleaned_text, 120, percentage=0.2)
visualse(article.title, sentence_list, best_sentences)