# Extractive Summarization

Attempts to summarize articles by selecting the subset of sentences that retains the most important points. Each sentence is given an importance weight, and the highest-weighted sentences are used to form the summary.

In [1]:
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from gensim.models import CoherenceModel

import preprocess
import os
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.cluster.util import cosine_distance

import numpy as np
import networkx as nx

##  Extract Data

In [2]:
# Define the directory where the books are located
# 5827, 31671
books_directory = "Data/Test/Test_Chapters/31671"

# Parallel lists: book_texts[i] is the full text of the chapter named chapters_name[i]
book_texts = []
chapters_name = []

# os.listdir() returns entries in arbitrary, platform-dependent order. Sort them so that
# positional indexes into these lists (e.g. selected_chap) pick the same chapter on every run.
for filename in sorted(os.listdir(books_directory)):
    # splitext only strips the trailing extension; str.replace would also remove
    # a '.txt' that happens to occur in the middle of a name
    stem, extension = os.path.splitext(filename)
    if extension != '.txt':
        continue
    # errors='ignore' silently drops bytes that are not valid UTF-8
    with open(os.path.join(books_directory, filename), "r", encoding="utf8", errors='ignore') as file:
        book_text = file.read()
    book_texts.append(book_text)
    chapters_name.append(stem)

In [3]:
# Chapter names in load order; book_texts holds the matching text at the same position
chapters_name

['CHAPTER_III_',
 'CHAPTER_II_',
 'CHAPTER_IV_',
 'CHAPTER_IX_',
 'CHAPTER_I_',
 'CHAPTER_VIII_',
 'CHAPTER_VII_',
 'CHAPTER_VI_',
 'CHAPTER_V_']

In [4]:
# Position (shared by chapters_name and book_texts) of the chapter to summarize
selected_chap = 4
# Split the chosen chapter's raw text into sentences
sentences = sent_tokenize(book_texts[selected_chap])

## Model 1 - Using LDA Model Topic Distribution

In [5]:
wnl = WordNetLemmatizer()

# Tokenize every sentence and lower-case each token
lower = []
for sen in sentences:
    lower.append([w.lower() for w in nltk.word_tokenize(sen)])

# Keep only tokens consisting purely of the letters a-z (drops punctuation, digits, hyphenated forms)
punc = []
for sen in lower:
    punc.append([w for w in sen if re.search('^[a-z]+$', w)])

# POS-tag each sentence and lemmatize the tokens whose tag starts with a/r/n/v,
# passing that first letter to the lemmatizer as the part of speech.
# NOTE(review): the tags come from the 'universal' tagset, so the first-letter test is
# applied to names such as ADJ/ADV/ADP/NOUN/NUM/VERB; the sample output keeps words like
# "of", "at" and "by". Confirm this is the intended mapping before relying on it.
doc_lemmed = []
for sen in punc:
    lemmas = []
    for word, tag in pos_tag(sen, tagset='universal'):
        pos_letter = tag[0].lower()
        if pos_letter in ['a', 'r', 'n', 'v']:
            lemmas.append(wnl.lemmatize(word, pos_letter))
    doc_lemmed.append(lemmas)

In [6]:
# Load the saved gensim dictionary used to turn tokens into integer ids
id2word = corpora.Dictionary.load('Model/finalmodel_Dictionary')

# The chapter is the list of lemmatized sentences
chapter = doc_lemmed

# Vectorise the words: one bag-of-words vector per sentence
new_vecs = list(map(id2word.doc2bow, chapter))
print("Number of sentences: ",len(new_vecs))

Number of sentences:  88


In [7]:
#Find most dominant topic for each of the sentences

import numpy as np
import pandas as pd
import warnings
# Suppress every warning from here on (this also hides deprecation notices)
warnings.filterwarnings('ignore')

# Load the trained LDA topic model from disk
lda_disk = models.LdaModel.load("Model/finalmodel_5Topics")

def format_topics_sentences(ldamodel, corpus, data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(data)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

# Dominant topic, its contribution and its keywords for every sentence of the chapter
df_topic_sents_keywords = format_topics_sentences(lda_disk, new_vecs, chapter)

# Format: expose the row index as a sentence number and give every column a readable name
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Sentence_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

# Sentences with the strongest dominant-topic contribution come first
sorted_df = df_dominant_topic.sort_values("Topic_Perc_Contrib", ascending=False)

In [8]:
# Sentences ordered by the contribution of their dominant topic, highest first
sorted_df

Unnamed: 0,Sentence_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
80,80,4.0,0.6968,"sexual, dream, child, girl, activity, young, s...","[first, vestige, of, head, body, little, late,..."
70,70,3.0,0.5302,"child, soul, plato, justice, reply, evil, woma...","[skin, be, red, characterize, at, period, by, ..."
14,14,2.0,0.5124,"country, price, labour, trade, quantity, land,...","[at, very, early, age, of, reproduction, embry..."
56,56,3.0,0.5056,"child, soul, plato, justice, reply, evil, woma...","[movement, of, foetus, be, by, time, plainly, ..."
31,31,4.0,0.4954,"sexual, dream, child, girl, activity, young, s...","[kidney, now, begin, be, form, little, late, g..."
...,...,...,...,...,...
73,73,0.0,0.2000,"conception, government, sensation, image, phen...","[at, anxious, time, of, parturition, have, arr..."
54,54,0.0,0.2000,"conception, government, sensation, image, phen...","[eye, be, now, close, by, lid, nostril, be, mo..."
21,21,0.0,0.2000,"conception, government, sensation, image, phen...","[slight, depression, represent, neck, enable, ..."
30,30,0.0,0.2000,"conception, government, sensation, image, phen...","[at, about, little, bony, deposit, be, find, i..."


In [9]:
# Number of top-ranked sentences that make up the LDA-based summary
top_n_lda = 4

# Sentence numbers, already ordered by dominant-topic contribution (highest first)
sen_list = sorted_df['Sentence_No'].to_list()

# Look up the original (un-lemmatized) sentences and flatten their newlines so each
# one prints on a single line
result_list = [sentences[i].replace('\n', ' ') for i in sen_list[:top_n_lda]]

# `summary` now holds the summary itself; it used to hold only the ' ' separator,
# and a stray join whose result was thrown away has been removed
summary = ' '.join(result_list)
print("Summarize Text: \n" + summary)

Summarize Text: 
First the vestige of a head and body, a little later the heart and lungs appear lying in the open chest; then the hands are protruded from the sides of the trunk, afterwards the forearms, then the arms, all pushed out from the body; the feet and legs gradually protrude from the lower end of the trunk, and the chest closes up so that the heart and lungs can no longer be seen; the face, mouth and eyes take form, the external genital organs make their appearance in conjunction with other developments, and in due course of time the boy or girl is born ready for further developments in childhood, and adolescence. The skin is red, and characterized at this period by a fine downy covering, over which is spread a quantity of thick viscous matter, called the sebaceous coat, which has been forming since the latter part of the fifth month. At this very early age of reproduction the embryo has all the elements of the future man or woman, mentally and physically, even before any fo

## Model 2 - Using Sentence Similarity

In [10]:
# Flatten the newlines inside each sentence, then split it on single spaces
book_sentences = [sen.replace("\n", " ").split(" ") for sen in sentences]

In [11]:
# Drop the empty strings that runs of consecutive spaces leave behind after split(" ")
new_sentences = [list(filter(None, sentence)) for sentence in book_sentences]
print(new_sentences)

[['CHAPTER', 'I.'], ['Introductory.'], ['In', 'the', 'creation', 'of', 'the', 'world', 'and', 'all', 'that', 'therein', 'is,', 'we', 'should', 'consider', 'it', 'an', 'axiom', 'that', '"Everything', 'was', 'created', 'for', 'use."'], ['All', 'individual', 'substances,', 'or', 'beings,', 'that', 'come', 'to', 'our', 'notice', 'bear', 'certain', 'relations', 'to', 'one', 'another,', 'have', 'connection', 'one', 'with', 'another,', 'and', 'are', 'dependent', 'upon', 'and', 'useful', 'to', 'each', 'other;', 'and', 'nothing', 'could', 'possibly', 'exist', 'or', 'subsist', 'without', 'this', 'co-relation:', 'connection', 'with', 'and', 'use', 'to', 'each', 'other.'], ['This', 'is', 'a', 'law', 'which', 'needs', 'only', 'a', 'little', 'reflection', 'to', 'be', 'accepted', 'as', 'a', 'truth', 'in', 'every', 'particular--in', 'the', 'greatest', 'as', 'well', 'as', 'in', 'the', 'least', 'created', 'form.'], ['This', 'is', 'more', 'plainly', 'seen', 'in', 'the', 'animal', 'kingdom', 'than', 'in',

In [12]:
def sentence_similarity(sent1, sent2, stopwords=None):
    """Cosine similarity between the word-count vectors of two tokenized sentences.

    Tokens are compared case-insensitively, and tokens found in ``stopwords`` are
    left out of the counts.

    Parameters
    ----------
    sent1, sent2 : iterable of str
        The tokens of each sentence.
    stopwords : iterable of str, optional
        Lower-case tokens to ignore. ``None`` ignores nothing.

    Returns
    -------
    float
        Similarity in ``[0, 1]``. ``0.0`` when either sentence has no countable
        token (empty, or made up of stopwords only).
    """
    # A set gives constant-time membership tests however long the stopword list is
    ignored = set(stopwords) if stopwords else set()

    def term_counts(tokens):
        # token -> number of occurrences, after lower-casing and stopword removal
        counts = {}
        for token in tokens:
            token = token.lower()
            if token not in ignored:
                counts[token] = counts.get(token, 0) + 1
        return counts

    counts1 = term_counts(sent1)
    counts2 = term_counts(sent2)

    # A zero vector has no direction; return 0 up front rather than dividing
    # 0 by 0 and testing the result for NaN
    if not counts1 or not counts2:
        return 0.0

    # Only tokens present in both sentences contribute to the dot product
    dot = sum(n * counts2.get(token, 0) for token, n in counts1.items())
    norm1 = np.sqrt(sum(n * n for n in counts1.values()))
    norm2 = np.sqrt(sum(n * n for n in counts2.values()))
    return float(dot / (norm1 * norm2))

In [13]:
def build_similarity_matrix(sentences, stop_words):
    """Return the square matrix of pairwise similarities between ``sentences``.

    Entry ``[i][j]`` is ``sentence_similarity(sentences[i], sentences[j], stop_words)``
    for ``i != j``; the diagonal is left at zero so that no sentence is compared
    with itself.
    """
    count = len(sentences)
    similarity_matrix = np.zeros((count, count))

    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            if row != col:
                similarity_matrix[row][col] = sentence_similarity(first, second, stop_words)

    return similarity_matrix

In [14]:
# Step 1 - English stopwords to leave out of the similarity computation
stop_words = stopwords.words('english')
summarize_text = []

# Step 2 - Generate the similarity matrix across sentences
sentence_similarity_matrix = build_similarity_matrix(new_sentences, stop_words)

# Step 3 - Rank sentences: treat the matrix as a weighted graph and run PageRank on it
sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)

# Step 4 - Pair every sentence with its score and order the pairs from highest to lowest
ranked_sentence = [(scores[i], s) for i, s in enumerate(new_sentences)]
ranked_sentence.sort(reverse=True)
print("Indexes of top ranked_sentence order are ", ranked_sentence)


Indexes of top ranked_sentence order are  [(0.025169542311981968, ['The', 'length', 'of', 'the', 'embryo', 'is', 'from', 'one', 'inch', 'and', 'a', 'half', 'to', 'two', 'inches,', 'and', 'it', 'weighs', 'from', 'three', 'to', 'five', 'drachms.']), (0.022000891037960976, ['At', '_ten', 'weeks_', 'the', 'embryo', 'is', 'from', 'one', 'and', 'a', 'half', 'to', 'two', 'and', 'a', 'half', 'inches', 'long,', 'and', 'its', 'weight', 'is', 'from', 'one', 'ounce', 'to', 'an', 'ounce', 'and', 'a', 'half,', 'the', 'eyelids', 'are', 'more', 'developed', 'and', 'descend', 'in', 'front', 'of', 'the', 'eyes;', 'the', 'mouth', 'begins', 'to', 'be', 'closed', 'by', 'the', 'development', 'of', 'the', 'lips.']), (0.02174196368552055, ['First', 'the', 'vestige', 'of', 'a', 'head', 'and', 'body,', 'a', 'little', 'later', 'the', 'heart', 'and', 'lungs', 'appear', 'lying', 'in', 'the', 'open', 'chest;', 'then', 'the', 'hands', 'are', 'protruded', 'from', 'the', 'sides', 'of', 'the', 'trunk,', 'afterwards', '

In [15]:
# number of sentences to combine
top_n_ranked = 3

# Slicing instead of indexing 0..2 tolerates chapters with fewer sentences than requested
summarize_text = [" ".join(words) for _score, words in ranked_sentence[:top_n_ranked]]

# Step 5 - Output the summarized text.
# Add a full stop only where a sentence does not already end in terminal punctuation
# (optionally followed by a closing quote or bracket). Joining with ". " and collapsing
# ".." afterwards also shortened genuine ellipses and produced "?." after questions.
closed_sentences = [
    text if text.rstrip('"\')]').endswith(('.', '!', '?')) else text + '.'
    for text in summarize_text
]
print("Summarize Text: \n", " ".join(closed_sentences))

Summarize Text: 
 The length of the embryo is from one inch and a half to two inches, and it weighs from three to five drachms. At _ten weeks_ the embryo is from one and a half to two and a half inches long, and its weight is from one ounce to an ounce and a half, the eyelids are more developed and descend in front of the eyes; the mouth begins to be closed by the development of the lips. First the vestige of a head and body, a little later the heart and lungs appear lying in the open chest; then the hands are protruded from the sides of the trunk, afterwards the forearms, then the arms, all pushed out from the body; the feet and legs gradually protrude from the lower end of the trunk, and the chest closes up so that the heart and lungs can no longer be seen; the face, mouth and eyes take form, the external genital organs make their appearance in conjunction with other developments, and in due course of time the boy or girl is born ready for further developments in childhood, and adole

## Model 3 - Google's T5

In [16]:
# conda install -c pytorch pytorch
# !pip install transformers datasets

In [17]:
import torch
import transformers
from transformers import AutoTokenizer,AutoModelWithLMHead

In [18]:
# Instantiate the tokenizer and the model.
# AutoModelWithLMHead is deprecated in transformers; AutoModelForSeq2SeqLM is the
# auto class for encoder-decoder checkpoints such as T5. Imported here so this cell
# works on its own.
from transformers import AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base', return_dict=True)

In [19]:
# T5 needs the "summarize: " prefix in front of the input text to know which task to perform.
# truncation=True with max_length=512 keeps only the first 512 tokens, so the summary is
# based on the opening part of the chapter only.
inputs = tokenizer.encode("summarize: " + book_texts[selected_chap],
                          return_tensors='pt',
                          max_length=512,
                          truncation=True)

# Summarise the tokenised data using T5 via model.generate.
# num_beams: number of candidate sequences kept at each generation step; more beams
# explore more candidates at the expense of computation time.
summary_ids = model.generate(inputs, max_length=200, min_length=80, length_penalty=5., num_beams=2)

# Decode the summary tokens back into text. skip_special_tokens=True removes control
# tokens such as <pad> and </s>, which otherwise show up in the printed summary.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)

<pad> sally kohn: in the creation of the world, we should consider it an axiom that "Everything was created for use" kohn: nothing could possibly exist or subsist without this co-relation. kohn: man and woman were and are still created to associate in pairs. kohn: the greatest happiness to the human race will be found in living a life in accordance.</s>
