# Language Summarization 

# Step 1: Importing libraries and preprocessing data.

In [1]:
import nltk
import re
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer   
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Uploading the data file

from google.colab import files
uploaded = files.upload()

Saving data.txt to data.txt


In [3]:
# Tokenization 
file = 'data.txt'
file = open(file , 'r')
text = file.read()

sentences = sent_tokenize(text) 
total_documents = len(sentences)
print(sentences)

["Those Who Are Resilient Stay In The Game Longer\n'On the mountains of truth you can never climb in vain: either you will reach a point higher up today, or you will be training your powers so that you will be able to climb higher tomorrow.'", '- Friedrich Nietzsche\nChallenges and setbacks are not meant to defeat you, but promote you.', 'However, I realise after many years of defeats, it can crush your spirit and it is easier to give up than risk further setbacks and disappointments.', 'Have you experienced this before?', "To be honest, I don't have the answers.", "I can't tell you what the right course of action is; only you will know.", "However, it's important not to be discouraged by failure when pursuing a goal or a dream, since failure itself means different things to different people.", "To a person with a Fixed Mindset failure is a blow to their self-esteem, yet to a person with a Growth Mindset, it's an opportunity to improve and find new ways to overcome their obstacles.", '

# Step 2: Create the Frequency matrix of the words in each sentence.

In [4]:
# Calculate the frequency of words in each sentence.

def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

freq_matrix = _create_frequency_matrix(sentences)
print(freq_matrix)

{'Those Who Are R': {'resili': 1, 'stay': 1, 'game': 1, 'longer': 1, "'on": 1, 'mountain': 1, 'truth': 1, 'never': 1, 'climb': 2, 'vain': 1, ':': 1, 'either': 1, 'reach': 1, 'point': 1, 'higher': 2, 'today': 1, ',': 1, 'train': 1, 'power': 1, 'abl': 1, 'tomorrow': 1, '.': 1, "'": 1}, '- Friedrich Nie': {'-': 1, 'friedrich': 1, 'nietzsch': 1, 'challeng': 1, 'setback': 1, 'meant': 1, 'defeat': 1, ',': 1, 'promot': 1, '.': 1}, 'However, I real': {'howev': 1, ',': 2, 'realis': 1, 'mani': 1, 'year': 1, 'defeat': 1, 'crush': 1, 'spirit': 1, 'easier': 1, 'give': 1, 'risk': 1, 'setback': 1, 'disappoint': 1, '.': 1}, 'Have you experi': {'experienc': 1, 'thi': 1, 'befor': 1, '?': 1}, 'To be honest, I': {'honest': 1, ',': 1, "n't": 1, 'answer': 1, '.': 1}, "I can't tell yo": {'ca': 1, "n't": 1, 'tell': 1, 'right': 1, 'cours': 1, 'action': 1, ';': 1, 'onli': 1, 'know': 1, '.': 1}, "However, it's i": {'howev': 1, ',': 2, "'s": 1, 'import': 1, 'discourag': 1, 'failur': 2, 'pursu': 1, 'goal': 1, 'dre

# Step 3: Calculate TF

In [5]:
# TF(w) = (Number of times term w appears in a document) / (Total number of terms in the document)

def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

tf_matrix = _create_tf_matrix(freq_matrix)
print(tf_matrix)

{'Those Who Are R': {'resili': 0.043478260869565216, 'stay': 0.043478260869565216, 'game': 0.043478260869565216, 'longer': 0.043478260869565216, "'on": 0.043478260869565216, 'mountain': 0.043478260869565216, 'truth': 0.043478260869565216, 'never': 0.043478260869565216, 'climb': 0.08695652173913043, 'vain': 0.043478260869565216, ':': 0.043478260869565216, 'either': 0.043478260869565216, 'reach': 0.043478260869565216, 'point': 0.043478260869565216, 'higher': 0.08695652173913043, 'today': 0.043478260869565216, ',': 0.043478260869565216, 'train': 0.043478260869565216, 'power': 0.043478260869565216, 'abl': 0.043478260869565216, 'tomorrow': 0.043478260869565216, '.': 0.043478260869565216, "'": 0.043478260869565216}, '- Friedrich Nie': {'-': 0.1, 'friedrich': 0.1, 'nietzsch': 0.1, 'challeng': 0.1, 'setback': 0.1, 'meant': 0.1, 'defeat': 0.1, ',': 0.1, 'promot': 0.1, '.': 0.1}, 'However, I real': {'howev': 0.07142857142857142, ',': 0.14285714285714285, 'realis': 0.07142857142857142, 'mani': 0.

# Step 4: Calculate IDF

In [6]:
# Making a simple table which helps in calculating IDF matrix.
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1
             
    return word_per_doc_table

count_doc_per_words = _create_documents_per_words(freq_matrix)


# IDF(w) = log_e(Total number of documents / Number of documents with term w in it)
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))
        idf_matrix[sent] = idf_table

    return idf_matrix

idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
print(idf_matrix)

{'Those Who Are R': {'resili': 1.462397997898956, 'stay': 1.462397997898956, 'game': 1.2863067388432747, 'longer': 1.462397997898956, "'on": 1.7634279935629373, 'mountain': 1.7634279935629373, 'truth': 1.7634279935629373, 'never': 1.462397997898956, 'climb': 1.7634279935629373, 'vain': 1.7634279935629373, ':': 0.8603380065709937, 'either': 1.7634279935629373, 'reach': 1.462397997898956, 'point': 1.462397997898956, 'higher': 1.7634279935629373, 'today': 1.7634279935629373, ',': 0.3832167518513312, 'train': 1.7634279935629373, 'power': 1.1613680022349748, 'abl': 1.7634279935629373, 'tomorrow': 1.7634279935629373, '.': 0.0558578174650009, "'": 0.9852767431792936}, '- Friedrich Nie': {'-': 1.2863067388432747, 'friedrich': 1.7634279935629373, 'nietzsch': 1.7634279935629373, 'challeng': 1.462397997898956, 'setback': 1.462397997898956, 'meant': 1.7634279935629373, 'defeat': 1.2863067388432747, ',': 0.3832167518513312, 'promot': 1.7634279935629373, '.': 0.0558578174650009}, 'However, I real': 

# Step 5: Calculate TF-IDF 

In [7]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table
    return tf_idf_matrix

tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

print(tf_idf_matrix)

{'Those Who Are R': {'resili': 0.0635825216477807, 'stay': 0.0635825216477807, 'game': 0.055926379949707596, 'longer': 0.0635825216477807, "'on": 0.07667078232882336, 'mountain': 0.07667078232882336, 'truth': 0.07667078232882336, 'never': 0.0635825216477807, 'climb': 0.15334156465764673, 'vain': 0.07667078232882336, ':': 0.03740600028569538, 'either': 0.07667078232882336, 'reach': 0.0635825216477807, 'point': 0.0635825216477807, 'higher': 0.15334156465764673, 'today': 0.07667078232882336, ',': 0.016661597906579617, 'train': 0.07667078232882336, 'power': 0.050494260966738036, 'abl': 0.07667078232882336, 'tomorrow': 0.07667078232882336, '.': 0.0024286007593478653, "'": 0.04283811926866494}, '- Friedrich Nie': {'-': 0.12863067388432747, 'friedrich': 0.17634279935629374, 'nietzsch': 0.17634279935629374, 'challeng': 0.14623979978989562, 'setback': 0.14623979978989562, 'meant': 0.17634279935629374, 'defeat': 0.12863067388432747, ',': 0.038321675185133124, 'promot': 0.17634279935629374, '.': 

# Step 6: Score all sentences.

In [8]:
# Score a sentence by averaging its words' TF-IDF

def _score_sentences(tf_idf_matrix) -> dict:

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

sentence_scores = _score_sentences(tf_idf_matrix)

print(sentence_scores)

{'Those Who Are R': 0.06886827214339657, '- Friedrich Nie': 0.12990196017052547, 'However, I real': 0.0950048202508762, 'Have you experi': 0.33577942105476016, 'To be honest, I': 0.1810095426939688, "I can't tell yo": 0.12467317271772207, "However, it's i": 0.09243832777033766, 'To a person wit': 0.08695519755231208, 'Same failure, y': 0.1697937583849021, 'Who is right an': 0.4344525265530228, 'Neither.': 0.45482145275698455, 'Each person has': 0.17661539889393987, 'Those who are r': 0.14666642638598704, "I've coached ma": 0.1252125175314219, 'It was at that ': 0.22974827205421458, 'Perhaps all tho': 0.22138632773021513, 'It was the 19th': 0.07823896041820029, 'No one knows wh': 0.07836023034462326, 'Consider the ad': 0.07082749570790825, 'Even more than ': 0.09278348992938382, 'I know one thin': 0.07866722673430829, 'Some of you rea': 0.14929715269671165, 'For others, at ': 0.1394443852649187, 'What I wish to ': 0.14175358677275202, 'If you settle f': 0.17793962540194655, "'Two people

# Step 7: Generate the summary

In [9]:
# Find the average score from the sentence value dictionary

def _find_average_score(sentenceValue) -> int:
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

averageScore = _find_average_score(sentence_scores)
print(averageScore)

0.1592249116679136


In [10]:
# Set the threshold on the score of sentences we allow in the summary 
# Here we use 1.5 times of the average score as an example
threshold = 1.5 * averageScore

def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

summary = _generate_summary(sentences, sentence_scores, threshold)
print(summary)

 Have you experienced this before? Who is right and who is wrong? Neither. It must come from within you. Where are you settling in your life right now? Could you be you playing for bigger stakes than you are? Commit to it. Nurture your dreams.
