## TEXT SUMMARIZATION USING TF-IDF

In [4]:
import math
import pandas as pd
import numpy as np
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

In [5]:
import os
dir='C:\\Users\\vishnu\\Desktop\\nlp_project\\archive\\News_Articles'
categories=['business','entertainment','politics','sport','tech']
for category in categories:
    path1=os.path.join(dir,category)
    label=categories.index(category)
    for fol in os.listdir(path1):
        path2=os.path.join(path1,fol)
        f=open(path2,'r')
        text=f.read()
        break

#### Tokenize the sentences

In [6]:
sentences = sent_tokenize(text) # NLTK function
total_documents = len(sentences)
print(sentences)

["Ink helps drive democracy in Asia\n\nThe Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and ultraviolet readers in the country's elections as part of a drive to prevent multiple voting.", 'This new technology is causing both worries and guarded optimism among different sectors of the population.', 'In an effort to live up to its reputation in the 1990s as "an island of democracy", the Kyrgyz President, Askar Akaev, pushed through the law requiring the use of ink during the upcoming Parliamentary and Presidential elections.', 'The US government agreed to fund all expenses associated with this decision.', 'The Kyrgyz Republic is seen by many experts as backsliding from the high point it reached in the mid-1990s with a hastily pushed through referendum in 2003, reducing the legislative branch to one chamber with 75 deputies.', 'The use of ink is only one part of a general effort to show commitment towards more open elections - the Germa

#### Create the Frequency matrix of the words in each sentence.

In [7]:
for sent in sentences:
    print(sent[:15])
    break

Ink helps drive


In [8]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
     
    ps= WordNetLemmatizer()
#     ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.lemmatize(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [17]:
# 2 Create the Frequency matrix of the words in each sentence.
freq_matrix = _create_frequency_matrix(sentences)
print(freq_matrix)

{'Ink helps drive': {'ink': 2, 'help': 1, 'drive': 2, 'democracy': 1, 'asia': 1, 'kyrgyz': 1, 'republic': 2, ',': 3, 'small': 1, 'mountainous': 1, 'state': 1, 'former': 1, 'soviet': 1, 'using': 1, 'invisible': 1, 'ultraviolet': 1, 'reader': 1, 'country': 1, "'s": 1, 'election': 1, 'part': 1, 'prevent': 1, 'multiple': 1, 'voting': 1, '.': 1}, 'This new techno': {'new': 1, 'technology': 1, 'causing': 1, 'worry': 1, 'guarded': 1, 'optimism': 1, 'among': 1, 'different': 1, 'sector': 1, 'population': 1, '.': 1}, 'In an effort to': {'effort': 1, 'live': 1, 'reputation': 1, '1990s': 1, '``': 1, 'island': 1, 'democracy': 1, "''": 1, ',': 3, 'kyrgyz': 1, 'president': 1, 'askar': 1, 'akaev': 1, 'pushed': 1, 'law': 1, 'requiring': 1, 'use': 1, 'ink': 1, 'upcoming': 1, 'parliamentary': 1, 'presidential': 1, 'election': 1, '.': 1}, 'The US governme': {'u': 1, 'government': 1, 'agreed': 1, 'fund': 1, 'expense': 1, 'associated': 1, 'decision': 1, '.': 1}, 'The Kyrgyz Repu': {'kyrgyz': 1, 'republic': 

#### Term frequency matrix

In [10]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [24]:
'''Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.'''
# 3 Calculate TermFrequency and generate a matrix
tf_matrix = _create_tf_matrix(freq_matrix)
print(tf_matrix)

{'Ink helps drive': {'ink': 0.08, 'help': 0.04, 'drive': 0.08, 'democracy': 0.04, 'asia': 0.04, 'kyrgyz': 0.04, 'republic': 0.08, ',': 0.12, 'small': 0.04, 'mountainous': 0.04, 'state': 0.04, 'former': 0.04, 'soviet': 0.04, 'using': 0.04, 'invisible': 0.04, 'ultraviolet': 0.04, 'reader': 0.04, 'country': 0.04, "'s": 0.04, 'election': 0.04, 'part': 0.04, 'prevent': 0.04, 'multiple': 0.04, 'voting': 0.04, '.': 0.04}, 'This new techno': {'new': 0.09090909090909091, 'technology': 0.09090909090909091, 'causing': 0.09090909090909091, 'worry': 0.09090909090909091, 'guarded': 0.09090909090909091, 'optimism': 0.09090909090909091, 'among': 0.09090909090909091, 'different': 0.09090909090909091, 'sector': 0.09090909090909091, 'population': 0.09090909090909091, '.': 0.09090909090909091}, 'In an effort to': {'effort': 0.043478260869565216, 'live': 0.043478260869565216, 'reputation': 0.043478260869565216, '1990s': 0.043478260869565216, '``': 0.043478260869565216, 'island': 0.043478260869565216, 'demo

#### Number of Times the Word Appeared in all the Sentences

In [11]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [25]:
# 4 creating table for documents per words
count_doc_per_words = _create_documents_per_words(freq_matrix)
print(count_doc_per_words)

{'ink': 18, 'help': 1, 'drive': 2, 'democracy': 2, 'asia': 1, 'kyrgyz': 3, 'republic': 3, ',': 17, 'small': 1, 'mountainous': 1, 'state': 1, 'former': 2, 'soviet': 2, 'using': 1, 'invisible': 2, 'ultraviolet': 3, 'reader': 3, 'country': 3, "'s": 4, 'election': 10, 'part': 2, 'prevent': 1, 'multiple': 1, 'voting': 1, '.': 30, 'new': 1, 'technology': 2, 'causing': 1, 'worry': 1, 'guarded': 1, 'optimism': 1, 'among': 1, 'different': 1, 'sector': 1, 'population': 2, 'effort': 1, 'live': 1, 'reputation': 1, '1990s': 1, '``': 2, 'island': 1, "''": 3, 'president': 1, 'askar': 1, 'akaev': 1, 'pushed': 2, 'law': 2, 'requiring': 1, 'use': 7, 'upcoming': 2, 'parliamentary': 3, 'presidential': 2, 'u': 1, 'government': 1, 'agreed': 1, 'fund': 1, 'expense': 1, 'associated': 2, 'decision': 1, 'seen': 1, 'many': 2, 'expert': 1, 'backsliding': 1, 'high': 1, 'point': 1, 'reached': 1, 'mid-1990s': 1, 'hastily': 1, 'referendum': 1, '2003': 1, 'reducing': 1, 'legislative': 1, 'branch': 1, 'one': 3, 'chambe

#### Inverse document frequency (IDF) is how unique or rare a word is.

In [12]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [26]:
# 5 Calculate IDF and generate a matrix
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
print(idf_matrix)

{'Ink helps drive': {'ink': 0.2498774732165999, 'help': 1.505149978319906, 'drive': 1.2041199826559248, 'democracy': 1.2041199826559248, 'asia': 1.505149978319906, 'kyrgyz': 1.0280287236002434, 'republic': 1.0280287236002434, ',': 0.27470105694163205, 'small': 1.505149978319906, 'mountainous': 1.505149978319906, 'state': 1.505149978319906, 'former': 1.2041199826559248, 'soviet': 1.2041199826559248, 'using': 1.505149978319906, 'invisible': 1.2041199826559248, 'ultraviolet': 1.0280287236002434, 'reader': 1.0280287236002434, 'country': 1.0280287236002434, "'s": 0.9030899869919435, 'election': 0.505149978319906, 'part': 1.2041199826559248, 'prevent': 1.505149978319906, 'multiple': 1.505149978319906, 'voting': 1.505149978319906, '.': 0.028028723600243534}, 'This new techno': {'new': 1.505149978319906, 'technology': 1.2041199826559248, 'causing': 1.505149978319906, 'worry': 1.505149978319906, 'guarded': 1.505149978319906, 'optimism': 1.505149978319906, 'among': 1.505149978319906, 'different'

#### calculating tf-idf of the words in sentence

In [13]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [27]:
# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
print(tf_idf_matrix)

{'Ink helps drive': {'ink': 0.01999019785732799, 'help': 0.06020599913279624, 'drive': 0.09632959861247399, 'democracy': 0.048164799306236995, 'asia': 0.06020599913279624, 'kyrgyz': 0.04112114894400974, 'republic': 0.08224229788801948, ',': 0.03296412683299584, 'small': 0.06020599913279624, 'mountainous': 0.06020599913279624, 'state': 0.06020599913279624, 'former': 0.048164799306236995, 'soviet': 0.048164799306236995, 'using': 0.06020599913279624, 'invisible': 0.048164799306236995, 'ultraviolet': 0.04112114894400974, 'reader': 0.04112114894400974, 'country': 0.04112114894400974, "'s": 0.03612359947967774, 'election': 0.02020599913279624, 'part': 0.048164799306236995, 'prevent': 0.06020599913279624, 'multiple': 0.06020599913279624, 'voting': 0.06020599913279624, '.': 0.0011211489440097415}, 'This new techno': {'new': 0.13683181621090054, 'technology': 0.10946545296872044, 'causing': 0.13683181621090054, 'worry': 0.13683181621090054, 'guarded': 0.13683181621090054, 'optimism': 0.13683181

#### Score of each sentence

In [14]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [31]:
sentence_scores = _score_sentences(tf_idf_matrix)
print(sentence_scores)

{'Ink helps drive': 0.04944558212998766, 'This new techno': 0.11964850012786234, 'In an effort to': 0.049380656249305724, 'The US governme': 0.16036013400274382, 'The Kyrgyz Repu': 0.056479122060062274, 'The use of ink ': 0.11186605701392763, 'The actual tech': 0.1665965587342357, 'The ink is spra': 0.12494724185797523, 'It dries and is': 0.21081909545984895, 'However, the pr': 0.07134710006349634, 'At the entrance': 0.04864899801774233, 'If the ink show': 0.10160743531280843, 'Likewise, any v': 0.13367935000121356, 'These elections': 0.04705437075603565, 'Widely circulat': 0.0670031723331395, 'The author of o': 0.10974909579005312, 'The greatest pa': 0.12607279078676817, 'Local newspaper': 0.06996110139599462, 'Others, such as': 0.09252949175449168, 'This type of in': 0.06908862442484746, 'The other commo': 0.07040660684200312, 'The use of "inv': 0.10792821134304274, 'In most electio': 0.14787026927281943, 'In Serbia, for ': 0.0928220270275755, 'Other rumours a': 0.07659642556393663, 

#### Find the threshold

In [15]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [29]:
# 8 Find the threshold
threshold = _find_average_score(sentence_scores)
print(threshold)

0.10038016299398803


####  Important Algorithm: Generate the summary

In [16]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [30]:
summary = _generate_summary(sentences, sentence_scores, 1.1 * threshold)
print(summary)

 This new technology is causing both worries and guarded optimism among different sectors of the population. The US government agreed to fund all expenses associated with this decision. The use of ink is only one part of a general effort to show commitment towards more open elections - the German Embassy, the Soros Foundation and the Kyrgyz government have all contributed to purchase transparent ballot boxes. The actual technology behind the ink is not that complicated. The ink is sprayed on a person's left thumb. It dries and is not visible under normal light. Likewise, any voter who refuses to be inked will not receive the ballot. The use of ink has been controversial - especially among groups perceived to be pro-government. The greatest part of the opposition to ink has often been sheer ignorance. In most elections, numerous rumors have spread about it. The ink stays on the finger for at least 72 hours and for up to a week. The use of ink and readers by itself is not a panacea for e