In [175]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk yields a (directory, subdirectories, files) triple for every folder
# under the root; the subdirectory list is not needed here, hence the `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print the full path of each file found.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/text-summarization-sample/temp.txt


In [174]:
# List the contents of the current working directory (IPython shell alias).
ls

__notebook_source__.ipynb


In [140]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import networkx as nx

In [232]:
def get_data(filename):
    """Read a text file and split it into sentences and per-sentence tokens.

    The text is split into sentences on the literal separator ". " and every
    non-empty sentence is split into tokens on single spaces.  No case folding
    or punctuation stripping happens here.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(input_data, lines, maxL)`` where

        * ``input_data`` is the list of token lists for every non-empty
          sentence except the last one, which is dropped;
        * ``lines`` is the raw result of splitting the text on ". " (it still
          contains any empty fragments and the final fragment);
        * ``maxL`` is the largest token count over all non-empty sentences,
          including the dropped one (0 when there are none).

    Note:
        ``input_data`` skips empty fragments while ``lines`` keeps them, so an
        index into ``input_data`` only matches the same index in ``lines``
        when no empty fragment precedes it.
    """
    # The context manager guarantees the file handle is closed.
    with open(filename, 'r') as f:
        text = f.read()

    lines = text.split(". ")
    input_data = []
    maxL = 0
    for line in lines:
        if line:
            # Earlier code called line.replace("[^a-zA-Z]", " ") here, but
            # str.replace matches literally (not as a regex), so the tokens
            # were always just the space-separated words of the sentence.
            t = line.split(" ")
            maxL = max(maxL, len(t))
            input_data.append(t)

    # The final fragment is dropped (presumably a trailing partial sentence --
    # TODO confirm).  Guarded so an empty file no longer raises IndexError.
    if input_data:
        input_data.pop()

    return input_data, lines, maxL
    


In [142]:
def sentence_similarity(sent1, sent2, stopwords=None):
    """Return the cosine similarity of two tokenized sentences.

    Both sentences are lower-cased, turned into word-count vectors over their
    combined vocabulary (words listed in ``stopwords`` are not counted), and
    compared with ``1 - cosine_distance``.

    Args:
        sent1: First sentence as a list of word strings.
        sent2: Second sentence as a list of word strings.
        stopwords: Optional iterable of lower-case words to ignore.

    Returns:
        The similarity score, or 0.0 when either sentence has no countable
        words (empty, or made up only of stop words).
    """
    if stopwords is None:
        stopwords = []
    # Set membership is constant time; the stop-word list is scanned once here
    # instead of once per word.
    stop_set = set(stopwords)

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    # Map each distinct word to its vector position up front rather than
    # calling list.index() for every token.
    position = {word: idx for idx, word in enumerate(set(sent1 + sent2))}

    vector1 = [0] * len(position)
    vector2 = [0] * len(position)

    for w in sent1:
        if w in stop_set:
            continue
        vector1[position[w]] += 1

    for w in sent2:
        if w in stop_set:
            continue
        vector2[position[w]] += 1

    # A zero vector has no direction: the cosine would divide by zero and
    # yield NaN, which then contaminates the similarity matrix downstream.
    if not any(vector1) or not any(vector2):
        return 0.0

    return 1 - cosine_distance(vector1, vector2)

In [143]:
def similarity_matrix(input_data,stop_words):
    """Build the square matrix of pairwise sentence similarities.

    Entry ``[row][col]`` holds ``sentence_similarity`` of sentence ``row`` and
    sentence ``col`` with ``stop_words`` ignored; the diagonal is left at 0.

    Args:
        input_data: Sequence of tokenized sentences.
        stop_words: Words passed through to ``sentence_similarity``.

    Returns:
        A ``len(input_data)`` x ``len(input_data)`` numpy array.
    """
    size = len(input_data)
    scores = np.zeros((size, size))

    for row, left in enumerate(input_data):
        for col, right in enumerate(input_data):
            # A sentence is never compared with itself.
            if row != col:
                scores[row][col] = sentence_similarity(left, right, stop_words)

    return scores

In [187]:
def Summary_PageRank(input_text,top_n=2):
    """Print an extractive summary of a text file ranked with PageRank.

    Sentences are read with ``get_data``, compared pairwise with
    ``similarity_matrix`` (English stop words ignored), and scored by running
    ``nx.pagerank`` on the graph built from that matrix.  The highest-scoring
    sentences are joined with ". " and printed, best first.

    Args:
        input_text: Path of the text file to summarize.
        top_n: Number of sentences to keep.  When the document has more than
            ``3 * top_n`` sentences this is replaced by one third of the
            sentence count, and it never exceeds the number of sentences
            available.

    Returns:
        None.  The summary is written to stdout.
    """
    input_data, _, _ = get_data(input_text)
    n = len(input_data)
    if n > (3 * top_n):
        top_n = int(n / 3)
    # A short document can have fewer sentences than requested; without this
    # clamp the selection below would index past the end of the ranking.
    top_n = max(0, min(top_n, n))

    stop_words = stopwords.words('english')
    input_data_mat = similarity_matrix(input_data, stop_words)

    sentence_similarity_graph = nx.from_numpy_array(input_data_mat)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort by rank (highest score first) and pick the top sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(input_data)), reverse=True)
    summarize_text = [" ".join(sentence) for _, sentence in ranked_sentence[:top_n]]

    # Step 5 - Output the summarized text
    print("Summarize Text: \n", ". ".join(summarize_text))
    

In [188]:
# Run the PageRank summarizer on the sample article; the function prints the summary.
Summary_PageRank("/kaggle/input/text-summarization-sample/temp.txt")

Summarize Text: 
 Last fall, we started a digital fiction series, publishing to the Amazon Kindle two short stories a month by authors like Christopher Buckley, Curtis Sittenfeld, and Paul Theroux. he short story has been integral to The Atlantic since our first issue, in 1857, in which we published four stories, including “The Mourning Veil,” by Harriet Beecher Stowe. All told, The Atlantic is now publishing more fiction than it has since the mid-1970s.But I should admit that these fiction initiatives are experimental, provisional, part of our larger adventure through the seismically shifting landscape of letters. For the Web site each day, we produce dozens of posts analyzing breaking developments in politics, business, culture, technology, and other subjects, some of them longtime preoccupations of The Atlantic, others fairly new to all of us. But none of us has been particularly happy with it, and we have been searching for ways to once again place great fiction in front of all our

In [189]:
def nextPowerOf2(n):
    """Return the smallest power of two that is greater than or equal to ``n``.

    Args:
        n: An integer.

    Returns:
        ``n`` itself when it is already a power of two, otherwise the next
        larger power of two.  Zero and negative values return 1.
    """
    # Right-shifting a negative integer never reaches 0 (it settles at -1),
    # so non-positive input is answered directly instead of looping forever.
    if n <= 0:
        return 1

    # A power of two has exactly one bit set, so clearing its lowest set bit
    # leaves nothing.
    if (n & (n - 1)) == 0:
        return n

    # Count the bits in n; 1 shifted by that count is the next power of two.
    count = 0
    while n != 0:
        n >>= 1
        count += 1

    return 1 << count

In [148]:
# Quick check: 6 is not a power of two, so the next one up (8) is expected.
nextPowerOf2(6)

8

In [190]:
import numpy as np

from tensorflow import keras
import tensorflow as tf



In [191]:
def text_encode(input_data,n):
    """Encode tokenized sentences as fixed-width integer sequences.

    A Keras ``Tokenizer`` (character filtering disabled via ``filters=''``) is
    fitted on ``input_data``, each sentence is mapped to its word indices, and
    the sequences are passed through ``pad_sequences`` with ``maxlen=n``.
    The encoded array is also shown with ``display``.

    Args:
        input_data: Tokenized sentences to encode.
        n: Width of every encoded row.

    Returns:
        A tuple ``(encoded, tokenizer)`` of the padded array and the fitted
        tokenizer.
    """
    # Upper bound on the vocabulary size: sentence count times row width.
    vocab_limit = len(input_data) * n
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_limit, filters='')
    tokenizer.fit_on_texts(input_data)

    encoded = tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(input_data),
        maxlen=n,
    )
    display(encoded)
    return encoded, tokenizer

In [192]:
from sklearn.cluster import KMeans
from scipy.spatial import distance


In [244]:
def get_k(input_data,lines,k=2):
    """Pick one representative sentence per k-means cluster.

    The encoded sentences are clustered with ``KMeans``; for every non-empty
    cluster the sentence closest (Euclidean distance) to the cluster centre is
    selected.  The selected sentences are returned in ascending index order.

    Args:
        input_data: Encoded sentences, one row per sentence.
        lines: Sentence strings, looked up by the row index of ``input_data``.
        k: Requested number of clusters.  When there are more than ``3 * k``
            rows this is replaced by one third of the row count, and it is
            kept between 1 and the number of rows.

    Returns:
        List of the selected entries of ``lines``; empty when ``input_data``
        is empty.
    """
    n = len(input_data)
    print(n)
    # Nothing to cluster: KMeans cannot be fitted on zero samples.
    if n == 0:
        return []

    if n > (3 * k):
        k = int(n / 3)
    # KMeans rejects a cluster count above the sample count (or below 1).
    k = max(1, min(k, n))

    # Fixed random_state keeps the clustering repeatable between runs.
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    y_kmeans = kmeans.fit_predict(input_data)

    res = []
    for i in range(k):
        # Distance from the centre of cluster i to each of its members.
        member_distance = {
            j: distance.euclidean(kmeans.cluster_centers_[i], input_data[j])
            for j, label in enumerate(y_kmeans)
            if label == i
        }
        if member_distance:
            res.append(min(member_distance, key=member_distance.get))
    print(res)

    return [lines[i] for i in sorted(res)]

In [234]:
def Summary_Enc_Dec_Kmeans(filename):
    """Summarize a text file by clustering its encoded sentences.

    The file is split into tokenized sentences with ``get_data``, the
    sentences are encoded to a fixed width with ``text_encode``, and
    ``get_k`` picks the sentences to keep.

    Args:
        filename: Path of the text file to summarize.

    Returns:
        The selected sentences joined with ". ".
    """
    input_data, lines, n = get_data(filename)

    # Row width for the encoding: the longest sentence length rounded up to a
    # power of two.
    n = nextPowerOf2(n)

    # The fitted tokenizer is not needed afterwards.  The English stop-word
    # list was never used by this pipeline, so it is no longer loaded.
    encoded_input, _ = text_encode(input_data, n)
    output = get_k(encoded_input, lines)
    return '. '.join(output)
    
    
    
    

In [245]:

# Run the tokenizer + k-means summarizer on the sample article.
Summary_Enc_Dec_Kmeans("/kaggle/input/text-summarization-sample/temp.txt")

array([[  0,   0,   0, ...,  74,  75,  76],
       [  0,   0,   0, ...,  88,  89,  90],
       [  0,   0,   0, ...,   8, 101, 102],
       ...,
       [  0,   0,   0, ...,  23,   2,  43],
       [  0,   0,   0, ..., 307, 308, 309],
       [  0,   0,   0, ...,   2, 323, 324]], dtype=int32)

18
[0, 13, 8, 17, 16, 15]


['he short story has been integral to The Atlantic since our first issue, in 1857, in which we published four stories, including “The Mourning Veil,” by Harriet Beecher Stowe',
 'If our hardworking developers have pulled it off, by the time you read this note our Web site, TheAtlantic.com, will have relaunched with a new design and a superior system for finding the subjects you’re interested in and discovering new ideas you didn’t know you were looking for',
 'For our print magazine and our e-reader editions, we are continuing to devote months of reporting and writing to create pieces like Joshua Green’s profile in this issue of Treasury Secretary Timothy Geithner, and Robert D',
 'For the Web site each day, we produce dozens of posts analyzing breaking developments in politics, business, culture, technology, and other subjects, some of them longtime preoccupations of The Atlantic, others fairly new to all of us',
 'As I write, on our site I can see posts popping up by James Fallows ab