In [1]:
import nltk

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance

In [4]:
import numpy as np
import networkx as nx

In [14]:
def read(text):
  sentences = nltk.sent_tokenize(text)
  return sentences


In [42]:
def sentence_similarity(sent1, sent2, stopwords = None):
  if stopwords is None :
    stopwords = []

  sent1 = [w.lower() for w in sent1]
  sent2 = [w.lower() for w in sent2]

  all_words = list(set(sent1 + sent2))

  vector1 = [0] * len(all_words)
  vector2 = [0] * len(all_words)

  for w in sent2 :
    if w in stopwords :
      continue
    vector1[all_words.index(w)] += 1

  for w in sent2 :
    if w in stopwords :
      continue
    vector2[all_words.index(w)] += 1

  return 1 - cosine_distance(vector1, vector2)

In [43]:
def build_similarity_matrix(sentences, stop_words):
  similarity_matrix = np.zeros((len(sentences), len(sentences)))

  for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)) :
      if idx1 == idx2:
        continue
      similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

  return similarity_matrix

In [44]:
def generate_summary(text, num_sentences=5):
    stop_words = set(stopwords.words('english'))
    summarize_text = []

    sentences = read(text)
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

    for i in range(num_sentences):
        summarize_text.append(ranked_sentences[i][1])

    return ' '.join(summarize_text)

In [45]:
text = "The leather jacked showed the scars of being his favorite for years. It wore those scars with pride, feeling that they enhanced his presence rather than diminishing it. The scars gave it character and had not overwhelmed to the point that it had become ratty. The jacket was in its prime and it knew it."

In [48]:
summary = generate_summary(text, num_sentences = 1)
print(summary)

The scars gave it character and had not overwhelmed to the point that it had become ratty.
