In [2]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [3]:
def read_article(file_name):
    """Read the first line of a text file and split it into tokenized sentences.

    Only the first line of the file is used. That line is split into
    sentences on the literal separator ". ", and each sentence is split into
    words on single spaces. The last fragment produced by the sentence split
    is discarded.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list of sentences, each one a list of word strings. An empty file
        yields an empty list.
    """
    # The context manager closes the handle even if reading raises.
    with open(file_name, 'r') as file:
        first_line = file.readline()
    if not first_line:
        # Empty file: there is no first line to tokenize.
        return []

    fragments = first_line.split(". ")
    # NOTE(review): str.replace matches its first argument literally, not as a
    # regular expression, so this only affects text containing that exact
    # character sequence. It is kept unchanged so tokenization stays the same.
    sentences = [
        fragment.replace("[^a-zA-z]", " ").split(" ") for fragment in fragments
    ]
    # Drop the fragment that follows the final ". " separator.
    sentences.pop()
    return sentences

In [4]:
def sentence_similarity(sentence1, sentence2, stopwords=None):
    """Return the cosine similarity of two tokenized sentences.

    Both sentences are lower-cased, then turned into word-count vectors over
    the union of their words. Words found in ``stopwords`` are not counted.

    Args:
        sentence1: Iterable of word strings.
        sentence2: Iterable of word strings.
        stopwords: Optional collection of words to ignore. Membership is
            tested against the lower-cased sentence words. Defaults to no
            stop words.

    Returns:
        ``1 - cosine_distance(vector1, vector2)``, or ``0.0`` when either
        sentence has no countable words.
    """
    if stopwords is None:
        stopwords = []
    ignored = set(stopwords)

    words1 = [w.lower() for w in sentence1]
    words2 = [w.lower() for w in sentence2]

    # Assign each distinct word a vector position once, so every lookup is a
    # dictionary access instead of a linear list.index scan.
    position = {word: i for i, word in enumerate(set(words1 + words2))}

    vector1 = [0] * len(position)
    vector2 = [0] * len(position)
    for w in words1:
        if w not in ignored:
            vector1[position[w]] += 1
    for w in words2:
        if w not in ignored:
            vector2[position[w]] += 1

    # The cosine is undefined for an all-zero vector (its norm, the
    # denominator, is zero). Treat a sentence with no countable words as
    # having no similarity rather than passing an undefined value on.
    if not any(vector1) or not any(vector2):
        return 0.0
    return 1 - cosine_distance(vector1, vector2)

In [5]:
def gen_sim_matrix(sentences, stop_words):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: Sequence of tokenized sentences (lists of words).
        stop_words: Words passed through to ``sentence_similarity`` to be
            ignored when comparing.

    Returns:
        A square NumPy array with one row and column per sentence. Entry
        ``[i][j]`` holds the similarity of sentence ``i`` to sentence ``j``;
        the diagonal is left at zero.
    """
    count = len(sentences)
    similarity_matrix = np.zeros((count, count))
    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            # A sentence is never compared with itself.
            if row != col:
                similarity_matrix[row][col] = sentence_similarity(first, second, stop_words)
    return similarity_matrix

In [6]:
def generate_summary(file_name, top_n = 5):
    """Print an extractive summary of the article stored in ``file_name``.

    The article is tokenized with ``read_article``, a sentence-similarity
    matrix is built with ``gen_sim_matrix`` using NLTK's English stop words,
    and the matrix is turned into a graph whose nodes are scored with
    ``nx.pagerank``. The highest-scoring sentences are joined with ". " and
    printed.

    Args:
        file_name: Path of the text file to summarize.
        top_n: Maximum number of sentences to include. If the article has
            fewer sentences, all of them are used.

    Returns:
        None. The summary is written to standard output.
    """
    stop_words = stopwords.words('english')
    sentences = read_article(file_name)
    sentence_similarity_matrix = gen_sim_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    # Slicing caps the selection at the number of available sentences, so an
    # article shorter than top_n no longer raises IndexError. max() keeps a
    # negative top_n selecting nothing, as range(top_n) did.
    top_ranked = ranked_sentence[:max(top_n, 0)]
    summarize_text = [" ".join(words) for _, words in top_ranked]
    print("Summary \n",". ".join(summarize_text))

In [9]:
# Preview the tokenized sentences (lists of words) parsed from the sample article.
read_article("msft.txt")

[['Microsoft',
  'Corporation',
  'is',
  'an',
  'American',
  'multinational',
  'technology',
  'corporation',
  'which',
  'produces',
  'computer',
  'software,',
  'consumer',
  'electronics,',
  'personal',
  'computers,',
  'and',
  'related',
  'services'],
 ['Its',
  'best',
  'known',
  'software',
  'products',
  'are',
  'the',
  'Microsoft',
  'Windows',
  'line',
  'of',
  'operating',
  'systems,',
  'the',
  'Microsoft',
  'Office',
  'suite,',
  'and',
  'the',
  'Internet',
  'Explorer',
  'and',
  'Edge',
  'web',
  'browsers'],
 ['Its',
  'flagship',
  'hardware',
  'products',
  'are',
  'the',
  'Xbox',
  'video',
  'game',
  'consoles',
  'and',
  'the',
  'Microsoft',
  'Surface',
  'lineup',
  'of',
  'touchscreen',
  'personal',
  'computers'],
 ['Microsoft', 'ranked', 'No'],
 ['21',
  'in',
  'the',
  '2020',
  'Fortune',
  '500',
  'rankings',
  'of',
  'the',
  'largest',
  'United',
  'States',
  'corporations',
  'by',
  'total',
  'revenue;[3]',
  'it',

In [10]:
# Print the summary of the sample article using the default top_n of 5 sentences.
generate_summary("msft.txt")

Summary 
 Its best known software products are the Microsoft Windows line of operating systems, the Microsoft Office suite, and the Internet Explorer and Edge web browsers. Microsoft ranked No. Its flagship hardware products are the Xbox video game consoles and the Microsoft Surface lineup of touchscreen personal computers. Microsoft Corporation is an American multinational technology corporation which produces computer software, consumer electronics, personal computers, and related services. 21 in the 2020 Fortune 500 rankings of the largest United States corporations by total revenue;[3] it was the world's largest software maker by revenue as of 2016.[4] It is considered one of the Big Five companies in the U.S
