In [1]:
from urllib.request import urlopen           # to download a webpage(html)
from bs4 import BeautifulSoup as bs          # to parse a webpage(html)[give text] and remove all crud(div,tags etc) present
import re

In [4]:
def getTextFromURL(url):
    page = urlopen(url).read().decode('utf8')
    soup = bs(page,'lxml')
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    return re.sub(r'[^\x00-\x7F]+',' ', text)   #Replace non-ASCII characters with a single space


def summarize(text,n):
    sents = sent_tokenize(text)
    assert n<=len(sents)
    word_sent = word_tokenize(text)
    customStopWords = set(stopwords.words('english')+list(punctuation))
    word_sent = [word for word in word_sent if word not in customStopWords]
    freq = FreqDist(word_sent)
    ranking = defaultdict(int)

    for i,sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i]+=freq[w]
                
    sents_idx = nlargest(n,ranking,key=ranking.get)
    return [sents[j] for j in sorted(sents_idx)]

In [5]:
articleURL="https://en.wikipedia.org/wiki/Deep_learning"
text = getTextFromURL(articleURL)
text

'Deep learning  (also known as deep structured learning  or hierarchical learning) is part of a broader family of machine learning methods based on artificial neural networks. Learning can be supervised, semi-supervised or unsupervised.[1][2][3]\n Deep learning architectures such as deep neural networks, deep belief networks, recurrent neural networks and convolutional neural networks have been applied to fields including computer vision, speech recognition, natural language processing, audio recognition, social network filtering, machine translation, bioinformatics, drug design, medical image analysis, material inspection and board game programs, where they have produced results comparable to and in some cases superior to human experts.[4][5][6]\n Artificial Neural Networks (ANNs) were inspired by information processing and distributed communication nodes in biological systems. ANNs have various differences from biological brains.  Specifically, neural networks tend to be static and s

In [28]:
#preprocessing article text
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [29]:
sents = sent_tokenize(text)
sents

['Deep learning (also known as deep structured learning  or hierarchical learning) is part of a broader family of machine learning methods based on learning data representations, as opposed to task-specific algorithms.',
 'Learning can be supervised, semi-supervised or unsupervised.',
 '[1][2][3]\n Deep learning architectures such as deep neural networks, deep belief networks and recurrent neural networks have been applied to fields including computer vision, speech recognition, natural language processing, audio recognition, social network filtering, machine translation, bioinformatics, drug design, medical image analysis, material inspection and board game programs, where they have produced results comparable to and in some cases superior to human experts.',
 '[4][5][6]\n \nDeep learning models are vaguely inspired by information processing and communication patterns in biological nervous systems yet have various differences from the structural and functional properties of biological

In [33]:
word_sent = word_tokenize(text)
word_sent

['Deep',
 'learning',
 '(',
 'also',
 'known',
 'as',
 'deep',
 'structured',
 'learning',
 'or',
 'hierarchical',
 'learning',
 ')',
 'is',
 'part',
 'of',
 'a',
 'broader',
 'family',
 'of',
 'machine',
 'learning',
 'methods',
 'based',
 'on',
 'learning',
 'data',
 'representations',
 ',',
 'as',
 'opposed',
 'to',
 'task-specific',
 'algorithms',
 '.',
 'Learning',
 'can',
 'be',
 'supervised',
 ',',
 'semi-supervised',
 'or',
 'unsupervised',
 '.',
 '[',
 '1',
 ']',
 '[',
 '2',
 ']',
 '[',
 '3',
 ']',
 'Deep',
 'learning',
 'architectures',
 'such',
 'as',
 'deep',
 'neural',
 'networks',
 ',',
 'deep',
 'belief',
 'networks',
 'and',
 'recurrent',
 'neural',
 'networks',
 'have',
 'been',
 'applied',
 'to',
 'fields',
 'including',
 'computer',
 'vision',
 ',',
 'speech',
 'recognition',
 ',',
 'natural',
 'language',
 'processing',
 ',',
 'audio',
 'recognition',
 ',',
 'social',
 'network',
 'filtering',
 ',',
 'machine',
 'translation',
 ',',
 'bioinformatics',
 ',',
 'drug',

In [35]:
customStopWords = set(stopwords.words('english')+list(punctuation))
word_sent = [word for word in word_sent if word not in customStopWords]
word_sent

['Deep',
 'learning',
 'also',
 'known',
 'deep',
 'structured',
 'learning',
 'hierarchical',
 'learning',
 'part',
 'broader',
 'family',
 'machine',
 'learning',
 'methods',
 'based',
 'learning',
 'data',
 'representations',
 'opposed',
 'task-specific',
 'algorithms',
 'Learning',
 'supervised',
 'semi-supervised',
 'unsupervised',
 '1',
 '2',
 '3',
 'Deep',
 'learning',
 'architectures',
 'deep',
 'neural',
 'networks',
 'deep',
 'belief',
 'networks',
 'recurrent',
 'neural',
 'networks',
 'applied',
 'fields',
 'including',
 'computer',
 'vision',
 'speech',
 'recognition',
 'natural',
 'language',
 'processing',
 'audio',
 'recognition',
 'social',
 'network',
 'filtering',
 'machine',
 'translation',
 'bioinformatics',
 'drug',
 'design',
 'medical',
 'image',
 'analysis',
 'material',
 'inspection',
 'board',
 'game',
 'programs',
 'produced',
 'results',
 'comparable',
 'cases',
 'superior',
 'human',
 'experts',
 '4',
 '5',
 '6',
 'Deep',
 'learning',
 'models',
 'vaguely'

In [37]:
from nltk.probability import FreqDist
freq = FreqDist(word_sent)
freq

FreqDist({'Deep': 31,
          'learning': 91,
          'also': 11,
          'known': 2,
          'deep': 76,
          'structured': 1,
          'hierarchical': 2,
          'part': 3,
          'broader': 1,
          'family': 1,
          'machine': 17,
          'methods': 11,
          'based': 11,
          'data': 30,
          'representations': 5,
          'opposed': 1,
          'task-specific': 3,
          'algorithms': 6,
          'Learning': 7,
          'supervised': 5,
          'semi-supervised': 1,
          'unsupervised': 6,
          '1': 11,
          '2': 13,
          '3': 2,
          'architectures': 14,
          'neural': 45,
          'networks': 35,
          'belief': 6,
          'recurrent': 4,
          'applied': 9,
          'fields': 2,
          'including': 7,
          'computer': 11,
          'vision': 10,
          'speech': 31,
          'recognition': 37,
          'natural': 4,
          'language': 8,
          'processing': 9,
   

In [39]:
from heapq import nlargest
nlargest(10,freq,key=freq.get)

['learning',
 'deep',
 'neural',
 'The',
 'recognition',
 'networks',
 'Deep',
 'speech',
 'data',
 'network']

In [42]:
from collections import defaultdict
ranking = defaultdict(int)

for i,sent in enumerate(sents):
    for w in word_tokenize(sent.lower()):
        if w in freq:
            ranking[i]+=freq[w]
            
ranking

defaultdict(int,
            {0: 712,
             1: 103,
             2: 868,
             3: 289,
             4: 911,
             5: 229,
             6: 315,
             7: 202,
             8: 76,
             9: 414,
             10: 194,
             11: 28,
             12: 36,
             13: 214,
             14: 139,
             15: 487,
             16: 33,
             17: 77,
             18: 298,
             19: 194,
             20: 224,
             21: 362,
             22: 296,
             23: 67,
             24: 270,
             25: 205,
             26: 182,
             27: 63,
             28: 131,
             29: 100,
             30: 26,
             31: 102,
             32: 49,
             33: 390,
             34: 215,
             35: 203,
             36: 218,
             37: 13,
             38: 213,
             39: 39,
             40: 99,
             41: 10,
             42: 117,
             43: 153,
             44: 33,
             45: 

In [43]:
sents_idx = nlargest(4,ranking,key=ranking.get)
sents_idx 

[4, 2, 0, 78]

In [44]:
[sents[j] for j in sorted(sents_idx)]

['Deep learning (also known as deep structured learning  or hierarchical learning) is part of a broader family of machine learning methods based on learning data representations, as opposed to task-specific algorithms.',
 '[1][2][3]\n Deep learning architectures such as deep neural networks, deep belief networks and recurrent neural networks have been applied to fields including computer vision, speech recognition, natural language processing, audio recognition, social network filtering, machine translation, bioinformatics, drug design, medical image analysis, material inspection and board game programs, where they have produced results comparable to and in some cases superior to human experts.',
 '[7][8][9] Deep learning is a class of machine learning algorithms that:[10](pp199 200)\n Most modern deep learning models are based on an artificial neural network, although they can also include propositional formulas or latent variables organized layer-wise in deep generative models such a