In [37]:
import nltk
# may need to run nltk.download() the first time

In [38]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [39]:
text = "Mary had a little lamb. Her fleece was white as snow"

In [40]:
sents = sent_tokenize(text)
print(sents)

['Mary had a little lamb.', 'Her fleece was white as snow']


In [41]:
words = [word_tokenize(sent) for sent in sents]
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'white', 'as', 'snow']]


In [42]:
# removing stop words
from nltk.corpus import stopwords
from string import punctuation
customStopWords = set(stopwords.words('english')+list(punctuation))

In [43]:
wordsWOStopWords = [word for word in word_tokenize(text) if word not in customStopWords]

In [44]:
print(wordsWOStopWords)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']


In [45]:
#constructing n-grams (words that cooccur)
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsWOStopWords)
finder.ngram_fd.items()
# sorted(finder.ngram_fd.items()) prints most important bigrams on top

dict_items([(('Mary', 'little'), 1), (('little', 'lamb'), 1), (('lamb', 'Her'), 1), (('Her', 'fleece'), 1), (('fleece', 'white'), 1), (('white', 'snow'), 1)])

In [46]:
text2 = "Mary closed on closing night when she was in the mood to close."

In [47]:
# stemming -> to treat closed, closing, and close as same word
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]

In [48]:
print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']


In [49]:
nltk.pos_tag(word_tokenize(text2))

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

In [50]:
from nltk.corpus import wordnet as wn
for ss in wn.synsets('state'):
    print(ss, ss.definition())

Synset('state.n.01') the territory occupied by one of the constituent administrative districts of a nation
Synset('state.n.02') the way something is with respect to its main attributes
Synset('state.n.03') the group of people comprising the government of a sovereign state
Synset('state.n.04') a politically organized body of people under a single government
Synset('state_of_matter.n.01') (chemistry) the three traditional states of matter are solids (fixed shape and volume) and liquids (fixed volume and shaped by the container) and gases (filling the container)
Synset('state.n.06') a state of depression or agitation
Synset('country.n.02') the territory occupied by a nation
Synset('department_of_state.n.01') the federal department in the United States that sets and maintains foreign policies
Synset('state.v.01') express in words
Synset('submit.v.02') put before
Synset('express.v.04') indicate through a symbol, formula, etc.


In [51]:
from nltk.wsd import lesk
sense1 = lesk(word_tokenize("water is in the liquid state"), "state")
print(sense1, sense1.definition())

Synset('state.n.03') the group of people comprising the government of a sovereign state


In [95]:
# auto summarizing text
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

In [96]:
articleURL = 'https://www.washingtonpost.com/news/politics/wp/2018/07/12/in-about-20-years-half-the-population-will-live-in-eight-states/?utm_term=.04fd3b97c350'

In [153]:
def getArticle(url):
    page = urlopen(url).read().decode('utf-8', 'ignore').strip()
    article = soup(page, 'lxml')
    #articleText = ' '.join(map(lambda p: p.text, article.findAll('article')))
    articleText.find('article')
    cleanArticle = articleText.replace('\xa0—', '')#.encode('ascii', errors='replace').replace("?", " ")
    return cleanArticle

In [154]:
politicalDemographics = getArticle(articleURL)

In [155]:
type(politicalDemographics)
politicalDemographics

'    The Top of the Rock at Rockefeller Center in New York on March 16, 2016. (Justin Lane/EPA)  In response to Post opinion writer Paul Waldman’s essay about the current power of the minority in American politics, the American Enterprise Institute’s Norman Ornstein offered a stunning bit of data on Twitter.   I want to repeat a statistic I use in every talk: by 2040 or so, 70 percent of Americans will live in 15 states. Meaning 30 percent will choose 70 senators. And the 30% will be older, whiter, more rural, more male than the 70 percent. Unsettling to say the least https://t.co/EGPD5nE4qG— Norman Ornstein (@NormOrnstein) July 10, 2018    In broad strokes, Ornstein is correct. The Weldon Cooper Center for Public Service of the University of Virginia analyzed Census Bureau population projections to estimate each state’s likely population in 2040, including the expected breakdown of the population by age and gender. Although that data was released in 2016, before the bureau revised its

In [158]:
sentences = sent_tokenize(politicalDemographics)
sentences

['    The Top of the Rock at Rockefeller Center in New York on March 16, 2016.',
 '(Justin Lane/EPA)  In response to Post opinion writer Paul Waldman’s essay about the current power of the minority in American politics, the American Enterprise Institute’s Norman Ornstein offered a stunning bit of data on Twitter.',
 'I want to repeat a statistic I use in every talk: by 2040 or so, 70 percent of Americans will live in 15 states.',
 'Meaning 30 percent will choose 70 senators.',
 'And the 30% will be older, whiter, more rural, more male than the 70 percent.',
 'Unsettling to say the least https://t.co/EGPD5nE4qG— Norman Ornstein (@NormOrnstein) July 10, 2018    In broad strokes, Ornstein is correct.',
 'The Weldon Cooper Center for Public Service of the University of Virginia analyzed Census Bureau population projections to estimate each state’s likely population in 2040, including the expected breakdown of the population by age and gender.',
 'Although that data was released in 2016, be

In [160]:
words = word_tokenize(politicalDemographics.lower())
words

['the',
 'top',
 'of',
 'the',
 'rock',
 'at',
 'rockefeller',
 'center',
 'in',
 'new',
 'york',
 'on',
 'march',
 '16',
 ',',
 '2016',
 '.',
 '(',
 'justin',
 'lane/epa',
 ')',
 'in',
 'response',
 'to',
 'post',
 'opinion',
 'writer',
 'paul',
 'waldman',
 '’',
 's',
 'essay',
 'about',
 'the',
 'current',
 'power',
 'of',
 'the',
 'minority',
 'in',
 'american',
 'politics',
 ',',
 'the',
 'american',
 'enterprise',
 'institute',
 '’',
 's',
 'norman',
 'ornstein',
 'offered',
 'a',
 'stunning',
 'bit',
 'of',
 'data',
 'on',
 'twitter',
 '.',
 'i',
 'want',
 'to',
 'repeat',
 'a',
 'statistic',
 'i',
 'use',
 'in',
 'every',
 'talk',
 ':',
 'by',
 '2040',
 'or',
 'so',
 ',',
 '70',
 'percent',
 'of',
 'americans',
 'will',
 'live',
 'in',
 '15',
 'states',
 '.',
 'meaning',
 '30',
 'percent',
 'will',
 'choose',
 '70',
 'senators',
 '.',
 'and',
 'the',
 '30',
 '%',
 'will',
 'be',
 'older',
 ',',
 'whiter',
 ',',
 'more',
 'rural',
 ',',
 'more',
 'male',
 'than',
 'the',
 '70',


In [162]:
word_sent = [word for word in words if word not in customStopWords]

In [165]:
from nltk.probability import FreqDist

In [180]:
freq = FreqDist(word_sent)

In [168]:
from heapq import nlargest

In [174]:
nlargest(15, freq, key=freq.get)

['states',
 '’',
 'percent',
 'population',
 '16',
 'ornstein',
 '—',
 'center',
 '70',
 'country',
 'senate',
 'data',
 '2040',
 'americans',
 '30']

In [170]:
from collections import defaultdict

In [171]:
ranking = defaultdict(int)


In [172]:
for i,sent in enumerate(sentences):
    for w in word_tokenize(sent.lower()):
        if w in freq:
            ranking[i] += freq[w]

In [178]:
sentence_rank = nlargest(4, ranking, key=ranking.get)
sentence_rank

[9, 12, 6, 8]

In [179]:
[sentences[sent] for sent in sorted(sentence_rank)]

['The Weldon Cooper Center for Public Service of the University of Virginia analyzed Census Bureau population projections to estimate each state’s likely population in 2040, including the expected breakdown of the population by age and gender.',
 'Eight states will have just under half of the total population of the country, 49.5 percent, according to the Weldon Cooper Center’s estimate.',
 'The next eight most populous states will account for an additional fifth of the population, up to 69.2 percent — meaning that the 16 most populous states will be home to about 70 percent of Americans.',
 'Ornstein’s (and Waldman’s) point is clear: 30 percent of the population of the country will control 68 percent of the seats in the U.S. Senate.']