In [1]:
import json
import nltk
from nltk.probability import FreqDist

In [2]:
# Load the dataset, parsing each line of the file as its own JSON document.
# The encoding is given explicitly so the result does not depend on the platform's
# default text encoding (which is not UTF-8 on every system).
with open('News_Category_Dataset_v2.json', 'r', encoding='utf-8') as f:
    # Skip whitespace-only lines (e.g. a trailing blank line): json.loads rejects them.
    news = [json.loads(line) for line in f if line.strip()]

In [3]:
# Pull the 'short_description' field out of every record, preserving dataset order.
news_short_description = [record['short_description'] for record in news]

In [4]:
stopwords = nltk.corpus.stopwords.words('english')
# Set copy of the stopword list: is_content_word is called for every token, so a
# constant-time set lookup avoids scanning the whole list on each call.
_stopword_set = frozenset(stopwords)


def is_content_word(word):
    """Return True if `word` is purely alphabetic and not an English stopword.

    The word is lower-cased before the stopword lookup; tokens containing digits,
    punctuation, or any other non-letter character are rejected by `str.isalpha`.
    """
    return word.lower() not in _stopword_set and word.isalpha()

In [5]:
# Tokenize each description once, building two flat token lists:
#   news_description_allwords - every token produced by the tokenizer
#   news_description_words    - only the tokens accepted by is_content_word
news_description_words = []
news_description_allwords = []
for des in news_short_description:
    tokens = nltk.word_tokenize(des)
    news_description_allwords.extend(tokens)
    news_description_words.extend(tok for tok in tokens if is_content_word(tok))

num_of_description = len(news_short_description)
print('The number of descriptions: {}'.format(num_of_description))

# Mean number of tokens (all tokens, not just content words) per description,
# rounded to the nearest integer.
avg_wordlen_description = round(len(news_description_allwords) / num_of_description)
print('Average length of descriptions: {}'.format(avg_wordlen_description))

The number of descriptions: 200853
Average length of descriptions: 23


In [6]:
# Total number of sentences found by the sentence tokenizer across all descriptions.
num_of_sen = sum(
    len(nltk.sent_tokenize(description)) for description in news_short_description
)
print('The number of sentences: {}'.format(num_of_sen))

# Mean number of sentences per description, to three decimal places.
avg_senlen_description = round(num_of_sen / len(news_short_description), 3)
print('Average number of sentences of descriptions: {}'.format(avg_senlen_description))

The number of sentences: 280300
Average number of sentences of descriptions: 1.396


In [7]:
# Frequency distribution over the content-word tokens of all descriptions.
fdist = FreqDist(news_description_words)
top_words = fdist.most_common(25)
print('25 most common words:')
print(top_words)

25 most common words:
[('one', 11060), ('time', 8840), ('people', 8603), ('like', 8160), ('new', 6886), ('us', 6869), ('said', 6414), ('would', 6055), ('get', 5869), ('life', 5846), ('make', 5399), ('know', 5320), ('Trump', 5159), ('many', 5081), ('years', 4909), ('first', 4876), ('way', 4839), ('world', 4814), ('could', 4771), ('may', 4574), ('year', 4466), ('day', 4366), ('want', 4252), ('even', 4160), ('need', 4025)]


In [8]:
# Bigrams in which both tokens pass is_content_word.
# NOTE(review): news_description_allwords is a single flat token list, so a bigram
# can pair the last token of one description with the first token of the next -
# confirm whether that is intended.
content_bigrams = [
    (first, second)
    for first, second in nltk.bigrams(news_description_allwords)
    if is_content_word(first) and is_content_word(second)
]
bigrams_fdist = nltk.FreqDist(content_bigrams)
# Header corrected: this cell reports bigrams, not single words.
print('25 most common bigrams:')
print(bigrams_fdist.most_common(25))

25 most common words:
[(('New', 'York'), 1689), (('Donald', 'Trump'), 1650), (('United', 'States'), 894), (('years', 'ago'), 739), (('HuffPost', 'Style'), 699), (('White', 'House'), 698), (('first', 'time'), 605), (('Hillary', 'Clinton'), 573), (('health', 'care'), 527), (('last', 'week'), 518), (('last', 'year'), 434), (('York', 'City'), 429), (('social', 'media'), 421), (('every', 'day'), 416), (('climate', 'change'), 411), (('Supreme', 'Court'), 409), (('feel', 'like'), 403), (('many', 'people'), 391), (('one', 'thing'), 372), (('President', 'Obama'), 368), (('Los', 'Angeles'), 349), (('new', 'study'), 326), (('make', 'sure'), 298), (('New', 'Year'), 289), (('high', 'school'), 285)]


In [9]:
# Trigrams in which all three tokens pass is_content_word.
content_trigrams = [
    (first, second, third)
    for first, second, third in nltk.trigrams(news_description_allwords)
    if is_content_word(first) and is_content_word(second) and is_content_word(third)
]
trigrams_fdist = nltk.FreqDist(content_trigrams)
print('25 most common trigrams:')
print(trigrams_fdist.most_common(25))

25 most common trigrams:
[(('New', 'York', 'City'), 429), (('New', 'York', 'Times'), 254), (('HuffPost', 'Rise', 'Morning'), 193), (('Rise', 'Morning', 'Newsbrief'), 193), (('President', 'Donald', 'Trump'), 164), (('President', 'Barack', 'Obama'), 132), (('Affordable', 'Care', 'Act'), 122), (('political', 'news', 'every'), 108), (('news', 'every', 'evening'), 107), (('home', 'story', 'idea'), 105), (('Saturday', 'Night', 'Live'), 96), (('New', 'York', 'Fashion'), 96), (('PR', 'pitches', 'sent'), 95), (('York', 'Fashion', 'Week'), 90), (('need', 'help', 'maintaining'), 79), (('personal', 'spiritual', 'practice'), 79), (('Kids', 'may', 'say'), 69), (('HuffPost', 'Style', 'beauty'), 59), (('Style', 'beauty', 'content'), 58), (('Fox', 'News', 'host'), 55), (('Mother', 'Nature', 'Network'), 53), (('popular', 'YouTube', 'videos'), 50), (('World', 'War', 'II'), 49), (('Twitter', 'never', 'fail'), 49), (('new', 'study', 'suggests'), 49)]
