Use the Article search API from New York Times ( http://developer.nytimes.com/docs/read/article_search_api_v2 ) to find (some, not all to save time) articles mentioning Finland and published between 1999 and 2009.

* What are the most common words in the lead paragraph of the stories?
* Remember to use lemmas when counting the words!

In [7]:
import requests
import json

import nltk
import nltk.data

import dateutil.parser

import collections

In [None]:
## check how many articles we have
url = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
key = 'API_KEY_HERE' ## your NY Times API key

params = {'q' : 'Finland', 'begin_date' : '20150101', 'end_data' : '20151231', 'api-key': key }

data = requests.get( url , params = params )
data = data.json()

article_count = data['response']['meta']['hits']

datas = []

## download all articles using the pagination API
for i in range( 1 , article_count / 10 + 1 ):
    
    params = {'q' : 'Finland', 'begin_date' : '20150101', 'end_data' : '20151231', 'api-key': key, 'page' : i }

    data = requests.get( url , params = params )
    data = data.json()
    
    data = data['response']['docs']

    datas += data
    
## store result for reuse
json.dump( datas , open('ny_times_2015.json', 'w') )    

{u'status': u'OK', u'response': {u'docs': [{u'type_of_material': u'News', u'blog': [], u'news_desk': u'Travel', u'lead_paragraph': u'The desire to see a rare animal sometimes just isn\u2019t enough.', u'headline': {u'main': u'In Finland, Slippery Seals', u'print_headline': u'In Finland, Freshwater Seals Prove Elusive'}, u'abstract': None, u'print_page': None, u'word_count': u'1617', u'_id': u'55e3dd5538f0d84c14715e70', u'snippet': u'The desire to see a rare animal sometimes just isn\u2019t enough.', u'source': u'The New York Times', u'slideshow_credits': None, u'web_url': u'http://www.nytimes.com/2015/09/01/travel/in-finland-freshwater-seals-prove-elusive.html', u'multimedia': [{u'subtype': u'wide', u'url': u'images/2015/08/31/travel/seal-map-1440067015944/seal-map-1440067015944-thumbWide.png', u'height': 126, u'width': 190, u'legacy': {u'wide': u'images/2015/08/31/travel/seal-map-1440067015944/seal-map-1440067015944-thumbWide.png', u'wideheight': u'126', u'widewidth': u'190'}, u'type'

In [8]:
data = json.load( open('ny_times_2015.json') )

stemma = nltk.stem.lancaster.LancasterStemmer()

words_per_month = collections.defaultdict( list )

for d in data:
    
    out = []
    
    if d['lead_paragraph']: ## remove Nones
        
        words = nltk.word_tokenize( d['lead_paragraph'] )

        for word in words:
            out.append( stemma.stem( word ) )
            
    date = dateutil.parser.parse( d['pub_date'] )
            
    words_per_month[ date.month ] += out
        
     
for month in sorted( words_per_month.keys() ):
    
    counted = collections.Counter( words_per_month[ month ] )
    
    print month, counted.most_common( 5 )

1 [(u',', 6), (u'a', 6), (u'the', 6), (u'.', 5), (u'in', 4)]
2 [(u'in', 4), (u'of', 4), (u'the', 4), (u',', 3), (u'.', 3)]
3 [(u'the', 20), (u'of', 17), (u'in', 17), (u'.', 16), (u',', 13)]
4 [(u',', 11), (u'a', 9), (u'in', 7), (u'.', 6), (u'the', 5)]
5 [(u'it', 2), (u'as', 2), (u',', 2), (u'own', 1), (u'influ', 1)]
6 [(u',', 2), (u'the', 2), (u'firm', 1), (u'husband-and-wife', 1), (u'moreau', 1)]
7 [(u'the', 6), (u'.', 3), (u'in', 2), (u',', 2), (u'to', 2)]
8 [(u'of', 3), (u'.', 2), (u'hav', 2), (u'by', 2), (u'the', 2)]
9 [(u'a', 4), (u',', 3), (u'.', 3), (u'in', 2), (u'to', 2)]
10 [(u'art', 2), (u',', 2), (u'.', 2), (u'hous', 2), (u'and', 1)]
12 [(u',', 6), (u'a', 5), (u'.', 3), (u'on', 3), (u'is', 2)]
