In [2]:
# import some Python dependencies

import urllib2
import json
import datetime
import csv
import time
import os

In [3]:
# Since the code output in this notebook leaks the app_secret,
# it has been reset by the time you read this.

app_id = os.getenv("FACEBOOK_APP_ID")
app_secret = os.getenv("FACEBOOK_APP_SECRET")

access_token = app_id + "|" + app_secret

In [4]:
page_id = 'nytimes'

In [5]:
def testFacebookPageData(page_id, access_token):
    
    # construct the URL string
    base = "https://graph.facebook.com/v2.4"
    node = "/" + page_id
    parameters = "/?access_token=%s" % access_token
    url = base + node + parameters
    
    # retrieve data
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    data = json.loads(response.read())
    
    print json.dumps(data, indent=4, sort_keys=True)

In [6]:
testFacebookPageData(page_id, access_token)

{
    "id": "5281959998", 
    "name": "The New York Times"
}


In [8]:
def request_until_succeed(url):
    req = urllib2.Request(url)
    success = False
    while success is False:
        try: 
            response = urllib2.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception, e:
            print e
            time.sleep(5)
            
            print "Error for URL %s: %s" % (url, datetime.datetime.now())

    return response.read()

In [11]:
def testFacebookPageFeedData(page_id, access_token):
    
    # construct the URL string
    base = "https://graph.facebook.com/v2.4"
    node = "/" + page_id + "/feed" # changed
    parameters = "/?access_token=%s" % access_token
    url = base + node + parameters
    
    # retrieve data
    data = json.loads(request_until_succeed(url))
    
    print json.dumps(data, indent=4, sort_keys=True)
    

testFacebookPageFeedData(page_id, access_token)

{
    "data": [
        {
            "created_time": "2016-04-19T20:26:06+0000", 
            "id": "5281959998_10150793902249999", 
            "message": "He wiped blood onto his 5-year-old daughter's shirt. Then he changed his clothes and fled the scene."
        }, 
        {
            "created_time": "2016-04-19T19:55:01+0000", 
            "id": "5281959998_10150793836489999", 
            "message": "The New York Yankees have the highest success rate challenging umpires' calls. The reason? This guy."
        }, 
        {
            "created_time": "2016-04-19T19:37:01+0000", 
            "id": "5281959998_10150793864759999", 
            "message": "Alissa J. Rubin, who won a Pulitzer Prize for reporting on the abuse and injustice faced by women in Afghanistan and the troubled Western response to it, answers your questions.", 
            "story": "The New York Times was live."
        }, 
        {
            "created_time": "2016-04-19T19:31:43+0000", 
            "id": 

In [12]:
def getFacebookPageFeedData(page_id, access_token, num_statuses):
    
    # construct the URL string
    base = "https://graph.facebook.com"
    node = "/" + page_id + "/feed" 
    parameters = "/?fields=message,link,created_time,type,name,id,likes.limit(1).summary(true),comments.limit(1).summary(true),shares&limit=%s&access_token=%s" % (num_statuses, access_token) # changed
    url = base + node + parameters
    
    # retrieve data
    data = json.loads(request_until_succeed(url))
    
    return data
    

test_status = getFacebookPageFeedData(page_id, access_token, 1)["data"][0]
print json.dumps(test_status, indent=4, sort_keys=True)

{
    "comments": {
        "data": [
            {
                "created_time": "2016-04-19T20:29:51+0000", 
                "from": {
                    "id": "10153597244488963", 
                    "name": "Karen Crook"
                }, 
                "id": "10150793902249999_10150793904884999", 
                "message": "What a disgusting excuse for a human being!!!! Not only did he kill his daughter, he then assaulted his other daughter and put the blame on her. Revolting! I hope they throw the book at this pathetic human."
            }
        ], 
        "paging": {
            "cursors": {
                "after": "MwZDZD", 
                "before": "MwZDZD"
            }, 
            "next": "https://graph.facebook.com/v2.6/5281959998_10150793902249999/comments?access_token=1588347538160720%7Cd77eae2a264f512a1acb6f84df6bc29b&summary=true&limit=1&after=MwZDZD"
        }, 
        "summary": {
            "can_comment": false, 
            "order": "ranked", 
    

In [13]:
def processFacebookPageFeedStatus(status):
    
    # The status is now a Python dictionary, so for top-level items,
    # we can simply call the key.
    
    # Additionally, some items may not always exist,
    # so must check for existence first
    
    status_id = status['id']
    status_message = '' if 'message' not in status.keys() else status['message'].encode('utf-8')
    link_name = '' if 'name' not in status.keys() else status['name'].encode('utf-8')
    status_type = status['type']
    status_link = '' if 'link' not in status.keys() else status['link']
    
    
    # Time needs special care since a) it's in UTC and
    # b) it's not easy to use in statistical programs.
    
    status_published = datetime.datetime.strptime(status['created_time'],'%Y-%m-%dT%H:%M:%S+0000')
    status_published = status_published + datetime.timedelta(hours=-5) # EST
    status_published = status_published.strftime('%Y-%m-%d %H:%M:%S') # best time format for spreadsheet programs
    
    # Nested items require chaining dictionary keys.
    
    num_likes = 0 if 'likes' not in status.keys() else status['likes']['summary']['total_count']
    num_comments = 0 if 'comments' not in status.keys() else status['comments']['summary']['total_count']
    num_shares = 0 if 'shares' not in status.keys() else status['shares']['count']
    
    # return a tuple of all processed data
    return (status_id, status_message, link_name, status_type, status_link,
           status_published, num_likes, num_comments, num_shares)

processed_test_status = processFacebookPageFeedStatus(test_status)
print processed_test_status

(u'5281959998_10150793902249999', "He wiped blood onto his 5-year-old daughter's shirt. Then he changed his clothes and fled the scene.", 'Father Tried to Blame 5-Year-Old With Killing of 4-Year-Old, Police Say', u'link', u'http://nyti.ms/1qD0ygD', '2016-04-19 15:26:06', 22, 3, 13)


In [14]:
def scrapeFacebookPageFeedStatus(page_id, access_token):
    with open('%s_facebook_statuses.csv' % page_id, 'wb') as file:
        w = csv.writer(file)
        w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
           "status_published", "num_likes", "num_comments", "num_shares"])
        
        has_next_page = True
        num_processed = 0   # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()
        
        print "Scraping %s Facebook Page: %s\n" % (page_id, scrape_starttime)
        
        statuses = getFacebookPageFeedData(page_id, access_token, 100)
        
        while has_next_page:
            for status in statuses['data']:
                w.writerow(processFacebookPageFeedStatus(status))
                
                # output progress occasionally to make sure code is not stalling
                num_processed += 1
                if num_processed % 1000 == 0:
                    print "%s Statuses Processed: %s" % (num_processed, datetime.datetime.now())
                    
            # if there is no next page, we're done.
            if 'paging' in statuses.keys():
                statuses = json.loads(request_until_succeed(statuses['paging']['next']))
            else:
                has_next_page = False
                
        
        print "\nDone!\n%s Statuses Processed in %s" % (num_processed, datetime.datetime.now() - scrape_starttime)


scrapeFacebookPageFeedStatus(page_id, access_token)

Scraping nytimes Facebook Page: 2016-04-19 13:41:26.016741

1000 Statuses Processed: 2016-04-19 13:42:07.108040
2000 Statuses Processed: 2016-04-19 13:42:47.508978
3000 Statuses Processed: 2016-04-19 13:43:29.922538
4000 Statuses Processed: 2016-04-19 13:44:10.557863
5000 Statuses Processed: 2016-04-19 13:44:59.414747
6000 Statuses Processed: 2016-04-19 13:45:38.018267
7000 Statuses Processed: 2016-04-19 13:46:22.777741
HTTP Error 500: Internal Server Error
Error for URL https://graph.facebook.com/v2.6/5281959998/feed?fields=message,link,created_time,type,name,id,likes.limit%281%29.summary%28true%29,comments.limit%281%29.summary%28true%29,shares&limit=100&__paging_token=enc_AdAn8E6iddlsECrCRLbGRYvA3fQBcwmB7dAI9oKNHcPsuZAlvGJkXjZBw0ThnZC0i6R1ilASzqHv76HITgwlKTYnuSM&access_token=1588347538160720|d77eae2a264f512a1acb6f84df6bc29b&until=1446123307: 2016-04-19 13:47:01.933752
8000 Statuses Processed: 2016-04-19 13:47:17.341622
9000 Statuses Processed: 2016-04-19 13:47:59.533960
10000 Statuse