
Commit
Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
timpryce committed Jun 17, 2014
0 parents commit 8332e2d
Showing 3 changed files with 54 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
3 changes: 3 additions & 0 deletions README.textile
@@ -0,0 +1,3 @@
This is a Twitter scraper that takes a SCREENNAME and outputs a database of that Twitter account's followers' usernames, ids, locations, profile image URLs and bios.

Feel free to fork!
49 changes: 49 additions & 0 deletions scraper.py
@@ -0,0 +1,49 @@
import scraperwiki
import simplejson
import urllib2

SCREENNAME = 'ScraperWiki'
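# Change SCREENNAME to scrape a different account's followers.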

# API help: https://dev.twitter.com/docs/api/1/get/followers/ids
url = 'http://api.twitter.com/1/followers/ids.json?screen_name=%s' % (urllib2.quote(SCREENNAME))
print url
response = simplejson.loads(scraperwiki.scrape(url))
follower_ids = response['ids']
print "Found %d followers of %s" % (len(follower_ids), SCREENNAME)
follower_ids.reverse()  # earliest followers first, so batch numbers stay stable between runs
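# Note: the v1 followers/ids endpoint pages its results (up to 5,000 ids per
# page, with a next_cursor field for the rest); only the first page is
# fetched here, so accounts with more followers than that would need
# cursor-based paging.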

# Groups a list into chunks of a given size. Only complete chunks are
# yielded, so batch numbers stay stable as the follower list grows; the
# newest, incomplete batch is skipped until it fills up on a later run.
def group(lst, n):
    for i in range(0, len(lst), n):
        val = lst[i:i+n]
        if len(val) == n:
            yield tuple(val)
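# For example, list(group([1, 2, 3, 4, 5], 2)) gives [(1, 2), (3, 4)];
# the trailing 5 would only appear once a sixth element arrives.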

# Where to start? Overlap one batch to increase hit rate if people unfollow etc.
batchdone = scraperwiki.sqlite.get_var('batchdone', 1)
batchstart = batchdone - 1
if batchstart < 1:
batchstart = 1
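# Worked example: if the previous run finished batch 7, get_var returns 7 and
# batchstart becomes 6, so batches 1-5 are skipped below and batches 6 onwards
# are re-fetched. On a first run get_var falls back to its default of 1 and
# everything is processed.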

# Take 100 at a time, and do one lookup call for each batch
c = 0
for follower_list in group(follower_ids, 100):
    c = c + 1
    if c < batchstart:
        continue
    print "batch", c, "of", len(follower_ids) / 100
    print 'batch of ids:', follower_list
    url = 'http://api.twitter.com/1/users/lookup.json?user_id=%s' % (urllib2.quote(','.join(map(str, follower_list))))
    print 'getting url:', url
    details_json = simplejson.loads(scraperwiki.scrape(url))
    print details_json
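    # users/lookup accepts up to 100 comma-separated user_ids per request
    # under API v1, which is why followers are chunked in hundreds above;
    # it returns one JSON user object per id with the fields saved below.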
    for detail in details_json:
        data = {
            'screen_name': detail['screen_name'],
            'id': detail['id'],
            'location': detail['location'],
            'bio': detail['description'],
            'image': detail['profile_image_url'],
        }
        print "Found person", data
        scraperwiki.sqlite.save(unique_keys=['id'], data=data)
    # Record progress after each completed batch so an interrupted run can resume.
    scraperwiki.sqlite.save_var('batchdone', c)



