diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..66d464d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# Ignore output of scraper
+data.sqlite
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..cab2fe1
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,35 @@
+import scraperwiki
+import simplejson
+import urllib2
+import datetime
+
+# Change QUERY to your search term of choice.
+# Examples: 'newsnight', 'from:bbcnewsnight', 'to:bbcnewsnight'
+QUERY = 'to:oneworldnl'
+RESULTS_PER_PAGE = '100'
+LANGUAGE = ''
+NUM_PAGES = 15
+
+for page in range(1, NUM_PAGES+1):
+    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&lang=%s&page=%s' \
+        % (urllib2.quote(QUERY), RESULTS_PER_PAGE, LANGUAGE, page)
+    try:
+        results_json = simplejson.loads(scraperwiki.scrape(base_url))
+        for result in results_json['results']:
+            data = {}
+            data['id'] = result['id']
+            data['text'] = result['text'].replace("&quot;", "'")
+            data['from_user'] = result['from_user']
+            data['profile_image_url'] = result['profile_image_url']
+            data['geo'] = result['geo']
+            data['source'] = result['source']
+            data['iso_language_code'] = result['iso_language_code']
+            data['from_user_name'] = result['from_user_name']
+            data['date'] = datetime.datetime.today()
+            print data['from_user'], data['text']
+            scraperwiki.sqlite.save(["id"], data)
+    except:
+        print 'Oh dear, failed to scrape %s' % base_url
+        break
+
+
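
Note: scraperwiki.sqlite.save(["id"], data) writes the scraped tweets into the data.sqlite file that the new .gitignore excludes. A minimal sketch of reading those rows back with the standard-library sqlite3 module, assuming the scraperwiki library's default table name ('data' in recent versions of the package, 'swdata' on classic ScraperWiki) — adjust TABLE if your install differs:

    import sqlite3

    # Assumption: the default table name used by scraperwiki.sqlite.save
    TABLE = 'data'

    conn = sqlite3.connect('data.sqlite')
    conn.row_factory = sqlite3.Row  # access columns by name
    for row in conn.execute('SELECT id, from_user, text, date FROM %s' % TABLE):
        print(row['from_user'], row['text'])
    conn.close()

Because the scraper saves with ["id"] as the unique key, re-running it upserts on the tweet id rather than inserting duplicate rows.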