This file exists to demonstrate a way to use the CSV-saving feature of our scraper.

**Warning: You could be scraping upwards of 10,000 tweets if you run this entire notebook.**

In [1]:
print("This prompt is to make sure you know you'll be potentially querying twitter for 10,000 tweets and are okay with that.")

inp = input('(y/n)\n > ')

if inp != 'y':
    print("Aborted!")
    raise KeyboardInterrupt
else:
    print("Continuing.")


This prompt is to make sure you know you'll be potentially querying twitter for 10,000 tweets and are okay with that.
(y/n)
 > y
Continuing.


In [2]:
# This is to force using installed packages for testing purposes.
from __future__ import absolute_import

import os

from pprint import pprint

import numpy as np
import matplotlib.pyplot as plt
import matplotlib

import twitter_fire_scraper
from twitter_fire_scraper.scraper import Scraper
from twitter_fire_scraper.util import geobox_to_geocode

from twitter_fire_scraper.twitter import GEOBOX_CHICAGO

from tweepy import Status

In [3]:
output_path='output'

geocode=geobox_to_geocode(GEOBOX_CHICAGO, "20mi")

In [4]:
# Get a Twitter developer account and get an API key!

from twitter_fire_scraper.scraper import Scraper
from twitter_fire_scraper.twitter import TwitterAuthentication

twauth = TwitterAuthentication(
    consumer_key="FILL ME IN!",
    consumer_secret="FILL ME IN!",
    access_token="FILL ME IN!",
    access_token_secret="FILL ME IN!",
)

print("You can fill these in yourself and avoid having to use `secrets.json` inside of your home folder.")

if twauth.consumer_key == "FILL ME IN!":
    # If they have not set up the twauth variable,
    
    if not os.path.exists(os.path.expanduser("~/secrets.json")):
        # If autodetect of twitter API keys will surely fail,
        
        print(
            "This demo will not work without either a valid TwitterAuthentication object or a file that has your secrets in it.")
        print(
            "Either make a file at `{}` containing your keys, or put them directly into the TwitterAuthentication object.".format(
                os.path.expanduser("~\\secrets.json")))
        print("Read the README about this to know more.")
        print("Aborting!")
        exit(1)
    else:
        # Autodetectof twitter API keys should work.
        twauth = TwitterAuthentication.autodetect_twitter_auth()


You can fill these in yourself and avoid having to use `secrets.json` inside of your home folder.


In [5]:
scraper = Scraper(twitter_authentication=twauth)

if not os.path.exists(output_path):
    os.mkdir(output_path)

In [6]:
def print_status(terms, count):
    print("Saving {} each of {} terms for a total of {} tweets:".format(count, len(terms), (count*len(terms))))
    print('\n'.join(terms))

In [7]:
def scrape_and_save_csv(scraper, csv, terms, count):
    if not os.path.isfile(csv):
        results = scraper.scrape(terms=terms, count=count, include_retweets=False, geocode=geocode)

        scraper.save_statusdict_to_csv(results, csv)

        print("Results saved to {}".format(csv))
    else:
        print("CSV already exists at {}!".format(csv))
        print("Delete it to grab unrelated tweets again.")

# Scraping

The actual scraping happens below this cell.

In [8]:
unrelated_csv = os.path.join(output_path, 'unrelated-terms.csv')

unrelated_terms = {'chicago', 'football', 'weather', 'congress', 'trains'}
count = 20

print_status(unrelated_terms, count)

scrape_and_save_csv(scraper, unrelated_csv, unrelated_terms, count)

Saving 20 each of 5 terms for a total of 100 tweets:
weather
trains
congress
chicago
football
CSV already exists at output\unrelated-terms.csv!
Delete it to grab unrelated tweets again.


In [9]:
disaster_csv = os.path.join(output_path, 'disaster-terms.csv')

disaster_terms = {'fire','house fire', 'landslide', 'mudslide', 'emergency'}
count = 400

print_status(disaster_terms, count)

scrape_and_save_csv(scraper, disaster_csv, disaster_terms, count)

Saving 400 each of 5 terms for a total of 2000 tweets:
house fire
fire
landslide
emergency
mudslide
CSV already exists at output\disaster-terms.csv!
Delete it to grab unrelated tweets again.


In [10]:
fire_csv = os.path.join(output_path, 'fire-related.csv')

fire_terms = {'fire', 'chicago fire', 
              'house fire', 'apartment fire',
              'cooking fire', }
count = 200

print_status(fire_terms, count)

scrape_and_save_csv(scraper, fire_csv, fire_terms, count)

Saving 200 each of 5 terms for a total of 1000 tweets:
house fire
apartment fire
fire
chicago fire
cooking fire
CSV already exists at output\fire-related.csv!
Delete it to grab unrelated tweets again.


In [11]:
flood_csv = os.path.join(output_path, 'flood-related.csv')

flood_terms = {'flood', 'flooding', 
              'flood waters', 'river flood',
              'cooking fire', }
count = 200

print_status(flood_terms, count)

scrape_and_save_csv(scraper, flood_csv, flood_terms, count)

Saving 200 each of 5 terms for a total of 1000 tweets:
flooding
river flood
flood waters
cooking fire
flood
CSV already exists at output\flood-related.csv!
Delete it to grab unrelated tweets again.
