# Saving tweets to a database

This notebook demonstrates how easy it is to scrape hundreds of tweets, and how easy it is to then save them to a MongoDB database.

In [1]:
from pymongo import MongoClient

import os

from pprint import pprint

from twitter_fire_scraper.config import Config
from twitter_fire_scraper.scraper import Scraper
from twitter_fire_scraper.twitter import GEOBOX_CHICAGO
from twitter_fire_scraper.util import geobox_to_geocode, flatten_status_dict, save_statuses_dict_to_mongodb

In [2]:
# Get a Twitter developer account and get an API key!

from twitter_fire_scraper.scraper import Scraper
from twitter_fire_scraper.twitter import TwitterAuthentication

# Credentials can be pasted directly into this object. If the placeholders are
# left untouched, the cell falls back to TwitterAuthentication.autodetect_twitter_auth().
twauth = TwitterAuthentication(
    consumer_key="FILL ME IN!",
    consumer_secret="FILL ME IN!",
    access_token="FILL ME IN!",
    access_token_secret="FILL ME IN!",
)

print("You can fill these in yourself and avoid having to use `secrets.json` inside of your home folder.")

# Resolve the secrets file location once so that the existence check and the
# help message always refer to the same path. Building it with os.path.join
# also gives the platform's own separator (the old message expanded
# "~\\secrets.json", which os.path.expanduser leaves unexpanded on POSIX).
secrets_path = os.path.join(os.path.expanduser("~"), "secrets.json")

if twauth.consumer_key == "FILL ME IN!":
    # The placeholder is still in place, so no keys were supplied inline.

    if not os.path.exists(secrets_path):
        # Presumably autodetection reads this file, so without it there is
        # nothing to fall back on — tell the user how to fix it and stop.
        print(
            "This demo will not work without either a valid TwitterAuthentication object or a file that has your secrets in it.")
        print(
            "Either make a file at `{}` containing your keys, or put them directly into the TwitterAuthentication object.".format(
                secrets_path))
        print("Read the README about this to know more.")
        print("Aborting!")
        # NOTE(review): inside a Jupyter kernel `exit` is IPython's helper and
        # may shut the kernel down rather than just stop this cell — confirm
        # that is the intended behaviour.
        exit(1)
    else:
        # Autodetect of Twitter API keys should work.
        twauth = TwitterAuthentication.autodetect_twitter_auth()

In [3]:
# Create the scraper, handing it the `twauth` credentials object.
scraper = Scraper(twitter_authentication=twauth)


We can also tell our scraper to only look within 20 miles of Chicago's center.

Not everyone uses geotagging, so this will reduce the number of hits you get.

In [4]:
# Search radius, passed to geobox_to_geocode together with the Chicago geobox.
radius = "20mi"
# Geocode value that is later supplied as the `geocode` argument when scraping.
chicago_geocode = geobox_to_geocode(GEOBOX_CHICAGO, radius)

So, with a few lines of code we can scrape a large number of tweets matching a set of search terms.

In [5]:
# Search terms to scrape; a set, so ordering is not significant.
terms = {
    "fire",
    "condofire",
    "dwellingfire",
    "housefire",
    "fireloss",
    "firedamage",
}

In [6]:
# Scrape results.
geotagged_results = scraper.scrape_terms(geocode=chicago_geocode, terms=terms, count=200)

for category, statuses in geotagged_results.items():
    print("{:15s}: {} hits".format(category, len(statuses)))
print(terms)

fireloss       : 2 hits
fire           : 200 hits
firedamage     : 6 hits
{'condofire', 'fireloss', 'fire', 'dwellingfire', 'firedamage', 'housefire'}


If a category has zero hits (or does not appear in the output above at all), then nobody has posted a GEOTAGGED tweet about that category.

Saving all of those tweets to a database is also made easy.

In [7]:
# Connect to the MongoDB server at localhost:27017.
mongoclient = MongoClient("mongodb://localhost:27017/")

# Select the database that will hold the Chicago-geotagged tweets.
mongodb = mongoclient.get_database('saved_tweets_geotagged_chicago')

# Hand the scraped results and the database to the project's save helper.
save_statuses_dict_to_mongodb(geotagged_results, mongodb)

At this point, opening MongoDB Compass Community or some other MongoDB database browser will let you see our freshly-scraped tweets!

Now, let's contrast this with a search that is not restricted to geotagged tweets. We should get a LOT more hits.

In [8]:
# Scrape the same terms again, this time with no geocode argument.
not_geotagged_results = scraper.scrape_terms(
    terms=terms,
    count=200,
)

# Print one summary line per category present in the results.
hits_line = "{:15s}: {} hits"
for term, tweets in not_geotagged_results.items():
    print(hits_line.format(term, len(tweets)))

# Show the full set of requested terms for comparison with the lines above.
print(terms)

Rate limit reached. Sleeping for: 502


fireloss       : 4 hits
fire           : 200 hits
dwellingfire   : 17 hits
firedamage     : 126 hits
housefire      : 200 hits
{'condofire', 'fireloss', 'fire', 'dwellingfire', 'firedamage', 'housefire'}


In [9]:
# Select a separate database for the tweets scraped without a geocode.
mongodb = mongoclient.get_database('saved_tweets_not_geotagged')

# Hand the scraped results and the database to the project's save helper.
save_statuses_dict_to_mongodb(not_geotagged_results, mongodb)