# 1a. Historic Twitter Scraper

This notebook uses the Tweepy and GetOldTweets3 modules to collect tweets from Twitter: Tweepy reads the list of accounts through the Twitter API, and GetOldTweets3 retrieves each account's historical tweets. We collected tweets from verified (blue-checkmarked) 511 Twitter accounts in five different states. Because our focus was Twitter activity during Hurricane Matthew, we only downloaded tweets dated between October 4, 2016 and October 14, 2016.

In [1]:
import tweepy
import jsonpickle
import json
import datetime
import GetOldTweets3 as got
import time

from pandas.io.json import json_normalize

import pandas as pd

In [2]:
# Read the Twitter API credentials from a JSON file stored one directory above the notebook.
# The file must provide the keys CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN and ACCESS_SECRET.
with open('../twitter_credentials.json') as cred_data:
    info = json.load(cred_data)

# Unpack the four values in one step; a missing key raises KeyError.
consumer_key, consumer_secret, access_token, access_secret = (
    info[key]
    for key in ('CONSUMER_KEY', 'CONSUMER_SECRET', 'ACCESS_TOKEN', 'ACCESS_SECRET')
)

In [3]:
# Authenticate to Twitter
# Build the auth handler from the app credentials, then attach the user access token.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# `api` is the client object used by the following cells.
api = tweepy.API(auth)

# Check the credentials up front so a bad key is reported here rather than
# as a failure in a later cell.
try:
    api.verify_credentials()
    print("Authentication OK")
except Exception as err:
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report the cause
    # instead of hiding it.
    print("Error during authentication")
    print(f"{type(err).__name__}: {err}")

Authentication OK


In [4]:
# Look up the authenticated account and show its display name.
user = api.me()
print(user.name)

# Show the current rate-limit status reported for the search resources.
search_limits = api.rate_limit_status()['resources']['search']
print(search_limits)

Tee Moore
{'/search/tweets': {'limit': 180, 'remaining': 180, 'reset': 1564169077}}


In [5]:
# Build the username list from the members of the 511 Twitter list
# (list 'evac-route-official', owned by the account 'ClydeLazersex').
# tweepy.Cursor pages through the membership; only the screen names are kept.
my_accounts = [
    member.screen_name
    for member in tweepy.Cursor(api.list_members, 'ClydeLazersex', 'evac-route-official').items()
]

# Show list
my_accounts

['GDOTATL',
 'SCDOTMidlands',
 'SCDOTPeeDee',
 'SCDOTLowCountry',
 'SCDOTPiedmont',
 '511statewideva',
 'fl511_panhandl',
 '511Georgia',
 'fl511_state',
 'fl511_northeast',
 'fl511_southeast',
 'fl511_southwest',
 'fl511_tampabay',
 'fl511_central',
 '511centralva',
 '511hamptonroads',
 '511northernva',
 'NCDOT_Westmtn',
 'NCDOT_Triangle',
 'NCDOT_Piedmont',
 'NCDOT_Charlotte',
 'NCDOT_Asheville',
 'NCDOT_Scoast',
 'NCDOT_Ncoast']

In [1]:
# code modified from Jasmine Vasandani
# With the help of got.manager, get all the tweets from 10/04/2016 till 10/14/2016
# for every account in my_accounts and store them in one DataFrame (final_df).
# This cell needs the imports cell (time, pandas as pd, GetOldTweets3 as got) and the
# cell that builds my_accounts to have been run first in the current kernel.
t0 = time.time()

# Query window and per-account cap, named once so they are easy to change.
since_date = "2016-10-04"
# NOTE(review): the upper bound may be exclusive, in which case tweets dated
# 2016-10-14 are not returned -- confirm against the GetOldTweets3 documentation.
until_date = "2016-10-14"
max_tweets=3000

# One dictionary per collected tweet, across all accounts.
closure_list=[]

# GetOldTweets is able to get historical tweets from individual accounts
# we loop through the account list to query the GOT module
for account in my_accounts:

    # establish our twitter criteria
    tweetCriteria = (
        got.manager.TweetCriteria()
        .setUsername(account)
        .setSince(since_date)
        .setUntil(until_date)
        .setMaxTweets(max_tweets)
    )

    # print the criteria object
    print(tweetCriteria)

    # query GOT
    account_tweets = got.manager.TweetManager.getTweets(tweetCriteria)

    # An account that returns max_tweets results has hit the cap, so part of its
    # date range is probably missing; say so instead of truncating silently.
    if len(account_tweets) >= max_tweets:
        print(f"WARNING: {account} returned {len(account_tweets)} tweets, "
              f"reaching max_tweets={max_tweets}; results for this account may be truncated")

    # create one dictionary per tweet object and add it to the list
    for item in account_tweets:
        closure_list.append({
            'id': item.id,
            'username': item.username,
            'date': item.date,
            'text': item.text,
            'hashtags': item.hashtags,
            'geo': item.geo,
            # constant label written to every row collected by this notebook
            'type': 'official',
        })

# create the final dataframe (saved to CSV in a later cell) from the list in one step;
# concatenating with an empty DataFrame first added nothing
final_df = pd.DataFrame(closure_list)
print("Shape of DataFrame:", final_df.shape)
print(f"Time run: {time.time() - t0}")

NameError: name 'time' is not defined

In [7]:
# review the shape of the dataframe: (rows, columns), with one row per collected tweet
# and one column per key of the tweet dictionaries built in the scraping cell
final_df.shape

(24054, 7)

In [8]:
# spot-check one raw record: the tweet dictionary at position 500 of closure_list
# (raises IndexError if closure_list holds fewer than 501 records)
closure_list[500]

{'id': '783245751862386692',
 'username': 'SCDOTMidlands',
 'date': datetime.datetime(2016, 10, 4, 10, 1, 53, tzinfo=datetime.timezone.utc),
 'text': 'CLEARED: Construction; I-20 EB, btwn 1 mi W of Exit58 & 1 mi W of Exit61.| 6:01A',
 'hashtags': '',
 'geo': '',
 'type': 'official'}

In [9]:
# spot-check the tweet text stored in the row at position 300 of the dataframe
final_df['text'].iloc[300]

'CLEARED: Obstruction on roadway; I-20 WB, btwn 3 mi E of Exit123 & 1 mi W of Exit108.| 11:54P'

In [10]:
# list the columns from the dataframe; each one corresponds to a key of the
# per-tweet dictionaries the dataframe was built from
final_df.columns

Index(['date', 'geo', 'hashtags', 'id', 'text', 'type', 'username'], dtype='object')

In [11]:
# count the Tweets collected from each account; the result is indexed by username
tweets_per_account = final_df.groupby('username')['username'].count()
tweets_per_account

username
511Georgia           94
511centralva       2926
511hamptonroads    1733
511northernva      3000
511statewideva      976
GDOTATL              47
NCDOT_Asheville      72
NCDOT_Charlotte     181
NCDOT_Ncoast        455
NCDOT_Piedmont       94
NCDOT_Scoast        529
NCDOT_Triangle      618
NCDOT_Westmtn         2
SCDOTLowCountry     308
SCDOTMidlands       471
SCDOTPeeDee          70
SCDOTPiedmont       386
fl511_central      1906
fl511_northeast    2894
fl511_panhandl      345
fl511_southeast    3000
fl511_southwest     508
fl511_state        2631
fl511_tampabay      808
Name: username, dtype: int64

In [12]:
# save the collected tweets to CSV; index=False leaves the dataframe's row index out of the file
# (the file name carries a hard-coded date stamp, 07262019)
final_df.to_csv('../data/Tweets_Historical/historic_tweets_official_07262019.csv', index = False)

In [13]:
# read back the CSV written by the previous cell (same path) into a new dataframe
df = pd.read_csv('../data/Tweets_Historical/historic_tweets_official_07262019.csv')

In [14]:
# display the text column of the dataframe loaded from the CSV
df['text']

0        The PIOH for the SR 138/I-20 is going on now u...
1        We appreciate all the hard work our crews have...
2        Need info on re-entry? Check here! Also follow...
3        Do you need some updates? You can find ALL of ...
4        Did you know that the @511Georgia website and ...
5        We are ready to be out there tomorrow morning ...
6        The roads are definitely not clear but if anyb...
7        From dawn to dusk, we are still hard at work i...
8        For the most up to date information on re-entr...
9        We need you to have patience in clearing the r...
10       For the most up to date information on re-entr...
11       Please WAIT to re-enter the coastal areas. We ...
12       100s of GDOT employees are here for you either...
13       We will let you know immediately when roads ar...
14       You may drive a large truck or SUV but it only...
15       Hurricanes can produce deadly flooding hundred...
16       If you must drive, don't pass barricades and d.