# DEMO: Extract Tweets and Handle Metadata

This notebook demonstrates a process of extracting tweets for the IST 736 Final Project.

For this purpose, users Twitter handles and timeframe are key parameters for extracting the right data.

In [1]:
import tweepy 
import os
import bz2
import json
import time
from datetime import datetime, date, time, timedelta
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
import seaborn as sns 
import re

In [2]:
# authorization codes here from personal Twitter developer account
# https://apps.twitter.com/
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_SECRET = ''

## Data Source
This data source includes index, people names, class labels, and twitter handle

In [3]:
# This data source includes index, people names, class labels, and twitter handle
People = pd.read_excel('data/People.xlsx', sheet_name='All') # data source
People.head()

Unnamed: 0,Key,Name,Party,State,Governor,Senate,House,Ran 2020,Ran 2016,positions held,Twitter Handle
0,Aaron Schock-Illinois,Aaron Schock,Republican,Illinois,0,0,1,0,0,1,-
1,Abby Finkenauer-Iowa,Abby Finkenauer,Democratic,Iowa,0,0,1,0,0,1,-
2,Abigail Spanberger-Virginia,Abigail Spanberger,Democratic,Virginia,0,0,1,0,0,1,-
3,Adam Kinzinger-Illinois,Adam Kinzinger,Republican,Illinois,0,0,1,0,0,1,-
4,Adam Putnam-Florida,Adam Putnam,Republican,Florida,0,0,1,0,0,1,-


## Specify list of Twitter handles
These will be used to query Twitter

In [4]:
People['Twitter Handle'] = People['Twitter Handle'].fillna('-')
hlist = People.loc[People['Twitter Handle'] != '-']['Twitter Handle'].tolist()
print("There are {:d} Twitter handles in the data".format(len(hlist)))
print("Sample:", hlist[:5])

There are 110 Twitter handles in the data
Sample: ['amyklobuchar', 'AndrewYang', 'SenAngusKing', 'AsaHutchinson', 'BarbaraBoxer']


## Data Tools

In [5]:
def oauth_login():
    """login to Twitter with ordinary rate limiting
    needs defined authorization codes for personal twitter developer application
    CONSUMER_KEY (consumer api key)
    CONSUMER_SECRET (consumer api secret key)
    OAUTH_TOKEN (access token)
    OAUTH_SECRET (access token secret)
    Returns:
        [tweepy.api.API] -- [tweepy api]
    """
    # get the authorization from Twitter and save in the Tweepy package
    auth = tweepy.OAuthHandler(CONSUMER_KEY,CONSUMER_SECRET)
    auth.set_access_token(OAUTH_TOKEN,OAUTH_SECRET)
    tweepy_api = tweepy.API(auth)
    # if a null api is returned, give error message
    if (not tweepy_api):
        print ("Problem Connecting to API with OAuth")
        # return the Twitter api object that allows access for the Tweepy api functions
    return tweepy_api

def get_tweets(api, query, start, end, max_results=20):
    """Uses the tweepy Cursor to wrap a twitter api search for the query string
    returns json formatted results
    Arguments:
        api {[tweepy.api.API]} -- [tweepy api]
        query {[str]} -- [twitter handle]
    
    Keyword Arguments:
        start {str} -- [timeframe start e.g. '20200411']
        end {str} -- [timeframe end e.g. '20200412']
        max_results {int} -- [number of tweets] (default: {20})
    
    Returns:
        [list] -- [list of dictionaries where each element is a tweet]
    """
    start = datetime.strptime(start, '%Y%m%d')
    end = datetime.strptime(end, '%Y%m%d')
    # the first search initializes a cursor, stored in the metadata results,
    # that allows next searches to return additional tweets
    search_results = [status for status in tweepy.Cursor(api.user_timeline, id=query).items(max_results) if status.created_at < end and status.created_at > start]  
    # for each tweet, get the json representation
    tweets = [tweet._json for tweet in search_results]  
    return tweets

def get_handle_data(api, query):
    """Uses the tweepy Cursor to wrap a twitter api search for the query string
    returns json formatted results
    Arguments:
        api {[tweepy.api.API]} -- [tweepy api]
        query {[str]} -- [handle text]    
    
    Returns:
        [dict] -- [dictionary of metadata for a handle]
    """
    item = api.get_user(query)
    hdict = {}
    hdict['handle'] = query
    hdict['name'] = item.name
    hdict['created_at'] = item.created_at
    hdict['screen_name'] = item.screen_name 
    hdict['description'] = item.description 
    hdict['statuses_count'] = item.statuses_count # number of tweets published 
    hdict['friends_count'] = item.friends_count 
    hdict['followers_count'] = item.followers_count
    return hdict

## Establish API

In [6]:
api = oauth_login()
print("Twitter OAuthorization")

Twitter OAuthorization


## Get Handle Meta Data

In [8]:
%%time
handle_data = [] # list of dictionaries where each element is a twitter handle
handle_err_list = [] # list of handles for which data could not be found
for h in hlist:
    try:
        handle_data.append(get_handle_data(api, query = h))
    except:
        handle_err_list.append(h)

print("Metadata collected for {:d} Twitter handles".format(len(handle_data)))
print("Could not get data for {:d} Twitter handles: {:s}".format(len(handle_err_list), str(handle_err_list)))

Metadata collected for 109 Twitter handles
Could not get data for 1 Twitter handles: ['lisamurkowaki']
Wall time: 24.3 s


## Summary of Handle Data

In [9]:
df_handle = pd.DataFrame(handle_data)
df_handle.describe()

Unnamed: 0,statuses_count,friends_count,followers_count
count,109.0,109.0,109.0
mean,9420.0,4661.348624,1735197.0
std,10469.155728,16318.042178,7836438.0
min,12.0,18.0,4585.0
25%,3936.0,383.0,54299.0
50%,6876.0,899.0,157858.0
75%,11511.0,2622.0,917202.0
max,71491.0,138405.0,76688940.0


## Get Tweets and Save Data
In this demonstration, only 50 tweets are extracted per user from '20150101' to '20191201'

In [10]:
%%time
# filename of data - json formatted tweets
fname = "data/tweets_demo.json"
startdate = '20150101'
enddate = '20191201'
maxresults = 50
## write tweets to a json file
tweets_list = []
handle_err_list = [] # list of handles for which data could not be found
for handle in hlist:
    try:
        tweets_list.extend(get_tweets(api, start = startdate, end = enddate, query = handle, max_results=maxresults))
    except:
        handle_err_list.append(handle)
with bz2.BZ2File(fname, 'w') as fout:
    fout.write(json.dumps(tweets_list).encode('utf-8'))
print("{:d} tweets collected for {:d} users".format(len(tweets_list), len(hlist)))
if len(handle_err_list) > 0:
    print("Could not get data for {:d} Twitter handles: {:s}".format(len(handle_err_list), str(handle_err_list)))
print("Results are saved in {:s}".format(fname))

499 tweets collected for 110 users
Could not get data for 1 Twitter handles: ['lisamurkowaki']
Results are saved in data/tweets_demo.json
Wall time: 1min 56s


In [13]:
# load tweet data for analysis and models
with bz2.BZ2File(fname, 'r') as fin:
    data = json.loads(fin.read().decode('utf-8'))

In [14]:
# to dataframe and save handle metadata
df_handle.to_csv('data/handle_data_demo.csv', index=None)