In [1]:
import sys
import platform
import json
import pickle
import time

from pathlib import Path
import tweepy
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
import gmplot

import configparser

tweepy.debug(True) # show the rate limit information

DATA_DIR = Path('data')
RESULTS_PER_QUERY = 100 

# needed for .parquet files
!pip install pyarrow 
!pip install jupyternotify

%load_ext jupyternotify



Error processing line 7 of c:\users\tommy\anaconda3\lib\site-packages\pywin32.pth:

  Traceback (most recent call last):
    File "c:\users\tommy\anaconda3\lib\site.py", line 168, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
  ModuleNotFoundError: No module named 'pywin32_bootstrap'

Remainder of file ignored




Error processing line 7 of c:\users\tommy\anaconda3\lib\site-packages\pywin32.pth:

  Traceback (most recent call last):
    File "c:\users\tommy\anaconda3\lib\site.py", line 168, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
  ModuleNotFoundError: No module named 'pywin32_bootstrap'

Remainder of file ignored


<IPython.core.display.Javascript object>

Note: The Twitter API is rate-limited to a certain number of calls per 15 minute block. If the API returns error code 429, you've called too often. Avoid calling tweepy unless you need to. Once Tweepy is running it will manage the rate limit, so if you're searching 5,000+ tweets it will take more than 15 minutes to complete. 

Setting `tweepy.debug(True)` will print information on rate limit and more. Divide the number of tweets by 100 to get the number of API calls needed. Tweepy shouldn't crash while if it hits the limit in process, but halt before breaking the limit. Be careful when initializing a new search, as Tweepy will return an error if you've reached the limit, crashing your script. 

Make sure you set your access keys correctly in `config.ini`. This file is ignored by git, so you have to make one yourself. Don't add your info to Git here. You can set `run_cell=False` to prevent the file from being overwritten once you have it configured (or just remove this cell).

In [2]:
run_cell = False # set this to True to write the config file, back to false to prevent erasing your info.

if run_cell:
    with open('config.ini', 'w') as f:

        f.write(
            "[twitter] \n" +
            "consumer_key = <YOUR INFO HERE> \n" +
            "consumer_secret = <YOUR INFO HERE> \n" + 
            "\n" +
            "access_token = <YOUR INFO HERE> \n" +
            "access_token_secret = <YOUR INFO HERE>"
        )

        f.close()

# API

First, set up the API so it can collect. Make sure your access tokens etc. are configured correctly.

In [3]:
def get_api(cfg_path='./config.ini'):

    # Read config file
    Config = configparser.ConfigParser()
    Config.read(cfg_path)

    auth = tweepy.auth.OAuthHandler(
            Config.get("twitter", "consumer_key"),
            Config.get("twitter", "consumer_secret")
        )
    auth.set_access_token(
            Config.get("twitter", "access_token"),
            Config.get("twitter", "access_token_secret")
        )
    
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    
    return api

# TweetTrawler

Make an object to handle calls to the API. The data are stored to separate pickle files (`data/*.pkl`). Once trawling is complete, use `TweetTrawler.gather_to_csv()` to gather up the pickle files and build a dataframe, which is written to csv.

In [9]:
class TweetTrawler:
    
    def __init__(self, api, data_dir=DATA_DIR):
        self.api = api
        self.dir = data_dir
        data_dir.mkdir(exist_ok=True, parents=True)
    
    def search(self, query, dates, n_tweets, lang, results_type, tweets_mode, **kwargs):
        
        since, until = dates
            
        print(f'\n\n*** RETRIEVING {n_tweets}>= {results_type.upper()} RESULTS FROM {since} TO {until} ***\n\n')

        tweets = list(tweepy.Cursor(api.search, q=query, count=RESULTS_PER_QUERY, lang=lang,
                                    since=since, until=until,
                                    results_type=results_type, tweet_mode=tweets_mode).items(n_tweets))

        df = pd.json_normalize([status._json for status in tweets])
        df.index = pd.to_datetime(df['created_at'].astype(str))
        
        # save the data
        foldername = str(results_type) + '_' + '_'.join(lang)
        (self.dir / foldername).mkdir(exist_ok=True, parents=True)
        filename = self.dir / foldername / f'{query}_{since[-2:]}_to_{until[-2:]}_{len(df)}'
        
        df.to_pickle(f'{filename}.pkl')
        df.to_csv(f'{filename}.csv')

# Retrieve Results

Use the TwitterTrawler object to retrieve the desired dataset. See `load_data.ipynb` for instructions on how to load the generated datasets from the `data` directory.

ideas and todos:
- two basic datasets: most popular, random sample (recent)
- iterate over days, collect as many tweets as you want for each day.
- The `params` dict allows for entering a bunch of parameters, check out the class above for more info.
- for the popular dataset - track popularity over time?
- we can retrieve more tweets by geocoding maybe?
- sentiment analysis on finance data?
- @Ersin - figure out (if you can) how to use a dev environment with Tweepy.

In [11]:
api = get_api()

queries = ['coronavirus', 'COVID19', 'COVID-19', 'COVID_19', 'CoronaVirusUpdates',
           'SafeHands']

date_range = ('2020-03-11', '2020-03-18')
    
DATA_DIR.mkdir(exist_ok=True, parents=True)
print(date_range)

('2020-03-11', '2020-03-18')


In [None]:
trawler = TweetTrawler(api)

# param dicts can be generated iteratively for grid search.
params = {
    'dates': date_range,
    'n_tweets': 10000,
    'lang': ['en'], # 'en', etc.
    'results_type': 'popular', # ['popular', 'recent', 'mixed']
    'tweets_mode': 'extended'
}

for query in queries:
    
    done = False 
    while not done:
        try:
            trawler.search(query, **params)
            
            print('\n\n*** QUERY COMPLETE ***\n\n')
            time.sleep(5)
            done = True
            
        except tweepy.TweepError:
            print(f'trawler search failed on query {query}. Retrying query.')
    
    %notify -m f"completed query {str(query)}"



*** RETRIEVING 10000>= POPULAR RESULTS FROM 2020-03-11 TO 2020-03-18 ***


send: b'GET /1.1/search/tweets.json?q=coronavirus&count=100&lang=%5B%27en%27%5D&since=2020-03-11&until=2020-03-18&results_type=popular&tweet_mode=extended HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: api.twitter.com\r\nAuthorization: OAuth oauth_nonce="138303778510257370821584549729", oauth_timestamp="1584549729", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="jrEWCn2eDNx0KLsqC9EJxIp9p", oauth_token="276043201-uJPOMmGaj4R4K93tP3q7oP29z69lNSrsdHDvhfHl", oauth_signature="8kKkCxBro7etZgWwZCJQKGL1UCE%3D"\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: cache-control: no-cache, no-store, must-revalidate, pre-check=0, post-check=0
header: content-disposition: attachment; filename=json.json
header: content-length: 602403
header: content-type: application/json;charset=utf-8
header: date: Wed, 18 Mar 2020 16:42:10 GMT
header: expires: Tue, 31 Mar 1981 05:00:00 GMT
header: last-modified: Wed

send: b'GET /1.1/search/tweets.json?max_id=1238916331048701964&q=coronavirus&count=100&lang=%5B%27en%27%5D&since=2020-03-11&until=2020-03-18&results_type=popular&tweet_mode=extended HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: api.twitter.com\r\nCookie: guest_id=v1%3A158454972938971931; personalization_id="v1_YXagranGLqAnv3OlP8hG4A=="; lang=en\r\nAuthorization: OAuth oauth_nonce="170651401883066053931584549734", oauth_timestamp="1584549734", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="jrEWCn2eDNx0KLsqC9EJxIp9p", oauth_token="276043201-uJPOMmGaj4R4K93tP3q7oP29z69lNSrsdHDvhfHl", oauth_signature="uy1MqsI%2F7cBiKTthVi5eMnlLa9o%3D"\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: cache-control: no-cache, no-store, must-revalidate, pre-check=0, post-check=0
header: content-disposition: attachment; filename=json.json
header: content-length: 473921
header: content-type: application/json;charset=utf-8
header: date: Wed, 18 Mar 2020 16:42:15 GMT
header: expires: T

<IPython.core.display.Javascript object>



*** RETRIEVING 10000>= POPULAR RESULTS FROM 2020-03-11 TO 2020-03-18 ***


send: b'GET /1.1/search/tweets.json?q=COVID19&count=100&lang=%5B%27en%27%5D&since=2020-03-11&until=2020-03-18&results_type=popular&tweet_mode=extended HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: api.twitter.com\r\nAuthorization: OAuth oauth_nonce="99655723728795327891584549743", oauth_timestamp="1584549743", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="jrEWCn2eDNx0KLsqC9EJxIp9p", oauth_token="276043201-uJPOMmGaj4R4K93tP3q7oP29z69lNSrsdHDvhfHl", oauth_signature="8mYTtn4rpbs4%2Fgx8n71ATAUCehk%3D"\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: cache-control: no-cache, no-store, must-revalidate, pre-check=0, post-check=0
header: content-disposition: attachment; filename=json.json
header: content-length: 684170
header: content-type: application/json;charset=utf-8
header: date: Wed, 18 Mar 2020 16:42:24 GMT
header: expires: Tue, 31 Mar 1981 05:00:00 GMT
header: last-modified: Wed, 1

# More information retrieval

In [None]:
# return all the retweets of the most highly retweeted tweets?
# more networking strategies to find more tweets?