In [1]:
import sys
import platform
import json
import pickle
import time

from pathlib import Path
import tweepy
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
import gmplot

import configparser

tweepy.debug(True) # show the rate limit information

DATA_DIR = Path('data')
RESULTS_PER_QUERY = 100 

# needed for .parquet files
!pip install pyarrow 
!pip install jupyternotify

%load_ext jupyternotify



Error processing line 7 of c:\users\tommy\anaconda3\lib\site-packages\pywin32.pth:

  Traceback (most recent call last):
    File "c:\users\tommy\anaconda3\lib\site.py", line 168, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
  ModuleNotFoundError: No module named 'pywin32_bootstrap'

Remainder of file ignored




Error processing line 7 of c:\users\tommy\anaconda3\lib\site-packages\pywin32.pth:

  Traceback (most recent call last):
    File "c:\users\tommy\anaconda3\lib\site.py", line 168, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
  ModuleNotFoundError: No module named 'pywin32_bootstrap'

Remainder of file ignored


<IPython.core.display.Javascript object>

Note: The Twitter API is rate-limited to a certain number of calls per 15 minute block. If the API returns error code 429, you've called too often. Avoid calling tweepy unless you need to. Once Tweepy is running it will manage the rate limit, so if you're searching 5,000+ tweets it will take more than 15 minutes to complete. 

Setting `tweepy.debug(True)` will print information on rate limit and more. Divide the number of tweets by 100 to get the number of API calls needed. Tweepy shouldn't crash while if it hits the limit in process, but halt before breaking the limit. Be careful when initializing a new search, as Tweepy will return an error if you've reached the limit, crashing your script. 

Make sure you set your access keys correctly in `config.ini`. This file is ignored by git, so you have to make one yourself. Don't add your info to Git here. You can set `run_cell=False` to prevent the file from being overwritten once you have it configured (or just remove this cell).

In [2]:
run_cell = False # set this to True to write the config file, back to false to prevent erasing your info.

if run_cell:
    with open('config.ini', 'w') as f:

        f.write(
            "[twitter] \n" +
            "consumer_key = <YOUR INFO HERE> \n" +
            "consumer_secret = <YOUR INFO HERE> \n" + 
            "\n" +
            "access_token = <YOUR INFO HERE> \n" +
            "access_token_secret = <YOUR INFO HERE>"
        )

        f.close()

# API

First, set up the API so it can collect. Make sure your access tokens etc. are configured correctly.

In [3]:
def get_api(cfg_path='./config.ini'):

    # Read config file
    Config = configparser.ConfigParser()
    Config.read(cfg_path)

    auth = tweepy.auth.OAuthHandler(
            Config.get("twitter", "consumer_key"),
            Config.get("twitter", "consumer_secret")
        )
    auth.set_access_token(
            Config.get("twitter", "access_token"),
            Config.get("twitter", "access_token_secret")
        )
    
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    
    return api

# TweetTrawler

Make an object to handle calls to the API. The data are stored to separate pickle files (`data/*.pkl`). Once trawling is complete, use `TweetTrawler.gather_to_csv()` to gather up the pickle files and build a dataframe, which is written to csv.

In [4]:
class TweetTrawler:
    
    def __init__(self, api, data_dir=DATA_DIR):
        self.api = api
        self.dir = data_dir
        data_dir.mkdir(exist_ok=True, parents=True)
    
    def print_lookup_table(self):
        """Use Ctrl+F to find the WoeID you need."""
        print(self.api.trends_available())
        
        
    def get_trends(self, woeid=1):

        trends = self.api.trends_place(woeid)[0]['trends']
        result = pd.json_normalize(trends)
        
        return result
        
    def search(self, query, dates, n_tweets, lang, results_type, tweets_mode, **kwargs):
        
        since, until = dates
            
        print(f'\n\n*** RETRIEVING {n_tweets}>= {results_type.upper()} RESULTS FROM {since} TO {until} ***\n\n')

        tweets = list(tweepy.Cursor(api.search, q=query, count=RESULTS_PER_QUERY, lang=lang,
                                    since=since, until=until,
                                    results_type=results_type, tweet_mode=tweets_mode).items(n_tweets))

        df = pd.json_normalize([status._json for status in tweets])
        df.index = pd.to_datetime(df['created_at'].astype(str))
        
        # save the data
        foldername = str(results_type) + '_' + '_'.join(lang)
        (self.dir / foldername).mkdir(exist_ok=True, parents=True)
        filename = self.dir / foldername / f'{query}_{since[-2:]}_to_{until[-2:]}_{len(df)}'
        
        df.to_pickle(f'{filename}.pkl')
        df.to_csv(f'{filename}.csv')

In [5]:
# x = TweetTrawler(api)

# trends = x.get_trends(1)

# Retrieve Results

Use the TwitterTrawler object to retrieve the desired dataset. See `load_data.ipynb` for instructions on how to load the generated datasets from the `data` directory.

ideas and todos:
- two basic datasets: most popular, random sample (recent)
- iterate over days, collect as many tweets as you want for each day.
- The `params` dict allows for entering a bunch of parameters, check out the class above for more info.
- for the popular dataset - track popularity over time?
- we can retrieve more tweets by geocoding maybe?
- sentiment analysis on finance data?
- @Ersin - figure out (if you can) how to use a dev environment with Tweepy.

In [6]:
api = get_api()

queries = ['coronavirus', 'COVID19', 'COVID-19', 'COVID_19', 'CoronaVirusUpdates',
           'SafeHands']

date_range = ('2020-03-11', '2020-03-18')
    
DATA_DIR.mkdir(exist_ok=True, parents=True)
print(date_range)

('2020-03-11', '2020-03-18')


In [7]:
trawler = TweetTrawler(api)

# param dicts can be generated iteratively for grid search.
params = {
    'dates': date_range,
    'n_tweets': 100,
    'lang': ['nl'], # 'en', etc.
    'results_type': 'popular', # ['popular', 'recent', 'mixed']
    'tweets_mode': 'extended'
}

for query in queries:
    
    done = False 
    while not done:
        try:
            trawler.search(query, **params)
            
            print('\n\n*** QUERY COMPLETE ***\n\n')
            time.sleep(5)
            done = True
            
        except tweepy.RateLimitError:
            print(f'trawler search failed on query {query}. Retrying query.')
    
    %notify -m f"completed query {str(query)}"



*** RETRIEVING 100>= POPULAR RESULTS FROM 2020-03-11 TO 2020-03-18 ***




Rate limit reached. Sleeping for: 30


send: b'GET /1.1/search/tweets.json?q=coronavirus&count=100&lang=%5B%27nl%27%5D&since=2020-03-11&until=2020-03-18&results_type=popular&tweet_mode=extended HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: api.twitter.com\r\nAuthorization: OAuth oauth_nonce="171023406371173270611584557015", oauth_timestamp="1584557015", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="jrEWCn2eDNx0KLsqC9EJxIp9p", oauth_token="276043201-uJPOMmGaj4R4K93tP3q7oP29z69lNSrsdHDvhfHl", oauth_signature="iSv5MnGU4gv0z8bF04ZxOyt55V4%3D"\r\n\r\n'
reply: 'HTTP/1.1 429 Too Many Requests\r\n'
header: content-length: 56
header: content-type: application/json;charset=utf-8
header: date: Wed, 18 Mar 2020 18:43:36 GMT
header: server: tsa_o
header: set-cookie: personalization_id="v1_bhaBPIxZrgodvXpZPWXOCQ=="; Max-Age=63072000; Expires=Fri, 18 Mar 2022 18:43:36 GMT; Path=/; Domain=.twitter.com; Secure; SameSite=None
header: set-cookie: personalization_id="v1_bhaBPIxZrgodvXpZPWXOCQ=="; Max-Age=6307200

<IPython.core.display.Javascript object>



*** RETRIEVING 100>= POPULAR RESULTS FROM 2020-03-11 TO 2020-03-18 ***


send: b'GET /1.1/search/tweets.json?q=COVID19&count=100&lang=%5B%27nl%27%5D&since=2020-03-11&until=2020-03-18&results_type=popular&tweet_mode=extended HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: api.twitter.com\r\nAuthorization: OAuth oauth_nonce="130915698349521068891584557056", oauth_timestamp="1584557056", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="jrEWCn2eDNx0KLsqC9EJxIp9p", oauth_token="276043201-uJPOMmGaj4R4K93tP3q7oP29z69lNSrsdHDvhfHl", oauth_signature="QyIcYU1y6vUcnSEeB%2FgAXPRegfM%3D"\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: cache-control: no-cache, no-store, must-revalidate, pre-check=0, post-check=0
header: content-disposition: attachment; filename=json.json
header: content-length: 30006
header: content-type: application/json;charset=utf-8
header: date: Wed, 18 Mar 2020 18:44:17 GMT
header: expires: Tue, 31 Mar 1981 05:00:00 GMT
header: last-modified: Wed, 18 

<IPython.core.display.Javascript object>



*** RETRIEVING 100>= POPULAR RESULTS FROM 2020-03-11 TO 2020-03-18 ***


send: b'GET /1.1/search/tweets.json?q=COVID-19&count=100&lang=%5B%27nl%27%5D&since=2020-03-11&until=2020-03-18&results_type=popular&tweet_mode=extended HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: api.twitter.com\r\nAuthorization: OAuth oauth_nonce="20476827749792285071584557062", oauth_timestamp="1584557062", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="jrEWCn2eDNx0KLsqC9EJxIp9p", oauth_token="276043201-uJPOMmGaj4R4K93tP3q7oP29z69lNSrsdHDvhfHl", oauth_signature="q46eh%2FeBq5YkP7FcBwKIJNIV9DA%3D"\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: cache-control: no-cache, no-store, must-revalidate, pre-check=0, post-check=0
header: content-disposition: attachment; filename=json.json
header: content-length: 104688
header: content-type: application/json;charset=utf-8
header: date: Wed, 18 Mar 2020 18:44:23 GMT
header: expires: Tue, 31 Mar 1981 05:00:00 GMT
header: last-modified: Wed, 18

<IPython.core.display.Javascript object>



*** RETRIEVING 100>= POPULAR RESULTS FROM 2020-03-11 TO 2020-03-18 ***


send: b'GET /1.1/search/tweets.json?q=COVID_19&count=100&lang=%5B%27nl%27%5D&since=2020-03-11&until=2020-03-18&results_type=popular&tweet_mode=extended HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: api.twitter.com\r\nAuthorization: OAuth oauth_nonce="135600572613680943011584557068", oauth_timestamp="1584557068", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="jrEWCn2eDNx0KLsqC9EJxIp9p", oauth_token="276043201-uJPOMmGaj4R4K93tP3q7oP29z69lNSrsdHDvhfHl", oauth_signature="NA6bndJ4qiuAcg8xvxPwN2ny75o%3D"\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: cache-control: no-cache, no-store, must-revalidate, pre-check=0, post-check=0
header: content-disposition: attachment; filename=json.json
header: content-length: 104688
header: content-type: application/json;charset=utf-8
header: date: Wed, 18 Mar 2020 18:44:29 GMT
header: expires: Tue, 31 Mar 1981 05:00:00 GMT
header: last-modified: Wed, 18 

<IPython.core.display.Javascript object>



*** RETRIEVING 100>= POPULAR RESULTS FROM 2020-03-11 TO 2020-03-18 ***


send: b'GET /1.1/search/tweets.json?q=CoronaVirusUpdates&count=100&lang=%5B%27nl%27%5D&since=2020-03-11&until=2020-03-18&results_type=popular&tweet_mode=extended HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: api.twitter.com\r\nAuthorization: OAuth oauth_nonce="85996672220738882141584557074", oauth_timestamp="1584557074", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="jrEWCn2eDNx0KLsqC9EJxIp9p", oauth_token="276043201-uJPOMmGaj4R4K93tP3q7oP29z69lNSrsdHDvhfHl", oauth_signature="d8BmcxXin%2FGs3fDC%2F1eaBBLLNoY%3D"\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: cache-control: no-cache, no-store, must-revalidate, pre-check=0, post-check=0
header: content-disposition: attachment; filename=json.json
header: content-length: 379
header: content-type: application/json;charset=utf-8
header: date: Wed, 18 Mar 2020 18:44:35 GMT
header: expires: Tue, 31 Mar 1981 05:00:00 GMT
header: last-modified

KeyError: 'created_at'

Get the top trending

In [8]:
trend_trawler = TweetTrawler(api)
trends = trend_trawler.get_trends(1)

send: b'GET /1.1/trends/place.json?id=1 HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: api.twitter.com\r\nAuthorization: OAuth oauth_nonce="24432382053723147211584557184", oauth_timestamp="1584557184", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="jrEWCn2eDNx0KLsqC9EJxIp9p", oauth_token="276043201-uJPOMmGaj4R4K93tP3q7oP29z69lNSrsdHDvhfHl", oauth_signature="3vxc4RqsrVBuh8Gd46M3silCLI8%3D"\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: cache-control: no-cache, no-store, must-revalidate, pre-check=0, post-check=0
header: content-disposition: attachment; filename=json.json
header: content-length: 9536
header: content-type: application/json;charset=utf-8
header: date: Wed, 18 Mar 2020 18:46:25 GMT
header: expires: Tue, 31 Mar 1981 05:00:00 GMT
header: last-modified: Wed, 18 Mar 2020 18:46:25 GMT
header: pragma: no-cache
header: server: tsa_o
header: set-cookie: personalization_id="v1_IkGESZiVlBwRQOOGjWMKEg=="; Max-Age=63072000; Expires=Fri, 18 Mar 2022 18:46:25

# More information retrieval

In [None]:
# return all the retweets of the most highly retweeted tweets?
# more networking strategies to find more tweets?

In [9]:
trends

Unnamed: 0,name,url,promoted_content,query,tweet_volume
0,#SokağaCıkmaYasağıSart,http://twitter.com/search?q=%23Soka%C4%9FaC%C4...,,%23Soka%C4%9FaC%C4%B1kmaYasa%C4%9F%C4%B1Sart,30131.0
1,#PlayStation5,http://twitter.com/search?q=%23PlayStation5,,%23PlayStation5,24672.0
2,Foles,http://twitter.com/search?q=Foles,,Foles,38376.0
3,#CoronaVirusChallenge,http://twitter.com/search?q=%23CoronaVirusChal...,,%23CoronaVirusChallenge,155823.0
4,#TrumpLiedPeopleDied,http://twitter.com/search?q=%23TrumpLiedPeople...,,%23TrumpLiedPeopleDied,77787.0
5,#PS5Reveal,http://twitter.com/search?q=%23PS5Reveal,,%23PS5Reveal,11285.0
6,SOBRANG WILD,http://twitter.com/search?q=%22SOBRANG+WILD%22,,%22SOBRANG+WILD%22,15547.0
7,Bleach,http://twitter.com/search?q=Bleach,,Bleach,267497.0
8,Mark Cerny,http://twitter.com/search?q=%22Mark+Cerny%22,,%22Mark+Cerny%22,24663.0
9,iPad Pro,http://twitter.com/search?q=%22iPad+Pro%22,,%22iPad+Pro%22,140215.0
