In [None]:
# Install tweepy from GitHub (uncomment to run; %pip targets the kernel's environment)
# %pip install git+https://github.com/tweepy/tweepy.git

In [235]:
import json
import os
import time

from dotenv import load_dotenv
import tweepy

In [228]:
# Load a local .env file, if one exists; it is expected to provide the BEARER_TOKEN key.
load_dotenv()

True

In [230]:
import logging

# Logger names are case-sensitive: the library logs under "tweepy" (lowercase),
# so a logger named "Tweepy" would never receive its records.
logger = logging.getLogger("tweepy")
logger.setLevel(logging.DEBUG)

# Attach the file handler only once, so re-running this cell does not make
# every record appear multiple times in tweepy.log.
if not any(isinstance(h, logging.FileHandler) for h in logger.handlers):
    handler = logging.FileHandler(filename="tweepy.log")
    logger.addHandler(handler)

Data extraction closely follows this [example from tweepy](https://github.com/tweepy/tweepy/blob/master/examples/API_v2/search_recent_tweets.py).

Pagination closely follows this [pagination page from tweepy](https://docs.tweepy.org/en/stable/v2_pagination.html). My code doesn't use `flatten` because the flattened results don't have the `includes` attribute needed to extract the original text of retweets; as a consequence, though, the extracted data isn't much richer.

Check out the [recent search API](https://docs.tweepy.org/en/stable/client.html#tweepy.Client.search_recent_tweets) for more info

In [231]:
# Read the token from the environment (expected to be populated from .env by
# load_dotenv() above). A missing key raises KeyError here instead of failing
# later with an authentication error.
client = tweepy.Client(bearer_token=os.environ["BEARER_TOKEN"])

In [209]:
def _extract_full_text(tweet, includes):
    """
    Return the full text for `tweet`.

    Full text can only be found on the original tweet, but the data also
    contains RTs whose text is truncated. When `tweet` references a tweet of
    type "retweeted", look that original up in `includes['tweets']` and return
    its text instead.

    Falls back to `tweet.text` when the tweet is not an RT, or when the
    original is not present in `includes` (no 'tweets' key, or no entry with a
    matching id).
    """
    retweeted_ids = {
        ref.id
        for ref in (tweet.referenced_tweets or [])
        if ref.type == "retweeted"
    }
    if retweeted_ids:
        # `includes` may be empty or lack the 'tweets' key entirely.
        for candidate in (includes or {}).get("tweets", []):
            if candidate.id in retweeted_ids:
                return candidate.text

    # tweet is original (or its original is unavailable), just return the text
    return tweet.text


def extract_tweet_data(tweet):
    """
    Flatten one response page into a list of plain dicts.

    `tweet` is a response object exposing `.data` (iterable of tweets, or
    None when the page is empty) and `.includes` (expanded objects used to
    recover the full text of RTs). Returns one dict per tweet, or an empty
    list when the page has no data.
    """
    data = []
    # An empty page has `data` set to None; treat it as "no tweets".
    for t in tweet.data or []:
        d = {
            "author_id": t.author_id,
            "id": t.id,
            "in_reply_to_user_id": t.in_reply_to_user_id,
            # Formatted without a UTC offset (the format string has no %z).
            # To add one, see https://stackoverflow.com/a/48725037/8996974
            "created_at": t.created_at.strftime('%Y-%m-%d %H:%M:%S.%f'),
            "lang": t.lang,
            "text": _extract_full_text(t, tweet.includes),
            "possibly_sensitive": t.possibly_sensitive,
            "source": t.source,
        }
        data.append(d)
    return data

In [225]:
def dump(name, tweets):
    """
    Write `tweets` to the file `name` as a single JSON array.

    `name` must end with '.json' (AssertionError otherwise). An existing file
    is overwritten. The tweets are serialized in one `json.dump` call so the
    file is one valid JSON document that `json.load` can read back; dumping
    each tweet separately would produce back-to-back objects with no
    separators, which is not parseable JSON.
    """
    # include the .json format in name
    assert name[-5:] == '.json', "dump file name must end with '.json'"

    # mode "w" truncates, so any previous file content is replaced
    with open(name, "w", encoding="utf-8") as f:
        json.dump(list(tweets), f, indent=4)

    print(f"{name} dumped!")

In [None]:
# FREELY EDIT THIS
# https://developer.twitter.com/en/docs/twitter-api/rate-limits
# Requests limit per 15-minute for recent search is 180 per user
FIFTEEN_MINUTES = 900  # in seconds
REQUESTS_PER_WINDOW = 180  # page limit handed to the Paginator for one batch
TWEETS_PER_REQUEST = 100  # max_results requested per page
RATE_LIMIT_BUFFER = 10  # extra seconds slept on top of the window


def _runner(dump_filename, query=None, tweet_fields=None, expansions=None,
            until_id=None):
    """
    Fetch one batch (up to REQUESTS_PER_WINDOW pages) and dump it to a file.

    Parameters
    ----------
    dump_filename : file name handed to `dump` (must end with '.json').
    query, tweet_fields, expansions : search arguments. When left as None the
        notebook-level variables of the same name are used, which keeps the
        old `_runner(dump_filename)` call form working.
    until_id : when given, only tweets with an id lower (older) than this are
        requested, so a batch continues where the previous one stopped.

    Returns
    -------
    (count, oldest_id) : number of tweets written and the smallest tweet id
        in the batch (None when the batch is empty). Nothing is dumped for an
        empty batch.
    """
    notebook_globals = globals()
    if query is None:
        query = notebook_globals["query"]
    if tweet_fields is None:
        tweet_fields = notebook_globals["tweet_fields"]
    if expansions is None:
        expansions = notebook_globals["expansions"]

    search_kwargs = {
        "expansions": expansions,
        "tweet_fields": tweet_fields,
        "max_results": TWEETS_PER_REQUEST,
    }
    # Only send until_id when there is something to continue from.
    if until_id is not None:
        search_kwargs["until_id"] = until_id

    data = []
    for page in tweepy.Paginator(client.search_recent_tweets,
                                 query,
                                 limit=REQUESTS_PER_WINDOW,
                                 **search_kwargs):
        data.extend(extract_tweet_data(page))

    if data:
        dump(dump_filename, data)

    oldest_id = min((d["id"] for d in data), default=None)
    return len(data), oldest_id


def run(query, tweet_fields, expansions, target_tweets_count=100_000):
    """
    Collect tweets batch by batch until `target_tweets_count` is reached.

    Each batch is written to '<tweets collected so far>.json'. After a batch,
    the next one is restricted to tweets older than the oldest tweet already
    collected, so batches do not re-download the same tweets. Between batches
    the function sleeps for the remainder of the 15-minute rate-limit window
    plus RATE_LIMIT_BUFFER seconds. Collection stops early when a batch
    returns no tweets.
    """
    target_tweets_progress = 0
    until_id = None

    while target_tweets_progress < target_tweets_count:
        start = time.time()
        print(f"Processing part-{target_tweets_progress}")

        dump_filename = f"{target_tweets_progress}.json"
        fetched, oldest_id = _runner(dump_filename, query, tweet_fields,
                                     expansions, until_id=until_id)
        if fetched == 0:
            print("No more tweets match the query; stopping early.")
            break

        # Count what was actually collected, not the theoretical maximum.
        target_tweets_progress += fetched
        until_id = oldest_id

        if target_tweets_progress >= target_tweets_count:
            break  # nothing left to fetch, so no need to wait out the window

        elapsed = time.time() - start
        if elapsed < FIFTEEN_MINUTES:
            # finished before the window closed: wait for the REMAINING time
            # (plus a buffer), not for the time already spent
            time.sleep(FIFTEEN_MINUTES - elapsed + RATE_LIMIT_BUFFER)

    print("Done!")

In [None]:
# Search terms, OR-ed together
query = "bitcoin OR btc OR ethereum OR eth OR crypto"

# Tweet attributes requested for every result (comma-separated, no spaces);
# the two adjacent literals are joined into one string
tweet_fields = (
    "author_id,id,in_reply_to_user_id,created_at,lang,text,"
    "possibly_sensitive,source,referenced_tweets"
)

# Expand referenced tweets so the original text of RTs is available
expansions = "referenced_tweets.id"

In [142]:
# Long-running: run() loops batch by batch and sleeps between batches.
run(query, tweet_fields, expansions)