In [1]:
from collections import defaultdict
from datetime import datetime as dt
from searchtweets import ResultStream, gen_rule_payload, collect_results, load_credentials
import os
import json
import numpy as np
import pandas as pd

In [12]:
# Read both sets of Twitter credentials from the same YAML file. Each endpoint
# has its own entry in that file: `search_args` is used when fetching actual
# tweets, `count_args` when fetching counts of tweets.
search_args, count_args = (
    load_credentials(
        'twitter_keys.yaml',
        yaml_key=endpoint_key,
        env_overwrite=False,
    )
    for endpoint_key in ('search_tweets_premium', 'tweet_counts_premium')
)

In [3]:
# Search terms for the two topics (Grey's Anatomy and abortion). Multi-word
# phrases carry their own double quotes inside the string. The order of the
# terms is kept unchanged because the lists are later joined into query strings.
greys_words = [
    'greysanatomy', '#greysanatomy',
    '"greys anatomy"', "\"grey's anatomy\"",
    '@GreysABC', '#greys',
    '"meredith grey"', '"derek shepherd"',
]

abortion_words = [
    'abortion', '#abortion',
    'prochoice', 'pro-choice', '#prochoice', '#pro-choice',
    'prolife', 'pro-life', '#prolife', '#pro-life',
    '"roe v wade"', '"roe vs wade"',
]

In [4]:
# Build three queries: one for Grey's Anatomy, one for abortion, and one for
# tweets that contain keywords from both lists. Each list becomes a
# parenthesised OR-group; the combined query is the two groups separated by a
# single space.
greys_q = '(' + ' OR '.join(greys_words) + ')'
abortion_q = '(' + ' OR '.join(abortion_words) + ')'
both_q = ' '.join((greys_q, abortion_q))

# Restrict all three queries to English-language tweets.
greys_q, abortion_q, both_q = (
    query + ' lang:en' for query in (greys_q, abortion_q, both_q)
)

In [25]:
# Build the "rule" -- the JSON payload sent to Twitter's endpoints -- for each
# of the three queries. All three share the same date window.
# To request counts rather than tweets, add count_bucket='day' to the shared
# keyword arguments below.
tweet_rule_window = dict(from_date='2019-10-24', to_date='2019-11-21')

greys_rule = gen_rule_payload(greys_q, **tweet_rule_window)
abortion_rule = gen_rule_payload(abortion_q, **tweet_rule_window)
both_rule = gen_rule_payload(both_q, **tweet_rule_window)

In [14]:
# Counts variant of the rule payloads: same three queries and date window, with
# count_bucket='day' added so the payloads are for fetching counts.
# Note that this assigns greys_rule / abortion_rule / both_rule again.
count_rule_options = dict(
    from_date='2019-10-24',
    to_date='2019-11-21',
    count_bucket='day',
)

greys_rule = gen_rule_payload(greys_q, **count_rule_options)
abortion_rule = gen_rule_payload(abortion_q, **count_rule_options)
both_rule = gen_rule_payload(both_q, **count_rule_options)

In [15]:
# Fetch the counts for each of the three queries, using the counts credentials.
# The requests are issued one at a time so that results already fetched are
# kept if a later request fails.
counts_request = dict(max_results=1000, result_stream_args=count_args)

greys_counts = collect_results(greys_rule, **counts_request)
abortion_counts = collect_results(abortion_rule, **counts_request)
both_counts = collect_results(both_rule, **counts_request)

In [22]:
# Turn each counts result into a DataFrame and parse its timePeriod column into
# datetimes in the same step.
greys_df = pd.DataFrame(greys_counts).assign(
    timePeriod=lambda frame: pd.to_datetime(frame.timePeriod)
)
abortion_df = pd.DataFrame(abortion_counts).assign(
    timePeriod=lambda frame: pd.to_datetime(frame.timePeriod)
)
both_df = pd.DataFrame(both_counts).assign(
    timePeriod=lambda frame: pd.to_datetime(frame.timePeriod)
)

In [24]:
# Save each of the three counts DataFrames to its own CSV file.
for counts_frame, csv_path in (
    (greys_df, './greys.csv'),
    (abortion_df, './abortion.csv'),
    (both_df, './both.csv'),
):
    counts_frame.to_csv(csv_path, index=None)

In [26]:
# Now, gather 500 actual tweets using the Grey's Anatomy terms to validate that the data is good.
#
# Build a dedicated tweet payload here instead of reusing `greys_rule`. That
# name is assigned in two earlier cells, and when the notebook is run top to
# bottom the last assignment is the counts payload (count_bucket='day'), which
# asks for counts rather than tweets. Creating the payload in this cell keeps
# the result independent of which rule cell happened to run last.
greys_tweets_rule = gen_rule_payload(
    greys_q,
    from_date='2019-10-24',
    to_date='2019-11-21',
)

# Fetch up to 500 tweets with the search (not counts) credentials.
greys_tweets = collect_results(
    greys_tweets_rule,
    max_results=500,
    result_stream_args=search_args,
)

In [53]:
# Function for getting the useful information out of the tweet JSON and adding to a dict.
def parse_tweet(tweet):
    """Flatten one tweet payload into a small dict of the fields kept for review.

    Args:
        tweet: Mapping for a single tweet. It must provide 'created_at',
            'id_str', 'text' (unless 'extended_tweet' -> 'full_text' is
            present) and a 'user' mapping with 'screen_name' and 'location'.

    Returns:
        dict with the keys, in this order: 'created_at', 'tweet_id', 'text',
        'screen_name', 'location', 'tweet_url'.

    Raises:
        KeyError: if a required key is missing.
    """
    created_at = tweet['created_at']
    tweet_id = tweet['id_str']

    # Prefer the full text stored under extended_tweet; fall back to the plain
    # 'text' field when either of those keys is absent.
    try:
        text = tweet['extended_tweet']['full_text']
    except KeyError:
        text = tweet['text']

    screen_name = tweet['user']['screen_name']
    location = tweet['user']['location']

    return {
        'created_at': created_at,
        'tweet_id': tweet_id,
        'text': text,
        'screen_name': screen_name,
        'location': location,
        'tweet_url': f'https://twitter.com/{screen_name}/status/{tweet_id}',
    }

In [65]:
# Run every Grey's Anatomy tweet through parse_tweet and tabulate the results.
parsed_tweets = list(map(parse_tweet, greys_tweets))
greys_tweets_df = pd.DataFrame(parsed_tweets)

# Make two columns Excel-friendly: created_at is parsed to datetimes and has
# its timezone info removed, and tweet_id is converted to string so Excel
# doesn't have formatting issues.
greys_tweets_df = greys_tweets_df.assign(
    created_at=pd.to_datetime(greys_tweets_df.created_at).map(
        lambda stamp: stamp.replace(tzinfo=None)
    ),
    tweet_id=greys_tweets_df.tweet_id.astype(str),
)

# Save the Grey's Anatomy tweets to an Excel file.
greys_tweets_df.to_excel('greys_tweets.xlsx', index=None)