# Tweet Thread Augmentation

Many tweets are part of **threads**, which consist of multiple tweets in a linked-list sequence of replies to one another. Since some of the tweets may not have contained the original coronavirus keywords, this step pulls tweets in threads for which at least one tweet is in the dataset. This consists of three steps:

1. Extract **upstream tweets**, which are explicitly linked to in the `in_reply_to_status_id_str` field of the tweet. We can do this using a simple hydrate command with Twarc.
2. Extract **downstream tweets**, which are not explicitly linked. Instead, we look for tweets in each user's timeline within a two-day window on either side of their tweets in the dataset, and recursively find the tweets that link back to the original dataset.
3. Join these two sets together with the original tweet dataset, and assign each tweet a **thread ID**.

In [None]:
import twarc
import json
import time
import shutil
import pandas as pd
import numpy as np
import os
import datetime
import utils
import matplotlib.pyplot as plt

### Paths

Enter the path to your Twarc credentials, along with the input and output directories, below.

In [None]:
# Path to the Twarc credentials file. Defaults to ~/.twarc in the current user's
# home directory (expanded to an absolute path); edit this if yours lives elsewhere.
credentials_path = os.path.expanduser("~/.twarc")

# Directory containing the input CSV file batches
input_dir = "raw_data"

# Scratch directory for intermediate results
intermediate_dir = "intermediate_data"
# exist_ok avoids the check-then-create race and makes re-running this cell a no-op
os.makedirs(intermediate_dir, exist_ok=True)

# Output directory (currently the same location as the scratch directory)
output_dir = "intermediate_data"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Read every CSV batch in the input directory (hidden files are ignored)
# and stack the batches into a single dataframe
filenames = []
for path in os.listdir(input_dir):
    if path.startswith(".") or not path.endswith(".csv"):
        continue
    filenames.append(os.path.join(input_dir, path))

batch_frames = [pd.read_csv(filename, dtype=utils.dtype_spec, lineterminator='\n')
                for filename in filenames]
df = pd.concat(batch_frames)

# Drop columns whose names start with "Unnamed" (presumably index columns saved by
# an earlier to_csv call) and renumber the rows from zero
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed].reset_index(drop=True)

df.head()

In [None]:
# Load the Twarc client from the credentials file. `t` is the object the cells
# below call for hydrate and timeline requests.
t = utils.load_twarc(credentials_path)

# Upstream Tweets

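We want to extract all tweets that were replied to by a tweet in the dataset (upstream), as well as tweets that reply to a tweet in the dataset (downstream). This section handles the upstream direction by following the reply links of self-replies, i.e. tweets whose author is replying to their own earlier tweet.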


In [None]:
# Collect the unique parent IDs of self-replies: rows that have a reply_to_id and
# whose reply_to_user is the row's own user_id
is_self_reply = df.reply_to_id.notna() & (df.reply_to_user == df.user_id)
reply_ids = df.loc[is_self_reply, "reply_to_id"].unique().tolist()
print("{} reply IDs".format(len(reply_ids)))

In [None]:
# Recursively hydrate upstream tweets: each round fetches the parents of the
# previous round's self-replies, until no unseen parent IDs remain.
seen_ids = set()
reply_ids = list(set(reply_ids))
hydrated_replies = []
i = 0

while reply_ids:
    print("Round {}, {} tweets to hydrate".format(i, len(reply_ids)))
    # Mark every requested ID as seen, not only the ones that come back, so IDs
    # that cannot be hydrated are never requested again in a later round
    seen_ids.update(reply_ids)
    new_replies = list(t.hydrate(reply_ids))
    hydrated_replies += new_replies
    seen_ids.update(tweet["id_str"] for tweet in new_replies)
    # Queue the parents of self-replies for the next round. Building a set first
    # removes duplicates when several tweets reply to the same parent, which would
    # otherwise cause redundant requests and duplicate rows in the output.
    reply_ids = list({
        tweet["in_reply_to_status_id_str"] for tweet in new_replies
        if tweet["in_reply_to_status_id_str"] is not None and
        tweet["in_reply_to_status_id_str"] not in seen_ids and
        tweet["in_reply_to_user_id_str"] == tweet["user"]["id_str"]
    })
    i += 1

# Write upstream tweets as JSON (one tweet object per line), converting each one
# to a flat record for the CSV at the same time
print("Writing JSON...")
upstream_tweets = []
with open(os.path.join(intermediate_dir, "all_upstream_tweets.json"), "w") as file:
    for item in hydrated_replies:
        tweet = json.dumps(item)
        file.write(tweet + "\n")
        upstream_tweets.append(utils.json_to_tweet(item))

print("Writing CSV...")
upstream_df = pd.DataFrame(upstream_tweets)
upstream_df.to_csv(os.path.join(intermediate_dir, "all_upstream_tweets.csv"),
                   line_terminator='\n')
print("Wrote {} upstream tweets.".format(len(hydrated_replies)))

# Downstream Tweets

Next, we use user timelines to find tweets that reply to tweets in the dataset. To do this efficiently, we build a unique set of users and, for each user, a consensus date window in which to search for tweets. We assume that replies occur within two days of the original tweet.

In [None]:
# First establish a list of reference IDs for each date
def get_date(tweet, day_delta=0):
    """Return the tweet's creation day as a 'YYYY-MM-DD' string.

    `tweet` must support `tweet['created_at']` and hold a timestamp in the form
    'Tue Feb 04 12:30:00 +0000 2020'. `day_delta` shifts the result by that many
    days (negative values move it earlier).
    """
    created = datetime.datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    shifted = created + datetime.timedelta(days=day_delta)
    return shifted.strftime('%Y-%m-%d')

# For every calendar day in the dataset, track the smallest and largest tweet ID
min_ids, max_ids = {}, {}
for row_number, row in df.iterrows():
    # Progress indicator
    if row_number % 100000 == 0:
        print(row_number)
    tweet_id = int(row["id"])
    day = get_date(row)
    lowest_so_far = min_ids.get(day, 1e30)
    highest_so_far = max_ids.get(day, 0)
    min_ids[day] = min(lowest_so_far, tweet_id)
    max_ids[day] = max(highest_so_far, tweet_id)

# Show the reference IDs for the five latest days as a quick check
latest_min_ids = sorted(min_ids.items())[-5:]
latest_max_ids = sorted(max_ids.items())[-5:]
print(latest_min_ids, latest_max_ids)

In [None]:
# Build, for every user with a self-reply in the dataset, the date window that has
# to be searched in their timeline: from two days before their earliest self-reply
# to two days after their latest one.
user_dates = {}

tweets_with_replies = df[~pd.isna(df.reply_to_id) & (df.reply_to_user == df.user_id)]
for i, tweet in tweets_with_replies.iterrows():
    user = tweet["user_id"]
    min_date = get_date(tweet, -2)
    max_date = get_date(tweet, 2)
    if user in user_dates:
        earliest, latest = user_dates[user]
        # Dates are 'YYYY-MM-DD' strings, so string comparison is chronological.
        # The window must widen on both sides: keep the earliest start and the
        # LATEST end (taking the minimum of the end dates would shrink the window).
        user_dates[user] = (min(min_date, earliest), max(max_date, latest))
    else:
        user_dates[user] = (min_date, max_date)

print("{} users".format(len(user_dates)))

In [None]:
# Fill in reference IDs for dates that aren't in the set by fitting two rough linear
# models of tweet ID versus day: one for the daily minimum ID and one for the daily
# maximum ID. Each model's slope is the average ID increment per day.

# Every tweet is later given a two-day margin on either side, so the exactness of
# these estimates only matters for the performance of the tweet scraper.

available_days = sorted(min_ids)
series = [available_days[0]]
current = series[-1]

min_id_items = []
max_id_items = []

one_day = datetime.timedelta(days=1)
date_index = 0
# Walk day by day from the first to the last available date, recording
# (day offset, reference ID) pairs for the days that have tweets. `current` is
# advanced before the lookups, so the first available day itself is not recorded.
while current != available_days[-1]:
    date = datetime.datetime.strptime(current, '%Y-%m-%d')
    current = (date + one_day).strftime('%Y-%m-%d')
    for known_ids, items in ((min_ids, min_id_items), (max_ids, max_id_items)):
        if current in known_ids:
            items.append((date_index, known_ids[current]))
    series.append(current)
    date_index += 1

# Slope of each model: ID difference between the last and first recorded points
# divided by the number of days between them
first_min, last_min = min_id_items[0], min_id_items[-1]
min_inc_per_day = (last_min[1] - first_min[1]) / (last_min[0] - first_min[0])
first_max, last_max = max_id_items[0], max_id_items[-1]
max_inc_per_day = (last_max[1] - first_max[1]) / (last_max[0] - first_max[0])
earliest_date = datetime.datetime.strptime(available_days[0], '%Y-%m-%d')

def get_min_id(date_str):
    """Estimate the minimum tweet ID for a date given as a 'YYYY-MM-DD' string.

    The estimate is linear: the first available day's minimum ID plus
    `min_inc_per_day` for every day elapsed since `earliest_date`.
    """
    elapsed = datetime.datetime.strptime(date_str, '%Y-%m-%d') - earliest_date
    first_day_min = min_ids[available_days[0]]
    return first_day_min + elapsed.days * min_inc_per_day

def get_max_id(date_str):
    """Estimate the maximum tweet ID for a date given as a 'YYYY-MM-DD' string.

    The estimate is linear: the first available day's maximum ID plus
    `max_inc_per_day` for every day elapsed since `earliest_date`.
    """
    elapsed = datetime.datetime.strptime(date_str, '%Y-%m-%d') - earliest_date
    first_day_max = max_ids[available_days[0]]
    return first_day_max + elapsed.days * max_inc_per_day

# Sanity-check the estimates against two sample dates. Using .get keeps this cell
# from raising KeyError when the dataset has no tweets on a sample date.
print("Estimate:", get_min_id("2020-02-04"), "Actual:", min_ids.get("2020-02-04", "not in dataset"))
print("Estimate:", get_max_id("2020-02-08"), "Actual:", max_ids.get("2020-02-08", "not in dataset"))

In [None]:
def _write_timeline_batch(raw_tweets, flat_tweets, batch_number):
    """Write one batch of timeline tweets to the intermediate directory.

    `raw_tweets` are written one JSON object per line to
    timeline_tweets_<batch_number>.json; `flat_tweets` (the records produced by
    utils.json_to_tweet) are written to timeline_tweets_<batch_number>.csv.
    """
    json_path = os.path.join(intermediate_dir, "timeline_tweets_{}.json".format(batch_number))
    with open(json_path, "w") as file:
        for item in raw_tweets:
            file.write(json.dumps(item) + "\n")

    csv_path = os.path.join(intermediate_dir, "timeline_tweets_{}.csv".format(batch_number))
    # NOTE(review): newer pandas releases spell this keyword `lineterminator`;
    # confirm against the installed pandas version.
    pd.DataFrame(flat_tweets).to_csv(csv_path, line_terminator="\n")


current_batch = []
current_csv = []
batch_idx = 0

user_items = sorted(user_dates.items())

for users_done, (user_id, (min_date, max_date)) in enumerate(user_items):
    if users_done % 100 == 0:
        # Periodically sleep to appease the Twitter rate limiting gods
        print(users_done)
        time.sleep(20)

    # Compute the boundary tweet IDs needed to search the Twitter timeline for this user
    min_id = int(get_min_id(min_date))
    max_id = int(get_max_id(max_date))
    for tweet in t.timeline(user_id=user_id, max_id=max_id, since_id=min_id):
        current_batch.append(tweet)
        current_csv.append(utils.json_to_tweet(tweet))

    # Flush a batch to disk after every 1000 processed users
    if (users_done + 1) % 1000 == 0:
        print("Writing")
        _write_timeline_batch(current_batch, current_csv, batch_idx)
        batch_idx += 1
        current_batch = []
        current_csv = []

# Write out the stragglers
print("Writing last batch")
_write_timeline_batch(current_batch, current_csv, batch_idx)
batch_idx += 1
current_batch = []
current_csv = []

# Filter Downstream Tweets for Threads

The timeline tweets collected above include all tweets by the specified users, so we now keep only the tweets that are part of threads in the dataset.




In [None]:
# Read the timeline batches back in, following the consecutive batch numbers
# until the next file does not exist
batch_idx = 0
path = os.path.join(intermediate_dir, "timeline_tweets_{}.csv".format(batch_idx))
batch_frames = []
while os.path.exists(path):
    print("Reading {}...".format(os.path.basename(path)))
    batch_frames.append(pd.read_csv(path, dtype=utils.dtype_spec, lineterminator='\n'))
    batch_idx += 1
    path = os.path.join(intermediate_dir, "timeline_tweets_{}.csv".format(batch_idx))

if not batch_frames:
    # Fail with an explicit message instead of an AttributeError on None further down
    raise FileNotFoundError(
        "No timeline_tweets_*.csv batches found in {}".format(intermediate_dir))

# Concatenate once at the end rather than re-copying the growing frame per batch
timelines = pd.concat(batch_frames)
# Drop columns whose names start with "Unnamed" and renumber the rows from zero
timelines = timelines.loc[:, ~timelines.columns.str.contains('^Unnamed')].reset_index(drop=True)

timelines.head()

In [None]:
# IDs from the original dataframe: a timeline tweet qualifies as a downstream
# candidate when it replies to one of these
possible_reply_parents = set(df["id"].tolist())
print("{} possible reply parents".format(len(possible_reply_parents)))

In [None]:
# Recursively find downstream tweets: round 0 collects self-replies to tweets in
# the original dataset, and each later round collects self-replies to the tweets
# collected in the round before.

# Only rows that reply to something can ever match, so filter once up front
reply_rows = timelines[~pd.isna(timelines.reply_to_id)]

ids_to_check = possible_reply_parents
# IDs already collected. Without this, a reply whose parent is both in the original
# dataset and itself a collected reply would be appended again in a later round.
collected_ids = set()
replies = []
round_num = 0
while ids_to_check:
    new_ids = set()
    for i, row in reply_rows.iterrows():
        if row["id"] in collected_ids:
            continue
        if (row["reply_to_id"] in ids_to_check and
                row["reply_to_user"] == row["user_id"]):
            replies.append(row)
            new_ids.add(row["id"])
            collected_ids.add(row["id"])
    print("Round {}: {} replies added".format(round_num, len(new_ids)))
    # The next round looks for replies to the tweets found in this round
    ids_to_check = new_ids
    round_num += 1

replies_df = pd.DataFrame(replies)
replies_df.to_csv(os.path.join(intermediate_dir, "threaded_downstream_tweets.csv"), line_terminator='\n')
print("Found {} downstream tweets that are linked to a tweet in the original dataset.".format(len(replies_df)))

replies_df.head()

# Putting It All Together

In [None]:
# Combine everything we know about: the downstream replies and the upstream tweets
# saved by the earlier cells, followed by the original dataset
downstream_path = os.path.join(intermediate_dir, "threaded_downstream_tweets.csv")
upstream_path = os.path.join(intermediate_dir, "all_upstream_tweets.csv")
saved_frames = [
    pd.read_csv(csv_path, dtype=utils.dtype_spec, index_col=0, lineterminator='\n')
    for csv_path in (downstream_path, upstream_path)
]
all_threaded_tweets = pd.concat(saved_frames + [df])

print("{} tweets total".format(len(all_threaded_tweets)))

In [None]:
# Group tweets into threads. `threads_data` is a list of threads, each a list of
# tweet rows; `index_mapping` maps a tweet ID to the position in `threads_data` of
# the thread that tweet belongs to.
threads_data = []
index_mapping = {}
print("Sorting...")
# Copy so that adding id_num modifies an independent frame rather than the result
# of drop_duplicates (avoids pandas' chained-assignment warning)
dedup_tweets = all_threaded_tweets.drop_duplicates("id").copy()
# Request 64 bits explicitly: plain `int` can map to a narrower integer on some
# platforms / NumPy versions, which is too small for tweet IDs
dedup_tweets["id_num"] = dedup_tweets["id"].astype("int64")
# Highest ID first, so each reply is visited before the tweet it replies to
dedup_tweets = dedup_tweets.sort_values("id_num", ascending=False).reset_index()
print("Done sorting")

couldnt_find = 0
found = 0
for i, row in dedup_tweets.iterrows():
    if i % 10000 == 0:
        print(i, couldnt_find, found)

    thread_index = index_mapping.get(row["id"])
    if thread_index is None:
        # No already-visited tweet pointed at this one: it starts a new thread
        couldnt_find += 1
        threads_data.append([row])
        thread_index = len(threads_data) - 1
        index_mapping[row["id"]] = thread_index
    else:
        # An already-visited tweet replied to this one: put it at the front of
        # that thread
        found += 1
        threads_data[thread_index].insert(0, row)

    # If this tweet is a self-reply, reserve its parent for the same thread
    if not pd.isna(row.reply_to_id) and row.reply_to_user == row.user_id:
        index_mapping[row.reply_to_id] = thread_index

print("{} threads".format(len(threads_data)))

In [None]:
# Plot how long the threads are, counting only threads with more than one tweet
multi_tweet_lengths = [len(thread) for thread in threads_data if len(thread) > 1]

plt.figure()
plt.hist(multi_tweet_lengths, bins=np.arange(2, 30))
plt.xlabel("Thread Length")
plt.ylabel("Count")
plt.show()

In [None]:
# Print every multi-tweet thread whose first tweet has the given screen name
test_screen_name = "AdamJKucharski"

for thread in threads_data:
    # Skip single-tweet threads and threads started by someone else
    if len(thread) <= 1 or thread[0]["screen_name"] != test_screen_name:
        continue
    for tweet in thread:
        print(tweet.full_text)
    print("===")

In [None]:
# Write out all threads containing tweets that are in the original dataset,
# tagging every tweet with the ID of its thread

must_include_ids = set(df["id"].values)

joined_data = []
thread_id = 0
# The loop variables are deliberately not called `t`: that name is the Twarc client
# at notebook level, and a for-loop variable named `t` would overwrite it.
for thread in threads_data:
    if not any(thread_tweet["id"] in must_include_ids for thread_tweet in thread):
        continue
    for thread_tweet in thread:
        td = thread_tweet.to_dict()
        td["thread_id"] = thread_id
        joined_data.append(td)
    thread_id += 1
    # Progress indicator
    if thread_id % 100000 == 0:
        print(thread_id)

joined_df = pd.DataFrame(joined_data)
joined_df.to_csv(os.path.join(output_dir, "thread_annotated_tweets.csv"),
                 line_terminator="\n")

In [None]:
# Sanity check: compare the IDs in the joined output with the original IDs, in
# both directions
joined_ids = set(joined_df["id"].values)
missing_from_joined = must_include_ids - joined_ids
added_by_threads = joined_ids - must_include_ids
print(len(missing_from_joined), "tweets included in original but not new set")
print(len(added_by_threads), "tweets included in new set but not original")