__Objective__: Process the JSON dataset crawled with `twint` and produce a new JSON dataset in which every reply tweet carries the text of its parent tweet (when that parent exists in the crawled data).

__Runtime__: CPU

In [None]:
# Colab-only: mount Google Drive so that the dataset paths under
# /content/gdrive used by the cells below can be read and written.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm 
import pickle as pkl
import matplotlib.pyplot as plt
from matplotlib import colors

In [None]:
# Load the raw Twitter crawls. Every file is read as JSON lines
# (one record per line), hence `lines=True`.
def _read_jsonl(path):
    """Read the JSON-lines file at *path* into a pandas DataFrame."""
    return pd.read_json(path, lines=True)


# Tweets posted by NYTimes (per the original notes: up to 2017-06-20).
nytimes_tweets_df = _read_jsonl('/content/gdrive/MyDrive/DL/Twitter/million/nytimes/nytimes.json')

# Replies addressed to NYTimes (per the original notes: August 2016 to January 2017).
nytimes_replies_before_df = _read_jsonl('/content/gdrive/MyDrive/DL/Twitter/million/nytimes/nytimes_before_replies.json')

# Replies addressed to NYTimes (per the original notes: March 2017 to July 2017).
nytimes_replies_after_df = _read_jsonl('/content/gdrive/MyDrive/DL/Twitter/million/nytimes/nytimes_after_replies.json')

In [None]:
# Show which columns the NYTimes tweets frame exposes.
df_columns = nytimes_tweets_df.columns
print(df_columns)

In [None]:
# Ids of all NYTimes tweets, kept as a set for O(1) membership tests.
nytimes_tweetids = set(nytimes_tweets_df['id'])

# For each replies frame, pair every reply's own id with its conversation id:
# [(reply_id, conversation_id), ...] in row order.
_nytimes_replies_before_tweetids = list(nytimes_replies_before_df['id'])
_nytimes_replies_before_convids = list(nytimes_replies_before_df['conversation_id'])
nytimes_replies_before_conv = list(
    zip(_nytimes_replies_before_tweetids, _nytimes_replies_before_convids)
)

_nytimes_replies_after_tweetids = list(nytimes_replies_after_df['id'])
_nytimes_replies_after_convids = list(nytimes_replies_after_df['conversation_id'])
nytimes_replies_after_conv = list(
    zip(_nytimes_replies_after_tweetids, _nytimes_replies_after_convids)
)

In [None]:
# reply tweet id (key) -> NYTimes tweet id (value).
# A reply is kept only when its conversation id is the id of a NYTimes tweet
# we actually have. The "before" pairs are visited first, so an "after" entry
# with the same reply id overrides it.
# NOTE(review): this treats conversation_id as the parent tweet's id -
# presumably it identifies the thread's root tweet; confirm against twint.
reply_map = {
    reply_id: conv_id
    for conv_pairs in (nytimes_replies_before_conv, nytimes_replies_after_conv)
    for reply_id, conv_id in conv_pairs
    if conv_id in nytimes_tweetids
}

In [None]:
# Number of reply tweets that were matched to a NYTimes tweet.
len(reply_map)

In [None]:
# Parallel lists (same row order): NYTimes tweet ids and their text.
nytimes_tweet_ids = nytimes_tweets_df['id'].tolist()
nytimes_tweets = nytimes_tweets_df['tweet'].tolist()

In [None]:
# NYTimes tweet id -> tweet text. If an id occurs more than once,
# the last occurrence wins.
tweet_id_map = dict(zip(nytimes_tweet_ids, nytimes_tweets))

In [None]:
def get_parent_tweet(tweet_id):
    """Return the text of the NYTimes tweet that *tweet_id* replies to.

    The parent is looked up through the module-level ``reply_map``
    (reply id -> NYTimes tweet id) and ``tweet_id_map`` (NYTimes tweet id ->
    text). When *tweet_id* has no entry in ``reply_map``, a single-space
    string ``" "`` is returned as the "no parent" placeholder.
    """
    try:
        parent_id = reply_map[tweet_id]
    except KeyError:
        # No known parent for this reply.
        return " "
    return tweet_id_map[parent_id]

In [None]:
# Attach the parent tweet's text (or " " when unknown) to every reply,
# in both reply frames, as a new 'parent_tweet' column.
for replies_df in (nytimes_replies_before_df, nytimes_replies_after_df):
    replies_df['parent_tweet'] = replies_df['id'].apply(get_parent_tweet)

In [None]:
# Write each enriched replies frame back to Drive as JSON lines
# (one record per line).
_processed_outputs = (
    (nytimes_replies_before_df, '/content/gdrive/MyDrive/DL/Twitter/million/nytimes/processed/nytimes_before.json'),
    (nytimes_replies_after_df, '/content/gdrive/MyDrive/DL/Twitter/million/nytimes/processed/nytimes_after.json'),
)
for _frame, _out_path in _processed_outputs:
    _frame.to_json(_out_path, lines=True, orient="records")