In [1]:
import json
from collections import defaultdict
from io import StringIO
from itertools import chain
from pprint import pprint
from random import sample

import jsonlines
import pandas as pd

from hope.twitter.graph import get_graphing_info
from hope.twitter.item import TweetItem

In [2]:
# Verbose flag: when True, the loading loop pretty-prints every node/edge it extracts.
DEBUG = False

# Accumulators filled while streaming the tweet sample:
#   node_set  - node ids already stored, used to de-duplicate nodes
#   all_nodes - node dicts, one per distinct id
#   all_edges - one edge payload per tweet that produced nodes
node_set, all_nodes, all_edges = set(), [], []

In [3]:
# Stream the sampled tweets and collect the graph nodes and edges each one yields.
with jsonlines.open("../data/2023Elections_sample.jsonl") as reader:
    for record in reader:
        # Drop the "_id" field before building the TweetItem. pop() with a
        # default copes with records that lack the key, and it avoids binding
        # `_`, which IPython uses for the previous cell's output.
        record.pop("_id", None)

        nodes, edge = get_graphing_info(TweetItem(**record), tweets_as_nodes=True)
        if not nodes:
            continue  # this tweet contributes nothing to the graph

        if DEBUG:
            pprint(nodes)
            pprint(edge)

        all_edges.append(edge)

        # Keep only the first occurrence of every node id.
        for node in nodes:
            if node["id"] not in node_set:
                all_nodes.append(node)
                node_set.add(node["id"])

In [4]:
# Number of edge payloads collected (one is appended per tweet that produced nodes).
len(all_edges)

293486

In [5]:
# Number of nodes collected after de-duplicating by node id.
len(all_nodes)

357547

In [6]:
# Peek at the first collected edge payload to check its structure.
all_edges[0]

[{'source': '1626899713382383616',
  'type': 'Tweet-Directed',
  'action': 'retweet',
  'target': '1626882679995092992',
  'weight': 1}]

In [7]:
# Peek at the first collected node to check its structure.
all_nodes[0]

{'id': '1626899713382383616',
 'tweet': 'RT @IAOkowa: We are counting down to #NigeriaDecides - only 6 days and 21 hours to go! \n\nToday, we are in Adamawa State, the birthplace of…',
 'user_id': '1590006005420756993',
 'timestamp': '2023-02-18T11:01:32Z'}

In [6]:
# Zeroed interaction counters that are merged into every node as defaults.
actions = dict.fromkeys(("n_reply", "n_retweet", "n_quote"), 0)

In [7]:
# Index the nodes by id; each value is a fresh copy of the node extended with
# the zeroed counters from `actions`.
node_mapping = dict(
    (entry["id"], dict(entry, **actions)) for entry in all_nodes
)

In [8]:
# Tally, for every edge, the matching n_<action> counter on the edge's target node.
# TODO: Formalize into reusable function
for link in chain.from_iterable(all_edges):
    tallies = node_mapping[link["target"]]
    counter_key = f"n_{link['action']}"
    tallies[counter_key] += 1

In [28]:
# Inspect one randomly chosen node together with its interaction counters.
# The counters live in node_mapping (which holds copies of the nodes), so the
# entry is looked up there; the dicts in all_nodes are never updated.
idx = sample(range(len(all_nodes)), k=1)[0]     # Get random index
pprint(node_mapping[all_nodes[idx]["id"]])

{'id': '1629059859311218693',
 'n_retweet': 1,
 'timestamp': '',
 'tweet': '',
 'user_id': '1482343842414309377'}


In [9]:
# Persist the counted nodes as JSON Lines: one JSON object per line.
with open("../data/2023Elections_sample_nodes.jsonl", 'w') as out_file:
    out_file.writelines(
        json.dumps(record) + "\n" for record in node_mapping.values()
    )

In [10]:
# Persist every individual edge (payloads flattened) as JSON Lines.
with open("../data/2023Elections_sample_edges.jsonl", "w") as sink:
    for edge_record in chain.from_iterable(all_edges):
        sink.write(f"{json.dumps(edge_record)}\n")

In [2]:
# Column -> dtype lookup passed as `dtype` to the pd.read_json calls below;
# any key looked up with [] yields "str".
# NOTE(review): dict.get() does not trigger a defaultdict's factory — confirm
# that read_json really applies the "str" default to every column.
types = defaultdict(lambda: "str")

In [3]:
# Convert the node JSON Lines export to CSV (row index omitted).
(
    pd.read_json(
        "../data/2023Elections_sample_nodes.jsonl",
        lines=True,
        dtype=types,
    )
    .to_csv("../data/2023Elections_sample_nodes.csv", index=False)
)

In [4]:
# Convert the edge JSON Lines export to CSV (row index omitted).
(
    pd.read_json(
        "../data/2023Elections_sample_edges.jsonl",
        lines=True,
        dtype=types,
    )
    .to_csv("../data/2023Elections_sample_edges.csv", index=False)
)

### Further Sampling Nodes & Edges

In [22]:
# Settings for the second pass over a smaller random subsample.
SUBSAMPLE_SIZE = 1_000  # number of rows drawn from the sample file
DEBUG = False           # True -> pretty-print every extracted node/edge

In [18]:
# Draw SUBSAMPLE_SIZE random rows from the sample file. The seed is fixed so
# that re-running the notebook selects the same rows.
SUBSAMPLE_SEED = 42
subsample = pd.read_json(
    "../data/2023Elections_sample.jsonl", dtype=types, lines=True
).sample(SUBSAMPLE_SIZE, random_state=SUBSAMPLE_SEED)

In [19]:
# Preview the first rows of the subsample.
subsample.head()

Unnamed: 0,tweet_id,user_id,user_created_at,user_screen_name,timestamp,text,is_retweet,rt_id,rt_timestamp,rt_user_id,...,is_quote,qt_id,qt_timestamp,qt_user_id,qt_user_created_at,qt_user_screen_name,is_reply,in_reply_to_tweet_id,in_reply_to_user_id,in_reply_to_user_screen_name
60021,1627677996235558913,904923386,{'$date': '2012-10-25T23:56:13Z'},jeffito202,{'$date': '2023-02-20T14:34:09Z'},RT @4GOODGEOFF: @Gistloversblog1 This is the r...,True,1627641449134141442,{'$date': '2023-02-20T12:08:56Z'},372537576,...,,,,,,,,,,
290911,1629395762718076928,1301098347051331584,{'$date': '2020-09-02T10:04:00Z'},JojoNitq,{'$date': '2023-02-25T08:19:57Z'},RT @firstladyship: Peter Obi on his way to his...,True,1629384992554926080,{'$date': '2023-02-25T07:37:09Z'},201237617,...,,,,,,,,,,
131342,1628327238327320577,1620020785535455232,{'$date': '2023-01-30T11:27:16Z'},ifeanyiNwabue17,{'$date': '2023-02-22T09:34:01Z'},RT @AdrianOdogwu: In Atiku's mandate we trust\...,True,1627988360441520134,{'$date': '2023-02-21T11:07:26Z'},1572251945334603776,...,True,1.6279708841651569e+18,{'$date': '2023-02-21T09:57:59Z'},1.6200450730637391e+18,{'$date': '2023-01-30T13:03:48Z'},akinfemi08125,,,,
48579,1627332485083267080,2451709839,{'$date': '2014-04-01T06:59:46Z'},Da_caresser,{'$date': '2023-02-19T15:41:13Z'},RT @Sports_Doctor2: I have tears in my eyes.\n...,True,1627231920516919297,{'$date': '2023-02-19T09:01:37Z'},752140837614678016,...,,,,,,,,,,
132978,1628340862324293632,1169185705521754112,{'$date': '2019-09-04T09:49:47Z'},oboybob1,{'$date': '2023-02-22T10:28:09Z'},RT @asemota: I like this! He wants to seal the...,True,1628074931434012674,{'$date': '2023-02-21T16:51:26Z'},5404442,...,True,1.6280349544698184e+18,{'$date': '2023-02-21T14:12:35Z'},3411882591.0,{'$date': '2015-08-10T05:02:18Z'},MissPearls,,,,


In [24]:
# Rebuild the node/edge collections from scratch, this time from the subsample only.
node_set = set()
all_nodes = []
all_edges = []

for _row_label, row in subsample.iterrows():
    row_dict = row.to_dict()
    # Drop the "_id" field before building the TweetItem. pop() with a default
    # copes with rows that lack the key, and it avoids binding `_`, which
    # IPython uses for the previous cell's output.
    row_dict.pop("_id", None)

    nodes, edge = get_graphing_info(TweetItem(**row_dict), tweets_as_nodes=True)
    if not nodes:
        continue  # this tweet contributes nothing to the graph

    if DEBUG:
        pprint(nodes)
        pprint(edge)

    all_edges.append(edge)

    # Keep only the first occurrence of every node id.
    for node in nodes:
        if node["id"] not in node_set:
            all_nodes.append(node)
            node_set.add(node["id"])

In [25]:
# Zeroed interaction counters that are merged into every subsample node as defaults.
actions = dict.fromkeys(["n_reply", "n_retweet", "n_quote"], 0)

In [26]:
# Index the subsample nodes by id; each value is a fresh copy of the node
# extended with the zeroed counters from `actions`.
node_mapping = dict(
    (entry["id"], dict(entry, **actions)) for entry in all_nodes
)

In [27]:
# Tally, for every edge, the matching n_<action> counter on the edge's target node.
# TODO: Formalize into reusable function
for link in chain.from_iterable(all_edges):
    tallies = node_mapping[link["target"]]
    counter_key = f"n_{link['action']}"
    tallies[counter_key] += 1

In [28]:
# Serialise the counted nodes to an in-memory JSON Lines string (one object per line).
node_json = "\n".join(map(json.dumps, node_mapping.values()))

In [32]:
# Serialise every individual edge (payloads flattened) to an in-memory JSON Lines string.
edge_json = "\n".join(
    json.dumps(edge_record) for edge_record in chain.from_iterable(all_edges)
)

In [31]:
# Parse the in-memory node JSON Lines and export it as CSV (row index omitted).
# The text is wrapped in StringIO because passing a literal JSON string to
# read_json is deprecated (pandas >= 2.1).
pd.read_json(StringIO(node_json), lines=True, dtype=types).to_csv("../data/2023Elections_subsample_nodes.csv", index=False)

In [33]:
# Parse the in-memory edge JSON Lines and export it as CSV (row index omitted).
# The text is wrapped in StringIO because passing a literal JSON string to
# read_json is deprecated (pandas >= 2.1).
pd.read_json(StringIO(edge_json), lines=True, dtype=types).to_csv("../data/2023Elections_subsample_edges.csv", index=False)