### Import libraries and the dataset

In [None]:
import networkx as nx
from matplotlib import pyplot as plt
from random import random
import json
import pickle

In [None]:
# Load the tweet dump. The encoding is given explicitly so that decoding the
# tweet text does not depend on the platform's default locale encoding
# (JSON interchange text is expected to be UTF-8).
with open('tweets_hashtag_COP27_exclRetweets_2022-11-06_20.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# number of tweets in the dataset
print(len(data['data']))

# record the id of every tweet in the dataset
tweet_id_list = [tweet['id'] for tweet in data['data']]

### Functions

In [None]:
def get_nodes(data):
    """Build one graph node per tweet author with summed public metrics.

    Args:
        data: Mapping whose ``'data'`` entry is an iterable of tweet dicts.
            Each tweet must provide ``'author_id'`` and ``'public_metrics'``
            (a dict of numeric counters).

    Returns:
        A list of ``(author_id, metrics)`` tuples, ordered by each author's
        first appearance. ``metrics`` is a new dict holding, for every key
        seen in any of that author's tweets, the sum of its values; a key
        missing from a tweet counts as 0.

    Raises:
        KeyError: If a tweet lacks ``'author_id'`` or ``'public_metrics'``.
    """
    # Keyed by author for O(1) lookup; dicts keep insertion order, so the
    # result is still ordered by first appearance.
    totals = {}
    for tweet in data['data']:
        # A fresh dict per author avoids aliasing the input's metrics dict.
        author_totals = totals.setdefault(tweet['author_id'], {})
        for key, value in tweet['public_metrics'].items():
            author_totals[key] = author_totals.get(key, 0) + value
    return list(totals.items())



In [None]:
def get_nodes(data):
    """Build one graph node per tweet author with summed public metrics.

    Args:
        data: Mapping whose ``'data'`` entry is an iterable of tweet dicts.
            Each tweet must provide ``'author_id'`` and ``'public_metrics'``
            (a dict of numeric counters).

    Returns:
        A list of ``(author_id, metrics)`` tuples, ordered by each author's
        first appearance. ``metrics`` is a new dict holding, for every key
        seen in any of that author's tweets, the sum of its values; a key
        missing from a tweet counts as 0.

    Raises:
        KeyError: If a tweet lacks ``'author_id'`` or ``'public_metrics'``.
    """
    # Keyed by author for O(1) lookup; dicts keep insertion order, so the
    # result is still ordered by first appearance.
    totals = {}
    for tweet in data['data']:
        # A fresh dict per author avoids aliasing the input's metrics dict.
        author_totals = totals.setdefault(tweet['author_id'], {})
        for key, value in tweet['public_metrics'].items():
            author_totals[key] = author_totals.get(key, 0) + value
    return list(totals.items())



In [None]:
def get_tweets_id(data):
    """Group tweet ids by the author who posted them.

    Args:
        data: Mapping whose ``'data'`` entry is an iterable of tweet dicts
            providing ``'author_id'`` and ``'id'``.

    Returns:
        A dict mapping each author id to the list of that author's tweet
        ids, in the order the tweets appear in ``data['data']``.
    """
    ids_by_author = {}
    for tweet in data['data']:
        # setdefault creates the author's list the first time they are seen
        ids_by_author.setdefault(tweet['author_id'], []).append(tweet['id'])
    return ids_by_author

In [None]:
def get_hashtags(data):
    """Collect the hashtags used by each author.

    Args:
        data: Mapping whose ``'data'`` entry is an iterable of tweet dicts.
            Tweets without ``'entities'`` or without ``'hashtags'`` inside
            it are ignored.

    Returns:
        A dict mapping author id to a list of hashtag ``'tag'`` values
        (duplicates kept, in order of appearance). Authors who never used
        a hashtag have no entry.
    """
    tags_by_author = {}
    for tweet in data['data']:
        # guard clause: skip tweets that carry no hashtag entities
        if 'entities' not in tweet or 'hashtags' not in tweet['entities']:
            continue
        for entry in tweet['entities']['hashtags']:
            tags_by_author.setdefault(tweet['author_id'], []).append(entry['tag'])
    return tags_by_author

In [None]:
def get_urls(data):
    """Collect the URLs shared by each author.

    Args:
        data: Mapping whose ``'data'`` entry is an iterable of tweet dicts.
            Tweets without ``'entities'`` or without ``'urls'`` inside it
            are ignored.

    Returns:
        A dict mapping author id to a list of the ``'url'`` values from
        that author's tweets (duplicates kept, in order of appearance).
        Authors who never shared a URL have no entry.
    """
    urls_by_author = {}
    for tweet in data['data']:
        # guard clause: skip tweets that carry no url entities
        if 'entities' not in tweet or 'urls' not in tweet['entities']:
            continue
        for entry in tweet['entities']['urls']:
            urls_by_author.setdefault(tweet['author_id'], []).append(entry['url'])
    return urls_by_author

In [None]:
def get_links(data, existing_authors):
    """Build weighted directed edges from user mentions.

    An edge ``author -> mentioned user`` is produced for every mention whose
    target is in ``existing_authors`` and is not the tweet's own author.
    Repeated mentions of the same target by the same author increase the
    edge weight by 1 each time.

    Args:
        data: Mapping whose ``'data'`` entry is an iterable of tweet dicts.
            Tweets without ``'entities'`` or without ``'mentions'`` inside
            it are ignored.
        existing_authors: Container of author ids supporting ``in``; a set
            or dict keys view keeps each membership test O(1).

    Returns:
        A list of ``(source_id, target_id, {'weight': n})`` tuples, ordered
        by the first occurrence of each (source, target) pair.
    """
    # Keyed by (source, target) so a repeated mention is an O(1) update
    # instead of a scan over all edges; insertion order is kept by the dict.
    edge_weights = {}
    for tweet in data['data']:
        if 'entities' not in tweet or 'mentions' not in tweet['entities']:
            continue
        for mention in tweet['entities']['mentions']:
            target = mention['id']
            # ignore mentions of users outside the dataset and self-mentions
            if target in existing_authors and target != tweet['author_id']:
                edge = (tweet['author_id'], target)
                edge_weights[edge] = edge_weights.get(edge, 0) + 1
    return [
        (source, target, {'weight': weight})
        for (source, target), weight in edge_weights.items()
    ]

### Graph conversion

In [None]:
# extract useful information from the original dataset:
#   nodes     - (author_id, summed public_metrics) tuples, one per author
#   tweets_id - author_id -> list of tweet ids
#   hashtags  - author_id -> list of hashtag tags
#   urls      - author_id -> list of urls
#   links     - (author_id, mentioned_id, {'weight': n}) tuples; only users who
#               authored a tweet in the dataset (the keys of tweets_id) are
#               accepted as mention targets
nodes = get_nodes(data)
tweets_id = get_tweets_id(data)
hashtags = get_hashtags(data)
urls = get_urls(data)
links = get_links(data, tweets_id.keys())

In [None]:
# construct the directed graph: authors become nodes (carrying their summed
# public metrics as node attributes) and mentions become weighted edges
dG = nx.DiGraph()
dG.add_nodes_from(nodes)
dG.add_edges_from(links)

In [None]:
# remove all isolated nodes (authors with no mention edge in either direction);
# the isolates are materialised into a list first so the graph is not modified
# while it is being iterated
dG.remove_nodes_from(list(nx.isolates(dG)))

In [None]:
# nx.info() was removed in NetworkX 3.0, so print the graph's size summary
# through methods that exist in every NetworkX version.
print(f"{type(dG).__name__} with {dG.number_of_nodes()} nodes and {dG.number_of_edges()} edges")

In [None]:
# node -> degree as reported by dG.degree(), stored on each node as 'degree'
degree_dict = {node: degree for node, degree in dG.degree()}
nx.set_node_attributes(dG, degree_dict, 'degree')

In [None]:
# closeness centrality of every node, stored on each node as 'closeness'
# NOTE(review): computed on the directed graph with NetworkX defaults — confirm
# the distance direction used matches the intended interpretation
closeness_centrality = nx.closeness_centrality(dG)
nx.set_node_attributes(dG, closeness_centrality, 'closeness')

In [None]:
# eigenvector centrality of every node, stored on each node as 'eigenvector'
# NOTE(review): edge weights are not passed explicitly here, and behaviour on
# graphs that are not strongly connected may differ between NetworkX versions —
# confirm against the installed version
eigenvector_centrality = nx.eigenvector_centrality_numpy(dG)
nx.set_node_attributes(dG, eigenvector_centrality, 'eigenvector')

In [None]:
# betweenness centrality of every node, stored on each node as 'betweenness'
# (called with default arguments, so no weight attribute is passed explicitly)
between_centrality = nx.betweenness_centrality(dG)
nx.set_node_attributes(dG, between_centrality, 'betweenness')

In [None]:
# save to disk:
# with open('nodes_with_centralities.pkl', 'wb') as f:
#     pickle.dump(dG.nodes(data=True), f)
# with open('nodes.pkl', 'wb') as f:
#     pickle.dump(nodes, f)
# with open('tweet_id_list.pkl', 'wb') as f:
#     pickle.dump(tweet_id_list, f)
# with open('hashtags.pkl', 'wb') as f:
#     pickle.dump(hashtags, f)
# with open('urls.pkl', 'wb') as f:
#     pickle.dump(urls, f)
# with open('links.pkl', 'wb') as f:
#     pickle.dump(dG.edges(data=True), f)