In [None]:
import networkx as nx
from matplotlib import pyplot as plt
from random import random
import json
import pickle
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import pandas as pd
from datetime import datetime
from statistics import mean
import requests


In [None]:
# Load the pre-computed artefacts from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so only
# run this cell on .pkl files that were produced by this project itself.
# presumably a collection of (user_id, attribute_dict) entries -- TODO confirm
with open('nodes_with_centralities.pkl', 'rb') as f:
    nodes = pickle.load(f)
# a look-up dictionary: tweets_id[user_id] = [IDs for all tweets from this user]
with open('tweets_id.pkl', 'rb') as f:
    tweets_id = pickle.load(f)
# a look-up dictionary: hashtags[user_id] = [all hashtags used by this user]
# repetitive hashtags are saved because the frequency or percentage for each hashtag might be of interest
with open('hashtags.pkl', 'rb') as f:
    hashtags = pickle.load(f)
# a look-up dictionary: urls[user_id] = [all urls linked by this user]
with open('urls.pkl', 'rb') as f:
    urls = pickle.load(f)

# the edge collection that is fed to the directed graph
with open('links.pkl', 'rb') as f:
    links = pickle.load(f)

# Raw tweet export. The encoding is stated explicitly so that decoding does not
# depend on the platform default (e.g. cp1252 on Windows), which can fail on the
# emoji and non-Latin text that tweets contain.
with open('tweets_hashtag_COP27_exclRetweets_2022-11-06_20.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [None]:
# load the graph
# A directed graph is built from the unpickled node and edge collections.
dG = nx.DiGraph()
# presumably each entry of `nodes` is a (user_id, attribute_dict) pair, so the stored
# centrality / engagement values end up as node attributes -- TODO confirm
dG.add_nodes_from(nodes)
dG.add_edges_from(links)

### Get the top influencers under different metrics

In [None]:
# possible metrics: engagement, degree, closeness, eigenvector, betweenness
def get_top5_influensers(metric):
    """Return the five highest-ranked users for `metric` with their hashtag and URL usage.

    Parameters
    ----------
    metric : str
        'engagement' ranks users by reply_count + quote_count + retweet_count +
        like_count. Any other value is used directly as the name of the node
        attribute to rank by (e.g. 'degree', 'closeness', 'eigenvector',
        'betweenness').

    Returns
    -------
    tuple
        (top5_influensers, hashtag_dict, url_dict):
        * top5_influensers -- list of at most five node ids, highest score first;
        * hashtag_dict[user] -- list of (hashtag, frequency) pairs, most frequent
          first, with hashtags lower-cased before counting;
        * url_dict[user] -- list of (url, frequency) pairs, most frequent first.
        Every returned user has an entry in both dictionaries; a user without
        recorded hashtags or URLs gets an empty list.

    Raises
    ------
    KeyError
        If a node lacks the attribute(s) needed to compute its score.
    """
    engagement_fields = ('reply_count', 'quote_count', 'retweet_count', 'like_count')

    def score(user):
        attrs = dG.nodes[user]
        if metric == 'engagement':
            return sum(attrs[field] for field in engagement_fields)
        return attrs[metric]

    # Sort ascending, then walk backwards from the end to take (at most) the last five.
    top5_influensers = sorted(dG.nodes(), key=score)[-1:-6:-1]

    hashtag_dict = {}
    url_dict = {}
    for user in top5_influensers:
        # A bare `except: pass` used to guard these look-ups. It hid every error and
        # dropped the user's URL entry whenever only the hashtag look-up failed.
        # Missing users are now handled explicitly and independently.
        user_hashtags = hashtags.get(user) or []
        user_urls = urls.get(user) or []
        hashtag_dict[user] = Counter(tag.lower() for tag in user_hashtags).most_common()
        url_dict[user] = Counter(user_urls).most_common()
    return (top5_influensers, hashtag_dict, url_dict)

In [None]:
def get_engagement_count(user_list):
    """Return the total engagement of each user in `user_list`, in the same order.

    A user's engagement is the sum of the 'reply_count', 'quote_count',
    'retweet_count' and 'like_count' attributes stored on its node in the
    module-level graph `dG`.
    """
    def total_engagement(attrs):
        return attrs['reply_count'] + attrs['quote_count'] + attrs['retweet_count'] + attrs['like_count']

    return [total_engagement(dG.nodes[uid]) for uid in user_list]

In [None]:
# get_top5_influensers(metric) returns a 3-tuple: (user_ids, hashtag_counter, url_counter)
top5 = get_top5_influensers('betweenness')
top5_user_ids = top5[0]
print(top5_user_ids)  # user IDs of the top 5 users under the chosen metric
print(get_engagement_count(top5_user_ids))  # their engagement counts, in the same order
# print(top5[1])
# print(top5[2])

#### Visualizing the top users' hashtags / URLs

In [None]:
def get_user_hashtag_dist(user_rank):
    """Return the hashtag distribution of one ranked user as a DataFrame.

    Reads the module-level `top5` tuple: `top5[0]` is the ranked list of user ids
    and `top5[1]` maps each user id to (hashtag, frequency) pairs. `user_rank`
    indexes into the ranked list (0 = top user). The result has the columns
    'hashtags' and 'frequency'.
    """
    ranked_ids, hashtags_by_user = top5[0], top5[1]
    selected_user = ranked_ids[user_rank]
    return pd.DataFrame(hashtags_by_user[selected_user], columns=['hashtags', 'frequency'])

In [None]:
def get_user_url_dist(user_rank):
    """Return the URL distribution of one ranked user as a DataFrame.

    Reads the module-level `top5` tuple: `top5[0]` is the ranked list of user ids
    and `top5[2]` maps each user id to (url, frequency) pairs. `user_rank` indexes
    into the ranked list (0 = top user). The result has the columns 'urls' and
    'frequency'.
    """
    ranked_ids, urls_by_user = top5[0], top5[2]
    selected_user = ranked_ids[user_rank]
    return pd.DataFrame(urls_by_user[selected_user], columns=['urls', 'frequency'])

In [None]:
# Choose the user whose hashtag distribution is plotted by rank number: 0 is the top user.
df_to_visulize = get_user_hashtag_dist(0)
ax = df_to_visulize.plot.bar(x='hashtags', figsize=(15, 5), legend=False)
ax.set_ylabel('Frequency')
ax.set_xlabel('Hashtags')

In [None]:
# Choose the user whose URL distribution is plotted by rank number: 0 is the top user.
df_to_visulize = get_user_url_dist(1)
ax = df_to_visulize.plot.bar(x='urls', figsize=(15, 5), legend=False)
ax.set_ylabel('Frequency')
ax.set_xlabel('URLs')

### The user who tweets the most

In [None]:
# the five users with the most tweets, most active first
sorted(tweets_id, key=lambda uid: len(tweets_id[uid]))[::-1][:5]

In [None]:
# number of tweets posted by one specific user; the user id is hard-coded here
# (presumably copied from the ranking in the previous cell -- verify before reuse)
len(tweets_id['1252764865'])

In [None]:
# Walk through the raw tweets and skip everything not written by the chosen user;
# uncomment the print call to display that user's tweets.
for tweet in data['data']:
    if tweet['author_id'] != '1252764865':
        continue
    # print(tweet)

### How many distinct hashtags there are

In [None]:
# Count how often each lower-cased hashtag occurs across all users in `nodes`.
hashtag_counter = Counter()
for node in nodes:
    # node[0] is used as the user id. Users with no entry in `hashtags` simply
    # contribute nothing. This replaces a bare `except: pass` that also hid every
    # unrelated error raised inside the loop.
    user_hashtags = hashtags.get(node[0]) or []
    # update() adds the counts in place; building a temporary Counter and using
    # `+=` made an extra full pass over the accumulated counter for every user.
    hashtag_counter.update(tag.lower() for tag in user_hashtags)


In [None]:
# number of distinct hashtags; the keys were lower-cased when counting, so the
# comparison is case-insensitive
len(hashtag_counter)

### How many times each hashtag has been posted

In [None]:
# full list in the format [(hashtag, frequency)], most frequent hashtag first
hashtag_counter.most_common()

### Total number of hashtag occurrences in the dataset

In [None]:
# total number of hashtag occurrences = sum of the per-hashtag frequencies
hashtags_sum = sum(hashtag_counter.values())
hashtags_sum

### Top 30 Hashtags among all users (excluding #cop27)

In [None]:
# The section heading promises the top 30 hashtags *excluding #cop27*. Dropping
# index 0 only achieves that while cop27 happens to be the most frequent hashtag,
# so the hashtag is filtered out by name instead. Keys in hashtag_counter are
# lower-cased; a leading '#' is tolerated in case the stored tags carry one.
EXCLUDED_HASHTAG = 'cop27'
ranked_hashtags = [
    (hashtag, frequency)
    for (hashtag, frequency) in hashtag_counter.most_common()
    if hashtag.lstrip('#') != EXCLUDED_HASHTAG
]
top30_hashtags = ranked_hashtags[:30]
# preview the first ten of them
print(top30_hashtags[:10])
hashtag_df = pd.DataFrame(top30_hashtags, columns=['hashtags', 'frequency'])

In [None]:
# bar chart of the frequencies held in hashtag_df, one bar per hashtag
ax = hashtag_df.plot.bar(x='hashtags', figsize=(15, 5), legend=False)
ax.set_ylabel('Frequency')
ax.set_xlabel('Hashtags')

In [None]:
# hashtags in top30_hashtags for which str.isalpha() is False, i.e. those that
# contain at least one non-alphabetic character (digits, underscores, ...)
[tag for tag, _ in top30_hashtags if not tag.isalpha()]

## Get hashtags per user-ID

In [None]:
user_id = '73882819'
# ranks 2-6 of this user's lower-cased hashtags; the most frequent one is skipped
Counter(tag.lower() for tag in hashtags[user_id]).most_common()[1:6]

### Temporal tweets distribution

In [None]:
# one (tweet id, calendar date) pair per tweet; the time-of-day part of the
# 'created_at' timestamp is discarded by .date()
temporal_list = [
    (tweet['id'], datetime.strptime(tweet['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ").date())
    for tweet in data['data']
]

In [None]:
# tabular form of temporal_list: one row per tweet with its id and posting date
temporal_df = pd.DataFrame(temporal_list, columns=['tweet', 'date'])

In [None]:
# count the tweets of each calendar day, then draw the daily counts as bars
tweets_per_day = temporal_df.groupby(['date']).size()
ax2 = tweets_per_day.plot(kind='bar', x='date', figsize=(15, 5), legend=False)
ax2.set_ylabel('Amount of tweets')
ax2.set_xlabel('Date')

### Sentiment analysis


#### Old implementation

In [None]:
# nlp = spacy.load('en_core_web_md')
# nlp.add_pipe('spacytextblob')

In [None]:
# def convert_sentiment(data):
#     tweet_sentiments = [0, 0, 0]
#     for polarity in data:
#         if polarity < 0:
#             tweet_sentiments[0] += 1
#         elif polarity == 0:
#             tweet_sentiments[1] += 1
#         else:
#             tweet_sentiments[2] += 1
#     return tweet_sentiments


In [None]:
# sentiments = []
# subjectivity = []
# for tweet in data['data']:
#     if tweet['id'] in tweets_id['1498985846078377985']:
#         doc = nlp(tweet['text'])
#         sentiments.append(doc._.blob.polarity)
#         subjectivity.append(doc._.blob.subjectivity)


In [None]:
# # The average subjectivity for a certain user, the higher the value, the more subjective the user's speech 
# mean(subjectivity)

In [None]:
# y = convert_sentiment(sentiments)

# mylabels = ["Negative", "Neutral", "Positive"]
# myexplode = [0, 0, 0]
# plt.pie(y, explode = myexplode)
# total = sum(y)
# plt.legend(
#     loc='lower right',
#     labels=['%s, %1.2f%%' % (l, (float(s) / total) * 100) for l, s in zip(mylabels, y)],
#     prop={'size': 11},
#     bbox_to_anchor=(1.1, 0.7),
#     bbox_transform=plt.gcf().transFigure
# )
# plt.show() 

#### New implementation (API mentioned in the lecture)

In [None]:
def get_sentiment(text, timeout=10):
    """Classify `text` with the text-processing.com sentiment API.

    Parameters
    ----------
    text : str
        The text to analyse; it is sent as the form field 'text'.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests` may block forever on an unresponsive server.

    Returns
    -------
    The value of the 'label' field of the JSON response (presumably one of
    'pos', 'neg' or 'neutral' -- confirm against the API documentation).

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within `timeout` seconds.
    requests.exceptions.HTTPError
        If the server answers with an error status, e.g. when the request limit
        is exceeded. Previously this surfaced as an opaque JSON decoding error.
    """
    url = 'http://text-processing.com/api/sentiment/'
    # The payload is passed as `data`, i.e. as a form-encoded request body.
    resp = requests.post(url, data={'text': text}, timeout=timeout)
    resp.raise_for_status()
    return resp.json()['label']


In [None]:
# Label the tweets of one user through the sentiment web API.
# pass in the user id you want to do sentiment analysis
SENTIMENT_USER_ID = '1498985846078377985'
# set the maximum number of tweets to be analyzed to 100 to avoid being blocked by the API
MAX_API_CALLS = 100

# Look the user's tweet ids up once and keep them in a set. The look-up used to be
# repeated, with a linear list search, for every tweet in the dataset.
user_tweet_ids = set(tweets_id[SENTIMENT_USER_ID])

sentiments2 = []
counter = 0
for tweet in data['data']:
    if counter == MAX_API_CALLS:
        break
    if tweet['id'] in user_tweet_ids:
        label = get_sentiment(tweet['text'])
        sentiments2.append(label)
        counter += 1

In [None]:
def convert_sentiment2(data):
    """Convert an iterable of sentiment labels into frequency counts.

    Returns a three-element list [negative, neutral, positive]: labels equal to
    'neg' are counted as negative, labels equal to 'neutral' as neutral, and every
    other label as positive.
    """
    labels = list(data)
    negative = sum(1 for label in labels if label == 'neg')
    neutral = sum(1 for label in labels if label == 'neutral')
    return [negative, neutral, len(labels) - negative - neutral]

In [None]:
# Pie chart of one user's sentiment distribution; the legend shows each class
# together with its share of the analysed tweets.
mylabels = ["Negative", "Neutral", "Positive"]
myexplode = [0, 0, 0]
y = convert_sentiment2(sentiments2)
plt.pie(y, explode=myexplode)
total = sum(y)
legend_entries = ['%s, %1.2f%%' % (l, (float(s) / total) * 100) for l, s in zip(mylabels, y)]
legend_options = {
    'loc': 'lower right',
    'labels': legend_entries,
    'prop': {'size': 11},
    'bbox_to_anchor': (1.1, 0.7),
    'bbox_transform': plt.gcf().transFigure,
}
plt.legend(**legend_options)
plt.show()

### Central hub visualization

In [None]:
# the most central node under the chosen metric is the first id of the ranking
closeness_ranking = get_top5_influensers('closeness')[0]
top_influenser = closeness_ranking[0]
dG.nodes[top_influenser]

In [None]:
# Nodes at distance 1 from the central node (top_influenser).
# descendants_at_distance() leaves out the source node itself, so it is unioned in.
reachable_nodes = nx.descendants_at_distance(dG, top_influenser, 1) | {top_influenser}
# Copy the subgraph induced by those nodes, so that it can be modified on its own.
central_hub = dG.subgraph(reachable_nodes).copy()
# Drop isolated nodes; they are collected into a list first because the removal
# changes the graph that nx.isolates() iterates over.
isolated_nodes = list(nx.isolates(central_hub))
central_hub.remove_nodes_from(isolated_nodes)

In [None]:
# nx.info() was deprecated and then removed in NetworkX 3.0, so the summary is
# built from graph methods that are available in every NetworkX version.
'%s with %d nodes and %d edges' % (
    type(central_hub).__name__,
    central_hub.number_of_nodes(),
    central_hub.number_of_edges(),
)

In [None]:
def show_graph_node_centrality(graph, centrality_measure, label_offset=0.08, plot_margin=0.1, show_edge_attribute=False, seed=None):
    """Draw `graph` with its nodes coloured by a centrality (or engagement) value.

    Parameters
    ----------
    graph : networkx graph
        Its nodes must carry the attribute named by `centrality_measure`, or, for
        'engagement', the attributes 'reply_count', 'quote_count',
        'retweet_count' and 'like_count'.
    centrality_measure : str
        One of 'betweenness', 'eigenvector', 'degree', 'closeness', 'engagement'.
    label_offset : float, optional
        Currently unused; kept so that existing calls remain valid.
    plot_margin : float, optional
        Padding added around the outermost node positions on both axes.
    show_edge_attribute : bool, optional
        If True, every edge is labelled with its 'weight' attribute.
    seed : int or None, optional
        Forwarded to `nx.spring_layout`. Pass an integer to get the same layout on
        every run; None (the default) keeps the layout random.

    Raises
    ------
    ValueError
        If `centrality_measure` is not one of the supported values.
    """
    supported_measures = ('betweenness', 'eigenvector', 'degree', 'closeness', 'engagement')
    # Validate first, so that a bad argument neither leaves an empty figure behind
    # nor pays for a layout computation.
    if centrality_measure not in supported_measures:
        raise ValueError(
            'unsupported centrality_measure %r; expected one of: %s'
            % (centrality_measure, ', '.join(supported_measures))
        )

    fig, ax = plt.subplots(figsize=(10, 7))
    pos = nx.spring_layout(graph, seed=seed)

    # set nodes' color theme based on the centrality measure taken
    if centrality_measure == 'engagement':
        node_colors = [d['reply_count'] + d['quote_count'] + d['retweet_count'] + d['like_count'] for (n, d) in graph.nodes(data = True)]
    else:
        node_colors = [d[centrality_measure] for (n, d) in graph.nodes(data = True)]
    # Everything is drawn on `ax` explicitly rather than on whichever axes happen
    # to be current.
    nx.draw_networkx_nodes(graph, pos, node_size=30, node_color=node_colors, ax=ax)
    # uncomment the below 2 lines to enable dynamic node size
    # node_size = [d['retweet_count'] for (n, d) in graph.nodes(data = True)]
    # nx.draw_networkx_nodes(graph, pos, node_size=node_size, node_color=node_colors, ax=ax)

    nx.draw_networkx_edges(graph, pos, edgelist = graph.edges(), ax=ax)

    if show_edge_attribute:
        edge_labels = {(u, v): d['weight'] for (u, v, d) in graph.edges(data=True)}
        nx.draw_networkx_edge_labels(graph, pos, edge_labels, ax=ax)

    # Fit the axes to the node positions plus a margin. An empty graph has no
    # positions, and min()/max() would fail on the empty lists.
    if pos:
        xs = [p[0] for p in pos.values()] # extract all x...
        ys = [p[1] for p in pos.values()] # ...and y values from the node positions
        ax.set_xlim((min(xs) - plot_margin, max(xs) + plot_margin))
        ax.set_ylim((min(ys) - plot_margin, max(ys) + plot_margin))
    plt.show()

In [None]:
# Draw the central hub with its nodes coloured by closeness.
# possible metric to set the color theme of the nodes: 'betweenness', 'eigenvector', 'degree', 'closeness', 'engagement'
show_graph_node_centrality(central_hub, 'closeness')