In [1]:
import re
import json
import pandas as pd
import numpy as np
from tweet import config
import matplotlib.pyplot as plt
from collections import Counter
from itertools import combinations

In [2]:
def extractHashTags(tweets, preprocess=False):
    '''Extract # information from tweets'''
    pattern = "\#\w+"
    hashtag = []
    for t in tweets:
        if preprocess:
            raw_tags = re.findall(pattern, t)
            tags = []
            for t in raw_tags:
                if 'covid' in t.lower() or 'virus' in t.lower():
                    t = t[1:].lower().replace("_", "").replace("ãƒ¼", "")
                    tags.append("#" + t.title())
                else:
                    tags.append(t)
        else:
            tags = re.findall(pattern, t)
        hashtag.append(tags)
    return hashtag

def save_json(node_json, edge_json, node_path, edge_path):
    with open(node_path, 'w') as outfile:
        json.dump(node_json, outfile)
    with open(edge_path, 'w') as outfile:
        json.dump(edge_json, outfile)

In [3]:
## Use these two functions for processed node data
def count_nodes_edges(hashtag_list, keep=20):
    # Count all the nodes
    node_counts_full = Counter()
    for hashtags in hashtag_list:
        for h in hashtags:
            node_counts_full[h] += 1

    # Only keep the top `keep` tags
    top_tags_dict = sorted(node_counts_full.items(), key=lambda x:x[1], reverse=True)[:keep]
    node_counts = {k:v for k,v in top_tags_dict}
    top_tags = list(node_counts.keys())
    
    # Count the edges for top tags
    edge_counts = Counter()
    seen_tags = []
    for hashtags in hashtag_list:
        # Filter hashtags list
        hashtags_filtered = [h for h in hashtags if h in top_tags]
        # Count the edges
        tag_pairs = list(combinations(set(hashtags_filtered), r=2))
        for a,b in tag_pairs:
            edge_counts[(a,b)] += 1
            seen_tags.extend([a,b])
    
    # Update node_counts to exclude those with no edges
    for node in node_counts.keys():
        if node not in seen_tags:
            del node_counts[node]
    return node_counts, edge_counts

def create_json(node_counts, edge_counts):
    # Convert node to json format
    node_json = {'nodes':[]}
    for i, (n,count) in enumerate(node_counts.items()):
        node_json['nodes'].append({'tag_name':n,
                                   'tag_count':count,
                                   'id':i})

    # Create dict that maps tag to id
    node_2_id = {tag_d['tag_name']:tag_d['id'] for tag_d in node_json['nodes']}

    # Convert edges to json format
    edge_json = {'edges':[]}
    for (a,b), count in edge_counts.items():
        source, target = node_2_id[a], node_2_id[b]
        edge_json['edges'].append({'source_tag':a,
                                   'target_tag':b,
                                   'source':source,
                                   'target':target,
                                   'edge_count':count})
    return node_json, edge_json

In [4]:
# Use this block functions for processed node data

# Read in data
df = pd.read_csv(config.data / 'covid19_tweets_final.csv')
df = df.dropna(subset=['full_text'])
tweets = df['full_text']

# Create save path
tag_net_path = config.data / 'tag_network'
if not tag_net_path.exists(): 
    tag_net_path.mkdir()
node_path = tag_net_path / 'node.json'
edge_path = tag_net_path / 'edge.json'

# Create nodes and edges json
hashtag_list = extractHashTags(tweets, preprocess=True)
node_counts, edge_counts = count_nodes_edges(hashtag_list, keep=20)
node_json, edge_json = create_json(node_counts, edge_counts)
save_json(node_json, edge_json, node_path, edge_path)

with open(node_path, 'r') as outfile:
    node_json = json.load(outfile)
    
with open(edge_path, 'r') as outfile:
    edge_json = json.load(outfile)
    
print(node_json['nodes'][:5])
print(edge_json['edges'][:5])

[{'tag_name': '#Covid19', 'tag_count': 7669, 'id': 0}, {'tag_name': '#Coronavirus', 'tag_count': 3456, 'id': 1}, {'tag_name': '#Covid', 'tag_count': 956, 'id': 2}, {'tag_name': '#China', 'tag_count': 313, 'id': 3}, {'tag_name': '#lockdown', 'tag_count': 261, 'id': 4}]
[{'source_tag': '#Coronavirus', 'target_tag': '#China', 'source': 1, 'target': 3, 'edge_count': 109}, {'source_tag': '#Wuhan', 'target_tag': '#Coronavirus', 'source': 14, 'target': 1, 'edge_count': 65}, {'source_tag': '#pandemic', 'target_tag': '#Coronavirus', 'source': 5, 'target': 1, 'edge_count': 32}, {'source_tag': '#Wuhan', 'target_tag': '#China', 'source': 14, 'target': 3, 'edge_count': 30}, {'source_tag': '#BREAKING', 'target_tag': '#Coronavirus', 'source': 9, 'target': 1, 'edge_count': 23}]


# Use this function for getting data ready for d3

In [None]:
# # Use this function for getting data ready for d3
# def create_json(df, hashtag_list):
#     # Parse Nodes json
#     node_json = {'nodes':[]}

#     for hashtags, row_id, date in zip(hashtag_list, df['id'], df['date_short']):
#         for tag in hashtags:
#             node_info = {'row_id': row_id,
#                          'date': date,
#                          'tag': tag}
#             node_json['nodes'].append(node_info)

#     print("Total number of hashtags parsed:", len(node_json['nodes']))

#     # Parse Edges json
#     edge_json = {'edges':[]}
#     for hashtags, row_id, date in zip(hashtag_list, df['id'], df['date_short']):
#         # Generate tag pairs
#         tag_pairs = list(combinations(hashtags, r=2))
#         for a,b in tag_pairs:
#             edge_info = {'row_id': row_id,
#                          'date': date,
#                          'source_tag':a,
#                          'target_tag':b}
#             edge_json['edges'].append(edge_info)
#     print("Total number of edges parsed:", len(edge_json['edges']))
    
#     # Add node id to both dicts
#     unique_tags = list(set(tag['tag'] for tag in node_json['nodes']))
#     node_2_id = {t:i for i,t in enumerate(unique_tags)}

#     for node_info in node_json['nodes']:
#         node_info.update({"tag_id":node_2_id[node_info['tag']]})
        
#     for edge_info in edge_json['edges']:
#         source = node_2_id[edge_info['source_tag']]
#         target = node_2_id[edge_info['target_tag']]
#         edge_info.update({'source': source, 'target': target})
             
#     return node_json, edge_json

In [None]:
# # Read in data
# df = pd.read_csv(config.data / 'covid19_tweets_final.csv')

# # Create save path
# tag_net_path = config.data / 'tag_network'
# if not tag_net_path.exists(): 
#     tag_net_path.mkdir()
# node_path = tag_net_path / 'node.json'
# edge_path = tag_net_path / 'edge.json'

# # Create nodes and edges json
# hashtag_list = extractHashTags(df['full_text'], preprocess=True)
# node_json, edge_json = create_json(df, hashtag_list)
# save_json(node_json, edge_json, node_path, edge_path)

In [None]:
# with open(node_path, 'r') as outfile:
#     node_json = json.load(outfile)
    
# with open(edge_path, 'r') as outfile:
#     edge_json = json.load(outfile)
    
# print(node_json['nodes'][:5])
# print(edge_json['edges'][:5])