In [1]:
import re
import json
import pandas as pd
import numpy as np
from tweet import config
import matplotlib.pyplot as plt
from collections import Counter
from itertools import combinations

In [2]:
def extractHashTags(tweets, preprocess=False):
    '''Extract # information from tweets'''
    pattern = "\#\w+"
    hashtag = []
    for t in tweets:
        if preprocess:
            raw_tags = re.findall(pattern, t)
            tags = []
            for t in raw_tags:
                if 'covid' in t.lower() or 'virus' in t.lower():
                    t = t[1:].lower().replace("_", "").replace("ー", "")
                    tags.append("#" + t.title())
                else:
                    tags.append(t)
        else:
            tags = re.findall(pattern, t)
        hashtag.append(tags)
    return hashtag

In [3]:
def count_nodes_edges(hashtag_list, keep=20):
    # Count all the nodes
    node_counts_full = Counter()
    for hashtags in hashtag_list:
        for h in hashtags:
            node_counts_full[h] += 1

    # Only keep the top `keep` tags
    top_tags_dict = sorted(node_counts_full.items(), key=lambda x:x[1], reverse=True)[:keep]
    node_counts = {k:v for k,v in top_tags_dict}
    top_tags = list(node_counts.keys())
    
    # Count the edges for top tags
    edge_counts = Counter()
    for hashtags in hashtag_list:
        # Filter hashtags list
        hashtags_filtered = [h for h in hashtags if h in top_tags]
        # Count the edges
        tag_pairs = list(combinations(hashtags_filtered, r=2))
        for a,b in tag_pairs:
            edge_counts[(a,b)] += 1
    return node_counts, edge_counts

In [4]:
def create_json(node_counts, edge_counts):
    # Convert node to json format
    node_json = {'nodes':[]}
    for i, (n,count) in enumerate(node_counts.items()):
        node_json['nodes'].append({'tag_name':n,
                                   'tag_count':count,
                                   'id':i})

    # Create dict that maps tag to id
    node_2_id = {tag_d['tag_name']:tag_d['id'] for tag_d in node_json['nodes']}

    # Convert edges to json format
    edge_json = {'edges':[]}
    for (a,b), count in edge_counts.items():
        source, target = node_2_id[a], node_2_id[b]
        edge_json['edges'].append({'source_tag':a,
                                   'target_tag':b,
                                   'source':source,
                                   'target':target,
                                   'edge_count':count})
    return node_json, edge_json

In [5]:
def save_json(node_json, edge_json, node_path, edge_path):
    with open(node_path, 'w') as outfile:
        json.dump(node_json, outfile)
    with open(edge_path, 'w') as outfile:
        json.dump(edge_json, outfile)

In [6]:
# Read in data
df = pd.read_csv(config.data / 'covid19_tweets_final.csv')
df = df.dropna(subset=['full_text'])
tweets = df['full_text'][50000:60000]

# Create save path
tag_net_path = config.data / 'tag_network'
if not tag_net_path.exists(): 
    tag_net_path.mkdir()
node_path = tag_net_path / 'node.json'
edge_path = tag_net_path / 'edge.json'

# Create nodes and edges json
hashtag_list = extractHashTags(tweets, preprocess=True)
node_counts, edge_counts = count_nodes_edges(hashtag_list, keep=20)
node_json, edge_json = create_json(node_counts, edge_counts)
save_json(node_json, edge_json, node_path, edge_path)

In [7]:
with open(node_path, 'r') as outfile:
    node_json = json.load(outfile)
    
with open(edge_path, 'r') as outfile:
    edge_json = json.load(outfile)

In [8]:
node_json

{'nodes': [{'tag_name': '#Covid19', 'tag_count': 813, 'id': 0},
  {'tag_name': '#Coronavirus', 'tag_count': 213, 'id': 1},
  {'tag_name': '#Covid', 'tag_count': 100, 'id': 2},
  {'tag_name': '#Hydroxychloroquine', 'tag_count': 29, 'id': 3},
  {'tag_name': '#pandemic', 'tag_count': 28, 'id': 4},
  {'tag_name': '#WearAMask', 'tag_count': 26, 'id': 5},
  {'tag_name': '#lockdown', 'tag_count': 25, 'id': 6},
  {'tag_name': '#Trumpvirus', 'tag_count': 19, 'id': 7},
  {'tag_name': '#China', 'tag_count': 17, 'id': 8},
  {'tag_name': '#BREAKING', 'tag_count': 16, 'id': 9},
  {'tag_name': '#auspol', 'tag_count': 16, 'id': 10},
  {'tag_name': '#Trump', 'tag_count': 14, 'id': 11},
  {'tag_name': '#SocialDistancing', 'tag_count': 14, 'id': 12},
  {'tag_name': '#SaveJEE_NEETstudentsPM', 'tag_count': 14, 'id': 13},
  {'tag_name': '#StayHome', 'tag_count': 13, 'id': 14},
  {'tag_name': '#HongKong', 'tag_count': 13, 'id': 15},
  {'tag_name': '#StaySafe', 'tag_count': 12, 'id': 16},
  {'tag_name': '#SAR

In [9]:
edge_json

{'edges': [{'source_tag': '#Covid19',
   'target_tag': '#WearAMask',
   'source': 0,
   'target': 5,
   'edge_count': 5},
  {'source_tag': '#pandemic',
   'target_tag': '#Covid19',
   'source': 4,
   'target': 0,
   'edge_count': 6},
  {'source_tag': '#China',
   'target_tag': '#Covid19',
   'source': 8,
   'target': 0,
   'edge_count': 7},
  {'source_tag': '#Covid',
   'target_tag': '#Covid19',
   'source': 2,
   'target': 0,
   'edge_count': 6},
  {'source_tag': '#Covid19',
   'target_tag': '#Covid19',
   'source': 0,
   'target': 0,
   'edge_count': 28},
  {'source_tag': '#Covid19',
   'target_tag': '#Trumpvirus',
   'source': 0,
   'target': 7,
   'edge_count': 3},
  {'source_tag': '#Coronavirus',
   'target_tag': '#Covid19',
   'source': 1,
   'target': 0,
   'edge_count': 43},
  {'source_tag': '#Coronavirus',
   'target_tag': '#Trumpvirus',
   'source': 1,
   'target': 7,
   'edge_count': 5},
  {'source_tag': '#WearAMask',
   'target_tag': '#StaySafe',
   'source': 5,
   'target'