In [30]:
import networkx as nx
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import metis

In [2]:
# Load the retweet records; later cells read each row's 'hashtags',
# 'retweeter' and 'retweetee' fields.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle('data/retweets_test.pkl')

In [10]:
# Group retweets by hashtag: every hashtag maps to the list of retweet records
# (one plain dict per row) that carry it. A row with several hashtags is
# listed under each of them, each time as a freshly built dict.
retweets_by_hashtags = defaultdict(list)
for row_idx, row in df.iterrows():
    for tag in row['hashtags']:
        record = row.to_dict()
        retweets_by_hashtags[tag].append(record)

In [26]:
# Count how many retweet records mention each hashtag, then keep the ten
# most frequent hashtags for the rest of the analysis.
tag2freq = Counter({tag: len(records)
                    for tag, records in retweets_by_hashtags.items()})
top_tags = [tag for tag, _count in tag2freq.most_common(10)]
print('Top 10 frequent tags: {}'.format(top_tags))

Top 10 frequent tags: ['AMAs', 'ALDUB16thMonthsary', 'ALDUBMomAndDad', 'VoiceSaveAaron', 'TeenWolf', 'ARIASONEDIRECTION', 'MannequinChallenge', 'Trump', 'NoDAPL', 'ARIASJUSTINBIEBER']


In [27]:
def build_graph_from_retweets(rts):
    """Build an undirected retweet graph from an iterable of retweet records.

    Each record must provide the keys ``'retweeter'`` and ``'retweetee'``;
    one edge is added between the two for every record.

    Returns the resulting ``nx.Graph``.
    """
    # TODO: still an open question whether this should be directed instead.
    edges = [(rt['retweeter'], rt['retweetee']) for rt in rts]
    graph = nx.Graph()
    graph.add_edges_from(edges)
    return graph

In [43]:
def describe_graph(g, top_k=3):
    """Print basic size statistics of a graph.

    Writes the number of nodes, the number of edges, and the sizes of the
    ``top_k`` largest connected components to stdout. Component sizes are
    shown both as node counts and as fractions of all nodes in ``g``.

    Parameters
    ----------
    g : graph
        Must be accepted by ``nx.connected_components`` and provide
        ``number_of_nodes()`` and ``number_of_edges()``.
    top_k : int, optional
        Number of largest components to report; expected to be
        non-negative. Defaults to 3, which reproduces the original
        fixed "top-3" report.

    Returns
    -------
    None
    """
    # Only the sizes are needed, so the components are never materialised.
    component_sizes = sorted(map(len, nx.connected_components(g)), reverse=True)
    top_cc_sizes = component_sizes[:top_k]
    # Node counts divided by the total node count give per-component shares.
    top_cc_ratios = np.array(top_cc_sizes) / g.number_of_nodes()
    print("""
    #nodes: {n_nodes},
    #edges: {n_edges},
    size of top-{k} component: {sizes}
    ratio of top-{k} components: {ratios}
    """.format(
            n_nodes=g.number_of_nodes(),
            n_edges=g.number_of_edges(),
            k=top_k,
            sizes=top_cc_sizes,
            ratios=top_cc_ratios
        ))

In [40]:
# One retweet graph per top hashtag, keyed by the hashtag itself.
graph_by_hashtag = dict(
    (tag, build_graph_from_retweets(retweets_by_hashtags[tag]))
    for tag in top_tags)

In [44]:
# General statistics: print the size summary of every hashtag graph.
for hashtag, hashtag_graph in graph_by_hashtag.items():
    print("Hashtag: {}".format(hashtag))
    describe_graph(hashtag_graph)

Hashtag: ALDUB16thMonthsary

    #nodes: 1417,
    #edges: 1535,
    size of top-3 component: [1153, 9, 8]
    ratio of top-3 components: [ 0.8136909   0.00635145  0.00564573]
    
Hashtag: ARIASJUSTINBIEBER

    #nodes: 683,
    #edges: 632,
    size of top-3 component: [404, 8, 8]
    ratio of top-3 components: [ 0.59150805  0.01171303  0.01171303]
    
Hashtag: VoiceSaveAaron

    #nodes: 1473,
    #edges: 1353,
    size of top-3 component: [987, 18, 8]
    ratio of top-3 components: [ 0.6700611   0.01221996  0.00543109]
    
Hashtag: AMAs

    #nodes: 8188,
    #edges: 8910,
    size of top-3 component: [5619, 171, 66]
    ratio of top-3 components: [ 0.68624817  0.02088422  0.00806058]
    
Hashtag: ARIASONEDIRECTION

    #nodes: 1198,
    #edges: 1186,
    size of top-3 component: [771, 18, 10]
    ratio of top-3 components: [ 0.64357262  0.01502504  0.00834725]
    
Hashtag: NoDAPL

    #nodes: 908,
    #edges: 703,
    size of top-3 component: [184, 60, 24]
    ratio of top-3 c

In [37]:
def largest_connected_subgraph(g):
    """Return the subgraph of ``g`` induced by its largest connected component.

    "Largest" is measured by node count. If several components share the
    maximum size, the one produced first by ``nx.connected_components`` wins.
    """
    biggest_component = max(nx.connected_components(g), key=len)
    return g.subgraph(biggest_component)

In [38]:
# For every hashtag graph, split its largest connected component into two
# parts with METIS and print what share of that component's edges the split
# cuts. `cuts` is the first value returned by metis.part_graph (presumably the
# number of cut edges -- confirm against the METIS wrapper docs).
for h, g in graph_by_hashtag.items():
    subg = largest_connected_subgraph(g)
    cuts, parts = metis.part_graph(subg, 2)
    cut_ratio = cuts / subg.number_of_edges()
    print("""hashtag: {}
    cut ratio: {}
    """.format(h, cut_ratio))

hashtag: ALDUB16thMonthsary
    cut ratio: 0.05213613323678494
    
hashtag: ARIASJUSTINBIEBER
    cut ratio: 0.03547671840354767
    
hashtag: VoiceSaveAaron
    cut ratio: 0.01644100580270793
    
hashtag: AMAs
    cut ratio: 0.006196310378819884
    
hashtag: ARIASONEDIRECTION
    cut ratio: 0.03732162458836443
    
hashtag: NoDAPL
    cut ratio: 0.016216216216216217
    
hashtag: ALDUBMomAndDad
    cut ratio: 0.036020583190394515
    
hashtag: MannequinChallenge
    cut ratio: 0.49019607843137253
    
hashtag: Trump
    cut ratio: 0.5081967213114754
    
hashtag: TeenWolf
    cut ratio: 0.33519553072625696
    
