In [1]:
import pickle
from collections import defaultdict, Counter
from glob import glob

import metis
import networkx as nx
import numpy as np
import pandas as pd

In [2]:
# Load every per-file retweet frame. glob() yields paths in arbitrary,
# filesystem-dependent order, so sort them to keep the order of the loaded
# frames (and of anything built from them) reproducible across machines.
dfs = [pd.read_pickle(p) for p in sorted(glob('data/retweets_16july/*.pkl'))]

In [3]:
# Stack the per-file frames row-wise (axis=0) into a single DataFrame.
df = pd.concat(dfs, axis=0)
# Never runs as written: change the condition to True to write the combined
# frame to data/july.pkl.
if False:
    df.to_pickle('data/july.pkl')

In [None]:
# Sanity check: (rows, columns) of the combined frame.
df.shape

In [None]:
# Row counts per value of the 'hashtag' column, first ten entries
# (value_counts orders by descending count by default).
df['hashtag'].value_counts()[:10]

In [None]:
# Hashtag to analyse, and the rows of `df` whose 'hashtag' equals it exactly.
tag = 'MTVHottest'
hdf = df[df['hashtag'] == tag]

In [None]:
def build_graph_from_df(df):
    """Build an undirected retweet graph from a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'retweeter' and 'retweetee' columns; every row
        contributes one edge between the two values.

    Returns
    -------
    networkx.Graph
        Undirected graph of the retweeter/retweetee pairs. An empty frame
        yields an empty graph.

    Raises
    ------
    KeyError
        If either of the two columns is missing.
    """
    g = nx.Graph()
    # Zip the two columns instead of calling iterrows(): iterrows() builds a
    # full Series for every row, which is needlessly slow for a large frame.
    g.add_edges_from(zip(df['retweeter'], df['retweetee']))
    return g

In [None]:
# Build the retweet graph for the selected hashtag, then cache both the graph
# and the filtered rows under a file name derived from the lower-cased tag.
subg = build_graph_from_df(hdf)
# nx.write_gpickle was removed in NetworkX 3.0. It was a thin wrapper around
# pickle.dump using the highest protocol, so dumping the graph directly writes
# an equivalent pickle and works on both old and new NetworkX releases.
with open('data/{}.gpkl'.format(tag.lower()), 'wb') as gpkl_file:
    pickle.dump(subg, gpkl_file, pickle.HIGHEST_PROTOCOL)
hdf.to_pickle('data/{}.pkl'.format(tag.lower()))

In [None]:
# NOTE(review): describe_graph is defined in a later cell of this notebook, so
# on a fresh kernel this call raises NameError unless that cell was run first.
describe_graph(subg)

In [None]:
# NOTE(review): mid-notebook import; `rwc` is not among the top-cell imports
# and is presumably a project-local module -- confirm it is importable here.
from rwc import controversy_score
# Print the score that controversy_score computes for the hashtag graph.
print('rwc score {}'.format(controversy_score(subg)))

In [None]:
# NOTE(review): this rebinds `df`, replacing the frame concatenated earlier in
# the notebook. The cells below iterate over each row's 'hashtags' value,
# whereas the earlier cells filtered on a 'hashtag' column.
df = pd.read_pickle('data/retweets_test.pkl')

In [None]:
# Group retweets by hashtag: each row is appended, as a plain dict, to the
# list of every hashtag found in that row's 'hashtags' entry.
retweets_by_hashtags = defaultdict(list)
for _idx, row in df.iterrows():
    for hashtag in row['hashtags']:
        retweets_by_hashtags[hashtag].append(row.to_dict())

In [None]:
# Number of retweets recorded under each hashtag.
tag2freq = Counter({
    tag_name: len(rts)
    for tag_name, rts in retweets_by_hashtags.items()
})
# The ten hashtags with the most retweets, most frequent first.
top_tags = [tag_name for tag_name, _count in tag2freq.most_common(10)]
print('Top 10 frequent tags: {}'.format(top_tags))

In [None]:
def build_graph_from_retweets(rts):
    """Build an undirected graph from an iterable of retweet records.

    Each record is indexed by 'retweeter' and 'retweetee'; the two values
    become the endpoints of one edge. Returns the resulting networkx.Graph.
    """
    edges = [(rt['retweeter'], rt['retweetee']) for rt in rts]
    # QUESTION: directed or undirected?
    graph = nx.Graph()
    graph.add_edges_from(edges)
    return graph

In [None]:
def describe_graph(g):
    """Print a short size summary of graph `g`.

    Reports the node count, the edge count, the sizes of the (up to) three
    largest connected components, and each of those sizes divided by the
    total number of nodes. Returns None.
    """
    n_nodes = g.number_of_nodes()
    # Component sizes, largest first; only the first three are reported.
    sizes_desc = sorted((len(cc) for cc in nx.connected_components(g)),
                        reverse=True)
    top_cc_sizes = sizes_desc[:3]
    top_cc_ratios = np.array(top_cc_sizes) / n_nodes
    template = """
    #nodes: {},
    #edges: {},
    size of top-3 component: {}
    ratio of top-3 components: {}
    """
    print(template.format(n_nodes,
                          g.number_of_edges(),
                          top_cc_sizes,
                          top_cc_ratios))

In [None]:
# One retweet graph per top hashtag, keyed by the hashtag itself.
graph_by_hashtag = {
    hashtag: build_graph_from_retweets(retweets_by_hashtags[hashtag])
    for hashtag in top_tags
}

In [None]:
# Print the size summary of every top hashtag's graph.
for h, g in graph_by_hashtag.items():
    print("Hashtag: {}".format(h))
    describe_graph(g)

In [None]:
def largest_connected_subgraph(g):
    """Return the subgraph of `g` induced by its largest connected component.

    When several components share the maximum size, the first one produced
    by nx.connected_components is used.

    Raises
    ------
    ValueError
        If `g` has no connected components (i.e. it is empty). The exception
        type matches what the bare max() call raised before; only the
        message is made explicit.
    """
    nodes = max(nx.connected_components(g), key=len, default=None)
    if nodes is None:
        raise ValueError(
            'cannot take the largest connected component of an empty graph')
    return g.subgraph(nodes)

In [None]:
# For each top hashtag, split the largest connected component of its graph
# into two parts with METIS and report the first returned value (`cuts`,
# presumably the number of edges crossing the partition -- confirm against
# the metis docs) as a fraction of the component's edge count.
for h, g in graph_by_hashtag.items():
    subg = largest_connected_subgraph(g)
    cuts, parts = metis.part_graph(subg, 2)
    cut_ratio = cuts / subg.number_of_edges()
    print("""hashtag: {}
    cut ratio: {}
    """.format(h, cut_ratio))