In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 2 re-imports edited modules before each
# cell runs, so changes to the local helper modules are picked up live.
%load_ext autoreload
%autoreload 2

import random
from collections import defaultdict
from datetime import timedelta, datetime

import metis
import networkx as nx
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from matplotlib import pyplot as plt
from scipy.stats import pearsonr
from tqdm import tqdm

from rwc import controversy_score
from util import tw_list, get_cut_ratio, add_edges, remove_edges
from leopard import update_partition



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the `retweets` data from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
retweets = pd.read_pickle('data/july.pkl')

In [4]:
# Order the rows chronologically by their 'created_at' value (mutates `retweets`).
retweets.sort_values(by='created_at', axis=0, inplace=True)

In [5]:
# Notebook-wide settings; later cells read these names (some cells rebind T_window).
DEBUG = False
incremental = True
update_interval = timedelta(seconds=10 * 60)  # ten minutes
T_window = timedelta(hours=12)
top_node_percent = 0.01


In [6]:
# Named module-level factory, kept to prevent pickling errors in multiprocessing.
def defaultdict_using_list_func():
    """Return a new, empty ``defaultdict`` whose missing keys default to ``[]``."""
    list_backed = defaultdict(list)
    return list_backed

def earlist_date_func():
    """Return the fixed naive ``datetime`` 1970-01-01 00:00:00."""
    return datetime(year=1970, month=1, day=1)

In [None]:
from simulation import run_simulation

# Single trial run over the sorted retweets; keeps the whole return value in
# `result` (graphs are requested via return_graph=True).
result = run_simulation(
    retweets,
    T_window=T_window,
    top_node_percent=top_node_percent,
    update_interval=update_interval,
    incremental=True,
    top_k=10,
    top_k_computation_interval=timedelta(minutes=20),
    min_rwc_score=0.8,
    head_n=12e5,  # float 1200000.0, same value as 12*1e5
    return_graph=True,
)
# print((result[1]['end_time'] - result[1]['start_time']).seconds)

77935it [01:00, 723.55it/s]

In [None]:
# Build a (tag x log-entry) matrix of event sizes from the top-k log and draw
# it as a stacked area chart, one layer per tag.
top_k_log = result[1]
# Sort the distinct tags so row order — and with it the colour and legend
# assignment — is identical on every run. Iterating a plain set of strings can
# give a different order in each interpreter session.
tags = sorted({item['tag'] for entry in top_k_log for item in entry['list']})

# N = number of log entries (x axis), M = number of distinct tags (layers).
N, M = len(top_k_log), len(tags)

tag2id = {t: i for i, t in enumerate(tags)}
# Despite the name, the cells hold item['size']; entries where a tag is absent stay 0.
rwc_mat = np.zeros((M, N))
X = np.arange(N)

for i, log in enumerate(top_k_log):
    for item in log['list']:
        rwc_mat[tag2id[item['tag']], i] = item['size']

# 21 fixed colours; matplotlib cycles through them if there are more tags.
colors = "#aec7e8 #ff7f0e #ffbb78 #2ca02c #98df8a #d62728 #ff9896 #9467bd #c5b0d5 #8c564b #c49c94 #e377c2 #f7b6d2 #7f7f7f #c7c7c7 #bcbd22 #dbdb8d #17becf #9edae5 #1f77b4 #FFD000".split()
# colors = random.sample(all_colors, len(tags))
fig = plt.figure(figsize=(20, 13))
ax = fig.add_subplot(111)
ax.stackplot(X, rwc_mat, baseline='wiggle', colors=colors)
# Legend labels are matched to the layers in row order, i.e. sorted tag order.
ax.legend(tags, loc='upper left', ncol=3, fontsize=15.5)
# ax.set_ylim([-1250, 1500])
ax.set_title('Sizes of controversial events over time')
fig.savefig('figs/events_stacked_area.pdf')

In [None]:
# Experiment: running time vs. update interval, for the incremental and the
# from-scratch variants (incremental=True / False).
# NOTE(review): unlike the earlier single run, no data argument is passed to
# run_simulation here — confirm it has a usable default.
# NOTE(review): this rebinds the notebook-level T_window to one hour.
update_intervals = [timedelta(minutes=m) for m in (4, 8, 16, 32)]
T_window = timedelta(hours=1)

# One delayed call per (incremental, update_interval) pair, incremental=True first.
timing_jobs = (
    delayed(run_simulation)(
        update_interval=update_interval,
        T_window=T_window,
        top_node_percent=top_node_percent,
        incremental=incremental,
    )
    for incremental in (True, False)
    for update_interval in update_intervals
)
results = Parallel(n_jobs=8)(timing_jobs)



In [None]:
# Tabulate the first element of every result (one record per run) and derive
# the elapsed time and throughput of each run.
df = pd.DataFrame.from_records([r[0] for r in results])
# total_seconds() is the full elapsed time. The .seconds attribute is only the
# seconds component: it drops whole days and truncates fractions of a second.
df['seconds'] = (df['end_time'] - df['start_time']).apply(lambda t: t.total_seconds())
# Express the update interval in minutes for plotting.
df['update_interval'] = df['update_interval'].apply(lambda i: i.total_seconds() / 60)
# NOTE(review): 1e5 is presumably the number of tweets processed per run — confirm.
df['throughput'] = 1e5 / df['seconds']
df

In [None]:
# Split the timing table by method and plot throughput against update interval.
leopard = df[df['incremental'] == True]
# Named `from_scratch` rather than `metis` so that the imported `metis` module
# is not overwritten by a DataFrame.
from_scratch = df[df['incremental'] == False]

plt.plot(leopard['update_interval'], leopard['throughput'], '-o')
plt.plot(from_scratch['update_interval'], from_scratch['throughput'], '-*')
# Legend entries follow the plotting order above.
plt.legend(['Incremental', 'From scratch'], loc='upper left')
plt.xlabel('Update interval (minutes)')
plt.ylabel('Throughput (#tweets / second)')
plt.ylim([100, 1800])
plt.title('Throughput vs update interval')
plt.savefig('figs/throughput_comparison.pdf')

In [None]:
# Experiment: controversy (RWC) scores from the incremental and the
# non-incremental runs, with per-run logs requested via return_log=True.
# NOTE(review): as in the timing sweep, no data argument is passed to
# run_simulation — confirm it has a usable default.
# NOTE(review): this rebinds `results`, `update_intervals` and T_window.
update_intervals = [timedelta(minutes=m) for m in (5, 10, 15, 20, 25)]
T_window = timedelta(hours=1)

# One delayed call per (incremental, update_interval) pair, incremental=True first.
rwc_jobs = (
    delayed(run_simulation)(
        update_interval=update_interval,
        T_window=T_window,
        top_node_percent=top_node_percent,
        incremental=incremental,
        return_log=True,
    )
    for incremental in (True, False)
    for update_interval in update_intervals
)
results = Parallel(n_jobs=8)(rwc_jobs)



In [None]:
# Collect, per (incremental, update_interval) setting, the concatenation of the
# 'rwc' lists of every hashtag, then put one setting per row in a DataFrame.
key1, key2 = 'incremental', 'update_interval'
rows = defaultdict(list)
# The hashtag set of the first result is used for every run.
hashtags = list(results[0][2])
for run in results:
    run_params = run[0]
    run_log = run[2]
    for tag in hashtags:
        rows[(run_params[key1], run_params[key2])].extend(run_log[tag]['rwc'])
# Unzip the (incremental, update_interval) keys into two parallel tuples.
incs, upds = zip(*rows)
rwcs = list(rows.values())
df = pd.DataFrame({
    'incremental': incs,
    'update_interval': upds,
    'rwcs': rwcs,
})

In [None]:
# For every update interval, correlate the RWC series of the rows sharing that
# interval (the unpacking below expects exactly two rows per group).
pcr = df.groupby('update_interval').apply(
    lambda group: pearsonr(*group['rwcs'])[0]
)
minutes = pcr.index.map(lambda m: m.seconds/60)
corrs = pcr.values

plt.plot(minutes, corrs, '-o')
plt.title('Pearson correlation coefficient vs update interval')
plt.xlabel('update interval (minutes)')
plt.ylabel('Correlation coefficient')
# Pad the x-range by one minute on each side of the plotted points.
shortest, longest = min(minutes), max(minutes)
plt.xlim([shortest - 1, longest + 1])
plt.ylim([0, 1.0])
plt.savefig('figs/corrs-vs-update-interval.pdf')

In [None]:
# RWC lists of the 5-minute runs: r1 = incremental, r2 = from scratch.
five_minute_rows = df['update_interval'] == timedelta(minutes=5)
r1 = df.loc[(df['incremental'] == True) & five_minute_rows, 'rwcs'].iloc[0]
r2 = df.loc[(df['incremental'] == False) & five_minute_rows, 'rwcs'].iloc[0]

In [None]:
# Scatter the incremental scores (x) against the from-scratch scores (y).
plt.scatter(r1, r2)
plt.title('RWC score from two approaches')
plt.xlabel('incremental')
plt.ylabel('from-scratch')
# Both axes are restricted to the same 0.5–1.0 range.
plt.xlim([0.5, 1.0])
plt.ylim([0.5, 1.0])
plt.savefig('figs/ic-fs-rwc-scatter.pdf')

In [None]:
# The RWC score evolution of the incremental and the from-scratch method.
# Pick the hashtag with the longest 'rwc' list in the first result's log.
first_log = results[0][2]
h = max(first_log.keys(), key=lambda tag: len(first_log[tag]['rwc']))
print(h)
scores = {}
for r in results:
    # Only the 5-minute update-interval runs are compared.
    if r[0]['update_interval'] != timedelta(minutes=5):
        continue
    k = 'incremental' if r[0]['incremental'] else 'from scratch'
    scores[k] = r[2][h]['rwc']

In [None]:
# Plot each method's RWC series against its sample index.
# plt.plot(v) uses 0..len(v)-1 as x by default, so this cell does not depend
# on numpy being imported.
for k, v in scores.items():
    # Label each line directly so the legend cannot fall out of step with the data.
    plt.plot(v, label=k)
plt.legend(loc='lower right')

In [None]:
# NOTE(review): the right-hand side is an empty tuple, so unpacking it into four
# names always raises ValueError as written. A run_simulation return value was
# presumably meant here — confirm which one, and the order of its elements.
tag2log, running_stat, top_hashtags_by_time, tag2g = ()
# Pick the key of tag2g whose graph has the most nodes.
hottest_h = max(tag2g, key=lambda k: tag2g[k].number_of_nodes())

In [None]:
# Graph and log entry stored under the hashtag selected above.
g, log = tag2g[hottest_h], tag2log[hottest_h]

In [None]:
# Turn the selected hashtag's log into a DataFrame (rebinds `df`).
df = pd.DataFrame(log)

In [None]:
# 'rwc' column against 'time', with the y-axis limited to 0.5–1.
df.plot.line(x='time', y='rwc', ylim=[0.5, 1])

In [None]:
# 'graph_size' column against 'time'.
df.plot.line(x='time', y='graph_size')

In [None]:
# 'cut_ratio' column against 'time'.
df.plot.line(x='time', y='cut_ratio')