In [93]:
import json
import numpy as np
import pickle as pkl
import re
import pandas as pd
from datetime import datetime
from graph_tool import Graph, GraphView
from graph_tool.topology import label_largest_component
from tqdm import tqdm
from gt_utils import extract_nodes, extract_edges

In [94]:
path = 'data/digg/graph.csv'

In [95]:
df = pd.read_csv(path, header=None, names=['mutual', 'date', 'src', 'tar'])

In [96]:
all_nodes = (set(df['src'].unique()) | set(df['tar'].unique()))

In [97]:
node_index = {n: i for i, n in enumerate(all_nodes)}

In [98]:
g = Graph(directed=False)
g.add_vertex(len(node_index))
edges = set()
for r in tqdm(df.as_matrix(), total=df.shape[0]):
    mutual, _, s, t = r
    s, t = node_index[s], node_index[t]
    edges.add(tuple(sorted((s, t))))

100%|██████████| 1731658/1731658 [00:13<00:00, 127632.62it/s]


In [99]:
g.add_edge_list(edges)

In [100]:
g.save('data/digg/graph.gt')

In [101]:
path = 'data/digg/votes.csv'
df = pd.read_csv(path, header=None, names=['date', 'voter', 'story'])

In [102]:
cascades = df.groupby('story')

In [103]:
cnts = cascades.count()

In [104]:
cnts.sort_values(by='voter', ascending=False).head(5)

Unnamed: 0_level_0,date,voter
story,Unnamed: 1_level_1,Unnamed: 2_level_1
714,24099,24099
1123,8521,8521
502,8492,8492
2719,8057,8057
2050,7825,7825


In [105]:
c1 = cascades.get_group(714)

In [106]:
c1['date'] -= c1['date'].min()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [107]:
c1['voter'] = c1['voter'].map(lambda k: int(node_index.get(k, -1)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [108]:
c1 = c1[c1['voter'] >= 0]

In [109]:
voter2time = dict(zip(c1['voter'].as_matrix(), c1['date'].as_matrix()))

In [161]:
infected_nodes = c1['voter'].as_matrix()

In [126]:
# extract the most voted story subgraph
# all nodes in this subgraph voted the story
vfilt = g.new_vertex_property('bool')
vfilt.a = False
for i in infected_nodes:
    vfilt[i] = True
gv = GraphView(g, vfilt=vfilt, directed=False)    

In [166]:
cc_label = label_largest_component(gv)
infected_nodes_connected = np.nonzero(cc_label.a)[0]

In [158]:
print('cascade size', sum(cc_label.a))

cascade size 8653


In [159]:
infection_times = np.ones(g.num_vertices()) * -1

In [172]:
for i in infected_nodes_connected:
    infection_times[i] = voter2time[i]

In [173]:
set((infection_times >= 0).nonzero()[0]) == set(infected_nodes_connected)

True

In [175]:
with open('data/digg/cascade.pkl', 'wb') as f:
    pkl.dump(infection_times, f)