In [20]:
import json
import numpy as np
import pickle as pkl
import re
import pandas as pd
from datetime import datetime
from graph_tool import Graph, GraphView
from graph_tool.topology import label_largest_component
from tqdm import tqdm
from gt_utils import extract_nodes, extract_edges

In [21]:
path = 'data/digg/graph.csv'

In [22]:
df = pd.read_csv(path, header=None, names=['mutual', 'date', 'src', 'tar'])

In [23]:
all_nodes = (set(df['src'].unique()) | set(df['tar'].unique()))

In [24]:
print('the node ids are not consecutive')
len(all_nodes), max(all_nodes)

the node ids are not consecutive


(279631, 336225)

In [25]:
node_index = {n: i for i, n in enumerate(all_nodes)}

In [26]:
g = Graph(directed=False)
g.add_vertex(len(node_index))
edges = set()
for r in tqdm(df.as_matrix(), total=df.shape[0]):
    mutual, _, s, t = r
    s, t = node_index[s], node_index[t]
    edges.add(tuple(sorted((s, t))))


  0%|          | 0/1731658 [00:00<?, ?it/s][A
  1%|          | 11718/1731658 [00:00<00:14, 117177.32it/s][A
  2%|▏         | 27641/1731658 [00:00<00:13, 127260.11it/s][A
  3%|▎         | 43731/1731658 [00:00<00:12, 135775.48it/s][A
  3%|▎         | 60288/1731658 [00:00<00:11, 143510.96it/s][A
  4%|▍         | 77753/1731658 [00:00<00:10, 151620.74it/s][A
  6%|▌         | 98796/1731658 [00:00<00:09, 165496.14it/s][A
  7%|▋         | 120280/1731658 [00:00<00:09, 177741.68it/s][A
  8%|▊         | 141934/1731658 [00:00<00:08, 187837.52it/s][A
  9%|▉         | 161003/1731658 [00:00<00:08, 188678.04it/s][A
 10%|█         | 179851/1731658 [00:01<00:10, 152591.91it/s][A
 11%|█▏        | 196210/1731658 [00:01<00:10, 146357.64it/s][A
 12%|█▏        | 212740/1731658 [00:01<00:10, 151566.93it/s][A
 13%|█▎        | 229744/1731658 [00:01<00:09, 156672.41it/s][A
 14%|█▍        | 246574/1731658 [00:01<00:09, 159986.07it/s][A
 15%|█▌        | 263770/1731658 [00:01<00:08, 163399.24it/s][

In [27]:
g.add_edge_list(edges)

In [28]:
g.save('data/digg/graph.gt')

In [29]:
path = 'data/digg/votes.csv'
df = pd.read_csv(path, header=None, names=['date', 'voter', 'story'])

In [30]:
cascades = df.groupby('story')

In [31]:
cnts = cascades.count()

In [32]:
sorted_story = cnts.sort_values(by='voter', ascending=False)

In [33]:
sorted_story.head(5)

Unnamed: 0_level_0,date,voter
story,Unnamed: 1_level_1,Unnamed: 2_level_1
714,24099,24099
1123,8521,8521
502,8492,8492
2719,8057,8057
2050,7825,7825


In [34]:
sorted_story.tail(5)

Unnamed: 0_level_0,date,voter
story,Unnamed: 1_level_1,Unnamed: 2_level_1
2104,162,162
2919,158,158
797,158,158
60,158,158
2976,122,122


In [35]:
def get_connected_infected_nodes(cascades, idx):
    c = cascades.get_group(idx)
    c['date'] -= c['date'].min()
    c['voter'] = c['voter'].map(lambda k: int(node_index.get(k, -1)))  # some voter is not in the graph, strange
    c = c[c['voter'] >= 0]  # filter them out    
    
    voter2time = dict(zip(c['voter'].as_matrix(), c['date'].as_matrix()))
    infected_nodes = c['voter'].as_matrix()
    
    # extract the most voted story subgraph
    # all nodes in this subgraph voted the story
    vfilt = g.new_vertex_property('bool')
    vfilt.a = False
    for i in infected_nodes:
        vfilt[i] = True
    gv = GraphView(g, vfilt=vfilt, directed=False)        
    
    # get the connected component
    cc_label = label_largest_component(gv)
    infected_nodes_connected = np.nonzero(cc_label.a)[0]
    
    # print('cascade size', sum(cc_label.a))
    
    infection_times = np.ones(g.num_vertices()) * -1
    
    for i in infected_nodes_connected:
        infection_times[i] = voter2time[i]    
        
    return infection_times

In [39]:
indices = sorted_story.index[:50]

In [41]:
infection_times_list = [get_connected_infected_nodes(cascades, idx) for idx in tqdm(indices)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
100%|██████████| 50/50 [01:15<00:00,  1.36s/it]


In [38]:
def cascade_size(l):
    return len((l>=0).nonzero()[0])

In [42]:
list(map(cascade_size, infection_times_list))

[8653,
 2660,
 1946,
 1624,
 1849,
 1748,
 1628,
 1844,
 1387,
 1787,
 2202,
 2627,
 2141,
 1407,
 1454,
 2071,
 1188,
 2030,
 1471,
 1083,
 1165,
 1109,
 1129,
 1268,
 1358,
 1393,
 1414,
 1381,
 1163,
 1209,
 1314,
 961,
 1780,
 1067,
 1772,
 1212,
 1819,
 1219,
 1416,
 1322,
 909,
 1084,
 873,
 1060,
 893,
 1179,
 1077,
 929,
 989,
 1004]

In [173]:
set((infection_times >= 0).nonzero()[0]) == set(infected_nodes_connected)

True

In [45]:
for i, infection_times in enumerate(infection_times_list[1:11]):
    with open('data/digg/cascade_{}.pkl'.format(i), 'wb') as f:
        pkl.dump(infection_times, f)

In [37]:
# FOR SMALL CASCADE
idx2cascade_small = {idx: get_connected_infected_nodes(cascades, idx) for idx in tqdm(sorted_story.tail(50).index)}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.

  2%|▏         | 1/50 [00:00<00:30,  1.62it/s][A
  4%|▍         | 2/50 [00:01<00:28,  1.66it/s][A
  6%|▌         | 3/50 [00:01<00:27,  1.70it/s][A
  8%|▊         | 4/50 [00:02<00:25,  1.80it/s][A
 10%|█         | 5/50 [00:02<00:25,  1.78it/s][A
 12%|█▏        | 6/50 [00:03<00:24,  1.77it/s][A
 14%|█▍        | 7/50 [00:03<00:24,  1.77it/s][A
 16%|█▌        | 8/50 [00:04<00:23

In [40]:
sorted_story.tail(50)

Unnamed: 0_level_0,date,voter
story,Unnamed: 1_level_1,Unnamed: 2_level_1
1883,209,209
2537,209,209
2229,208,208
3299,208,208
3212,208,208
1376,207,207
227,206,206
3426,205,205
3342,205,205
909,205,205


In [41]:
dict(zip(idx2cascade_small.keys(), map(cascade_size, idx2cascade_small.values())))

{23: 155,
 27: 109,
 60: 108,
 72: 145,
 129: 105,
 134: 114,
 158: 32,
 184: 118,
 227: 136,
 308: 131,
 323: 132,
 465: 110,
 765: 129,
 797: 73,
 909: 114,
 966: 137,
 1041: 88,
 1065: 123,
 1172: 112,
 1208: 116,
 1376: 116,
 1490: 127,
 1588: 96,
 1608: 118,
 1680: 122,
 1798: 111,
 1883: 135,
 1983: 113,
 2104: 69,
 2229: 136,
 2345: 104,
 2387: 118,
 2536: 108,
 2537: 133,
 2560: 72,
 2591: 117,
 2645: 150,
 2897: 156,
 2919: 103,
 2976: 90,
 3212: 162,
 3246: 117,
 3285: 144,
 3287: 120,
 3299: 132,
 3342: 143,
 3426: 108,
 3510: 138,
 3544: 86,
 3545: 128}

In [42]:
for i, infection_times in enumerate(idx2cascade_small.values()):
    with open('data/digg/small_cascade_{}.pkl'.format(i), 'wb') as f:
        pkl.dump(infection_times, f)