In [112]:
%matplotlib notebook
# starter code
import networkx as nx
import numpy as np
from graph_tool.all import load_graph, shortest_distance
from joblib import Parallel, delayed
from tqdm import tqdm

from ic import simulate_cascade, observe_cascade, get_gvs, get_o2src_time, get_infection_time
from utils import get_rank_index

# Dataset selection: keep exactly one `gtype` assignment active.
# gtype = 'balanced-tree/2-6'
gtype = 'kr-peri/2-6'
# gtype = 'p2p-gnutella08'

# The same dataset is loaded twice: as a graph-tool graph (`g`) and as a
# networkx graph (`gnx`).
g = load_graph('data/{}/graph.gt'.format(gtype))
gnx = nx.read_graphml('data/{}/graph.graphml'.format(gtype))
# Relabel node ids to ints (read_graphml yields string ids by default).
# `nodes()` is used rather than `nodes_iter()`: the latter was removed in
# networkx 2.0, while `nodes()` is iterable in both 1.x and 2.x.
gnx = nx.relabel_nodes(gnx, {i: int(i) for i in gnx.nodes()})

# N1: number of repeated experiments run further down in the notebook.
# N2: third argument of get_gvs (presumably the number of sampled graphs --
#     TODO confirm against ic.get_gvs).
N1, N2 = 100, 100
# p is forwarded to simulate_cascade / get_gvs and q to observe_cascade
# (presumably infection probability and observation fraction -- TODO confirm).
p, q = 0.5, 0.2

# Built once and reused by every source-likelihood computation below.
gvs = get_gvs(g, p, N2)

def gen_nontrivial_cascade(g, p, q, source=None, min_size=5):
    """Simulate cascades until one is large enough, then observe it.

    `simulate_cascade` is called repeatedly until the resulting array `c`
    has at least `min_size` entries different from -1 (entries equal to -1
    are treated as "not infected"). `observe_cascade` is then called once,
    on the accepted cascade only.

    Parameters
    ----------
    g : graph, passed through to `simulate_cascade`.
    p : passed through to `simulate_cascade`
        (presumably the infection probability -- TODO confirm).
    q : passed through to `observe_cascade`
        (presumably the observation fraction -- TODO confirm).
    source : optional
        Forwarded unchanged to *every* `simulate_cascade` call. With the
        default `None`, each retry therefore leaves the choice of source to
        `simulate_cascade` again instead of reusing the first attempt's.
    min_size : int, default 5
        Minimum number of non -1 entries in `c` for a cascade to be accepted.

    Returns
    -------
    tuple
        `(c, source, obs_nodes)`: the accepted cascade array, the source
        reported by `simulate_cascade`, and the result of `observe_cascade`.
    """
    while True:
        # Bind the simulated source to its own name. Rebinding the `source`
        # parameter here would pin all retries to the first sampled source,
        # which biases the sample and never terminates if that source cannot
        # produce a cascade of the required size.
        sim_source, c = simulate_cascade(g, p, source=source)

        if np.sum(c != -1) >= min_size:  # avoid small cascade
            break

    obs_nodes = observe_cascade(c, sim_source, q, method='uniform')
    return c, sim_source, obs_nodes

In [113]:
# Draw one non-trivial cascade; every node that was not observed is "hidden"
# and is later scored as a candidate source.
c, source, obs_nodes = gen_nontrivial_cascade(g, p, q)
hidden_nodes = np.array(
    list(set(np.arange(g.num_vertices())).difference(set(obs_nodes)))
)

In [111]:
def sll_based_on_infection_without_time(g, gvs, obs_nodes, hidden_nodes):
    """Score hidden nodes as candidate sources using infection status only.

    For every candidate `s` in `hidden_nodes` and every `gv` in `gvs`,
    `get_infection_time(gv, s)` is reduced to a boolean "infected" vector
    (entry != -1); infection *times* are otherwise ignored. The per-`gv`
    score is the fraction of `obs_nodes` that are infected in that vector,
    and the candidate's final score is the sum of those fractions over all
    `gvs`.

    Parameters
    ----------
    g : graph exposing `num_vertices()`.
    gvs : iterable of objects accepted by `get_infection_time`.
    obs_nodes : sequence of int node indices that were observed.
    hidden_nodes : iterable of int node indices to score.

    Returns
    -------
    numpy.ndarray
        Array of length `g.num_vertices()`; entry `s` holds the score for
        each `s` in `hidden_nodes`, all other entries are 0. If `obs_nodes`
        is empty there is no evidence to match and all scores are 0.
    """
    n_nodes = g.num_vertices()
    sll = np.zeros(n_nodes)

    n_obs = len(obs_nodes)
    if n_obs == 0:
        # Nothing observed: avoid dividing by zero, every candidate scores 0.
        return sll

    # Loop-invariant mask of the observed nodes.
    observed = np.zeros(n_nodes, dtype=bool)
    observed[obs_nodes] = True

    for s in hidden_nodes:
        matching_fraction = 0.
        for gv in gvs:
            # Derive a fresh boolean array instead of overwriting the array
            # returned by the helper, which may be shared or cached.
            infected = np.asarray(get_infection_time(gv, s)) != -1
            f = np.count_nonzero(infected & observed) / float(n_obs)
            assert f <= 1
            matching_fraction += float(f)
        sll[s] = matching_fraction
    return sll
        

In [106]:
# Score every hidden node as a candidate source for the cascade sampled above.
sll = sll_based_on_infection_without_time(g, gvs, obs_nodes, hidden_nodes)

# Rank index of the true source among all scores (the ordering convention is
# defined by utils.get_rank_index, which is not visible in this notebook).
print(get_rank_index(sll, source))
# Cell output: the true source's score, followed by all scores sorted ascending.
sll[source], np.sort(sll)

10


(10.5, array([  0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
          0. ,   0. ,   0. ,   0. ,   0. ,   0.5,   0.5,   0.5,   0.5,
          0.5,   0.5,   1. ,   1. ,   1.5,   1.5,   2. ,   3. ,   3. ,
          3.5,   3.5,   3.5,   4. ,   4.5,   5. ,   5. ,   5.5,   6. ,
          6. ,   6.5,   7. ,   8. ,   8.5,   9. ,   9. ,  10.5,  12. ,
         13. ,  13.5,  14.5,  16.5,  22. ,  27.5,  28.5,  30.5,  44.5]))

In [114]:
def one_run(g, p, q, gvs):
    """Run a single experiment and return the rank index of the true source.

    A non-trivial cascade is generated on `g` with parameters `p` and `q`,
    every node outside the observed set is scored with
    `sll_based_on_infection_without_time` using `gvs`, and the result of
    `get_rank_index` for the true source is returned.
    """
    _, true_source, observed = gen_nontrivial_cascade(g, p, q)

    # Candidates are all nodes that were not observed.
    every_node = set(np.arange(g.num_vertices()))
    unobserved = np.array(list(every_node - set(observed)))

    scores = sll_based_on_infection_without_time(g, gvs, observed, unobserved)
    return get_rank_index(scores, true_source)

# Repeat the experiment N1 times; n_jobs=-1 asks joblib to use all CPUs.
# `ranks` collects one rank index (the return value of one_run) per run.
# NOTE(review): tqdm wraps the task generator, so the bar likely tracks task
# dispatch rather than task completion -- confirm if exact progress matters.
# NOTE(review): no random seed is set; depending on the joblib backend and on
# which RNG simulate_cascade uses, workers may share RNG state -- verify that
# runs are independent.
ranks = Parallel(n_jobs=-1)(delayed(one_run)(g, p, q, gvs) for i in tqdm(range(N1)))

100%|██████████| 100/100 [00:58<00:00,  1.67it/s]


In [115]:
# Cell output: (mean, median) of the true source's rank index over all runs.
np.mean(ranks), np.median(ranks)

(22.420000000000002, 23.0)