In [None]:
import networkx as nx
import numpy as np
import random
import math
from synthetic_data import add_p_and_delta, load_data_by_gtype
from network_stat import get_stat
from graph_tool.all import load_graph, GraphView, shortest_distance, pseudo_diameter
from tqdm import tqdm

In [None]:
# Largest int32 value. The distance arrays returned by shortest_distance are
# compared against this sentinel further down to find vertices that were not
# reached.
maxint = np.iinfo(np.int32).max

In [None]:
# Load the graph that every simulation in this notebook runs on.
g = load_graph('data/p2p-gnutella08/graph.gt')

In [None]:
# Probability that an edge is kept active; this is the same value (0.7) that
# get_gvs and simulate_cascade receive as their `p` argument below.
p = 0.7

In [None]:
# Number of randomly edge-filtered graph views to pre-sample with get_gvs.
K = 100

In [None]:
def get_gvs(g, p, K):
    """Build `K` randomly edge-filtered views of `g`.

    A (K, num_edges) matrix of uniform random numbers is drawn in a single
    call and thresholded at `p`; row `i` of the resulting boolean matrix is
    used as the edge filter of the i-th `GraphView`.

    Parameters
    ----------
    g : graph object exposing `num_edges()` and `new_edge_property()`
    p : float
        Threshold: an edge is kept in a view when its random draw is <= p.
    K : int
        Number of views to create.

    Returns
    -------
    list
        `K` `GraphView` objects over `g`, in row order.
    """
    keep = np.random.random((K, g.num_edges())) <= p

    def _make_view(row_mask):
        # One fresh boolean edge property per view, filled from one mask row.
        efilt = g.new_edge_property('bool')
        efilt.set_2d_array(row_mask)
        return GraphView(g, efilt=efilt)

    return [_make_view(keep[i, :]) for i in range(K)]

In [None]:
# Pre-sample K edge-filtered views of g. Uses the notebook-level probability
# `p` rather than repeating the literal 0.7, so the setting lives in one place.
gvs = get_gvs(g, p, K)

In [None]:
def activate_edges_by_p(g, p):
    """Randomly filter the edges of `g` in place and return `g`.

    One uniform random number is drawn per edge; edges whose draw is <= p stay
    active. The resulting boolean edge property is installed on `g` with
    `set_edge_filter`, so the caller's graph object itself is modified.
    """
    keep_mask = np.random.random(g.num_edges()) <= p
    edge_filter = g.new_edge_property('bool')
    edge_filter.set_2d_array(keep_mask)
    g.set_edge_filter(edge_filter)
    return g

In [None]:
def test_activate_edge_by_p(g):
    """Check that activate_edges_by_p(g, 0.7) keeps roughly 70% of the edges.

    Any edge filter on `g` is cleared before the measurement and again
    afterwards, so `g` is left unfiltered when the check passes.
    """
    g.set_edge_filter(None)
    total_edges = g.num_edges()
    kept_edges = activate_edges_by_p(g, 0.7).num_edges()
    # decimal=1 keeps the comparison coarse; the kept fraction is random.
    np.testing.assert_almost_equal(kept_edges / total_edges, 0.7, decimal=1)
    g.set_edge_filter(None)

In [None]:
def simulate_cascade(g, p, source=None):
    """Simulate one cascade on `g` and return its source and infection times.

    The edges of `g` are randomly filtered with `activate_edges_by_p(g, p)`,
    shortest distances from the source are computed on the filtered graph, and
    the edge filter is removed again before returning.

    Parameters
    ----------
    g : graph
        Modified temporarily (edge filter); any filter is cleared on exit.
    p : float
        Probability passed to `activate_edges_by_p`.
    source : int, optional
        Index of the source vertex; drawn uniformly at random from all
        vertices when None.

    Returns
    -------
    (source, dist) : tuple
        `source` is the vertex index used; `dist` is an np.ndarray over the
        vertices holding the infection time, with -1 for uninfected nodes.
    """
    if source is None:
        source = random.choice(np.arange(g.num_vertices(), dtype=int))

    try:
        activate_edges_by_p(g, p)
        dist = shortest_distance(g, source=g.vertex(source)).a
    finally:
        # Always drop the temporary filter, even if the steps above raise;
        # otherwise the shared graph would stay filtered for later callers.
        g.set_edge_filter(None)

    # Entries equal to the int32 maximum are rewritten to -1 (uninfected).
    dist[dist == maxint] = -1
    return source, dist

In [None]:
# Run one example cascade with the notebook-level probability `p` (instead of
# a repeated literal 0.7); `c` holds per-node infection times, -1 = uninfected.
source, c = simulate_cascade(g, p)

In [None]:
# Smallest and largest entry of the infection-time array (-1 marks uninfected
# nodes, so the minimum is -1 whenever some node was not reached).
c.min(), c.max()

In [None]:
# Number of infected nodes, i.e. entries of `c` different from -1.
c[c != -1].shape[0]

In [None]:
def observe_cascade(c, q, method='uniform'):
    """Select a fraction `q` of the infected nodes of a cascade as observations.

    Parameters
    ----------
    c : np.ndarray
        Per-node infection time; -1 marks a node that was never infected.
    q : float
        Fraction of infected nodes to observe. The number of observations is
        ceil(q * number_of_infected_nodes).
    method : {'uniform', 'late'}
        'uniform' draws the observed nodes uniformly at random among the
        infected ones; 'late' takes the nodes with the largest infection times.

    Returns
    -------
    np.ndarray
        Indices of the observed nodes; empty when there is nothing to observe.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names.
    """
    all_infection = np.nonzero(c != -1)[0]
    num_obs = int(math.ceil(all_infection.shape[0] * q))
    if method == 'uniform':
        return np.random.permutation(all_infection)[:num_obs]
    elif method == 'late':
        order = np.argsort(c)
        # Slice from an explicit start index: `order[-num_obs:]` would return
        # the WHOLE array when num_obs == 0, because -0 == 0.
        start = max(order.shape[0] - num_obs, 0)
        return order[start:]
    # Previously an unknown method fell through and silently returned None.
    raise ValueError(
        "unknown method {!r}; expected 'uniform' or 'late'".format(method))

In [None]:
# Compare the range of infection times among the observed nodes for the two
# observation strategies ('uniform' is observe_cascade's default method).
for method in ('uniform', 'late'):
    obs = observe_cascade(c, 0.01, method)
    print(c[obs].min(), c[obs].max())

In [None]:
def get_o2src_time(obs_nodes, gvs):
    """Map each observed node to its distance arrays across all graph views.

    For every node `o` in `obs_nodes`, `shortest_distance(gv, source=o)` is
    evaluated on each view in `gvs`, and the per-view distance arrays are
    stacked into one np.ndarray with one row per view.

    NOTE(review): distances are measured starting from the observed node. On a
    directed graph that is not the same as the distance from a candidate
    source to the observer -- confirm this is the intended direction.

    Returns
    -------
    dict
        observed node -> np.ndarray of stacked distance arrays.
    """
    return {
        o: np.array([shortest_distance(view, source=o).a for view in gvs])
        for o in obs_nodes
    }

In [None]:
import itertools
def source_likelihood_drs(g, obs_nodes, 
                          o2src_time,
                          infection_times,
                          source=None,
                          debug=False,
                          eps=1e-3,
                          nan_proba=1e-3):
    """Score every node of `g` as a candidate cascade source.

    For each unordered pair of observed nodes (o1, o2), the difference of
    their observed infection times is compared with the difference of their
    simulated distances to every node. The per-node "reward" is the fraction
    of simulations in which the two differences agree, counted only over the
    simulations where that node is reachable from both observers. Rewards of
    all pairs are multiplied together (after adding `eps`) and the running
    product is renormalised after each pair.

    Parameters
    ----------
    g : graph exposing `num_vertices()`
    obs_nodes : iterable of int
        Observed node indices.
    o2src_time : dict
        observed node -> array indexed [simulation, node] of distances, as
        built by `get_o2src_time`; entries equal to `maxint` are treated as
        "not reached".
    infection_times : indexable by node
        Observed infection time of each node.
    source : int, optional
        True source; only used for the debug printout.
    debug : bool
        Print per-pair diagnostics.
    eps : float
        Added to every reward so that a single zero cannot wipe out a node.
    nan_proba : float
        Reward used for nodes that are unreachable in every simulation
        (0/0 case).

    Returns
    -------
    np.ndarray
        One float64 score per node. The scores sum to 1 whenever at least one
        pair of observations exists; with fewer than two observed nodes the
        array of ones is returned unchanged.
    """
    num_nodes = g.num_vertices()

    source_likelihood = np.ones(num_nodes, dtype=np.float64)
    obs_nodes = list(obs_nodes)

    for o1, o2 in itertools.combinations(obs_nodes, 2):
        t1, t2 = infection_times[o1], infection_times[o2]

        dists1, dists2 = o2src_time[o1], o2src_time[o2]
        # Only simulations in which the node is reached from BOTH observers
        # can support or contradict the observed time difference.
        mask = np.logical_and(dists1 != maxint, dists2 != maxint)
        counts = mask.sum(axis=0)
        matches = (((dists1 - dists2) == (t1 - t2)) & mask).sum(axis=0)
        # counts == 0 gives 0/0 = NaN on purpose; it is replaced right below,
        # so silence the floating-point warning for this one division.
        with np.errstate(divide='ignore', invalid='ignore'):
            probas = matches / counts
        probas[np.isnan(probas)] = nan_proba

        if debug:
            print('t1={}, t2={}'.format(t1, t2))
            if source is not None:
                # probas[None] is a 2-d array and cannot be formatted as .2f
                print('source reward: {:.2f}'.format(probas[source]))
            print('obs reward: {}'.format([probas[obs] for obs in set(obs_nodes)-{source}]))

        source_likelihood *= (probas + eps)
        source_likelihood /= source_likelihood.sum()
    return source_likelihood

In [None]:
# One cascade plus a uniform 1% sample of its infected nodes as observations.
# The probability comes from the notebook-level `p` rather than a repeated
# literal 0.7.
source, c = simulate_cascade(g, p)
obs_nodes = observe_cascade(c, 0.01, 'uniform')
print(source, len(obs_nodes))

In [None]:
# Five evenly spaced observation fractions between 0.1% and 1%; each is passed
# as `q` to observe_cascade in the experiment loop below.
qs = np.linspace(0.001, 0.01, 5)

In [None]:
# For every observation fraction q, run 50 independent cascades and record the
# likelihood the estimator assigns to the true source. `rows` ends up with one
# list per q, each holding one value per repetition.
rows = []
for q in tqdm(qs):
    row = []
    for _ in tqdm(range(50)):
        # Use the notebook-level probability `p` (instead of a repeated 0.7) so
        # the cascades use the same value the views in `gvs` were sampled with.
        source, c = simulate_cascade(g, p)
        obs_nodes = observe_cascade(c, q, 'uniform')
        o2src_time = get_o2src_time(obs_nodes, gvs)
        sll = source_likelihood_drs(g, obs_nodes,
                            o2src_time,
                            infection_times=c,
                            source=None,
                            debug=False,
                            eps=1e-3,
                            nan_proba=1e-3)
        row.append(sll[source])
    rows.append(row)

In [None]:
# `rows` has one row per q and one column per repetition, so the repetitions
# must be averaged along axis=1 to get one mean true-source likelihood per q
# (axis=0 would mix results from different q values).
m = np.array(rows)
m.mean(axis=1)

In [None]:
# Re-run the estimator on the variables last assigned above (c, obs_nodes,
# o2src_time); when the cells are run top to bottom these come from the final
# iteration of the experiment loop.
sll = source_likelihood_drs(g, obs_nodes,
                            o2src_time,
                            infection_times=c,
                            source=None,
                            debug=False,
                            eps=1e-3,
                            nan_proba=1e-3)

In [None]:
# Highest score over all nodes versus the score given to the true source.
print(sll.max())
print(sll[source])

In [None]:
# Pseudo-diameter of the full graph, shown for scale next to the distance
# printed in the following cell.
pseudo_diameter(g)

In [None]:
# Take the highest-scoring node as the inferred source and print its
# shortest-path distance to the true source on the unfiltered graph.
inferred_source = np.argmax(sll)
print(shortest_distance(g, source=inferred_source, target=source))