In [1]:
%matplotlib inline

import random

import networkx as nx
import numpy as np
import pandas as pd

from cascade import generate_cascade
from graph_generator import kronecker_random_graph

In [2]:
# 2x2 matrix handed to `kronecker_random_graph` as its `P` argument.
# NOTE(review): the name suggests a core-periphery initiator — confirm.
P_peri = np.array([
    [0.9, 0.1],
    [0.1, 0.3],
])

In [3]:
def run_simulation(P, k, n_edges, percentage=0.2, min_cascade_size=10, max_attempts=100):
    """Simulate one cascade and count the queries a greedy backward search needs.

    A graph is built with ``kronecker_random_graph`` and restricted to its
    largest connected component.  Cascades are generated on it until one
    infects more than ``min_cascade_size`` nodes.  A random fraction of the
    infected nodes (never the source) is revealed as the initial observation.
    Starting from the earliest observed node, the search repeatedly queries
    the not-yet-observed neighbours of the current node and moves to the
    earliest neighbour that was infected strictly before it.

    Parameters
    ----------
    P : passed through to ``kronecker_random_graph`` as its second argument.
    k : passed through to ``kronecker_random_graph`` as its first argument.
    n_edges : passed through to ``kronecker_random_graph`` as ``n_edges``.
    percentage : float, default 0.2
        Fraction of the infected non-source nodes that are initially observed.
    min_cascade_size : int, default 10
        A cascade is accepted only if it has strictly more entries than this.
    max_attempts : int, default 100
        Maximum number of cascades to generate before giving up.

    Returns
    -------
    tuple
        ``(g, infected_times, query_n)``: the graph the cascade ran on, the
        mapping returned by ``generate_cascade`` (node -> infection time), and
        the total number of neighbour queries issued by the search.

    Raises
    ------
    RuntimeError
        If no cascade larger than ``min_cascade_size`` is produced within
        ``max_attempts`` tries.
    ValueError
        If ``percentage`` is so small that no node is observed.
    AssertionError
        If the search stops at a node other than the true source.
    """
    g = kronecker_random_graph(k, P, directed=False, n_edges=n_edges)
    # Keep only the largest connected component.
    g = g.subgraph(max(nx.connected_components(g), key=len))

    for _ in range(max_attempts):
        infected_times = generate_cascade(g)
        if len(infected_times) > min_cascade_size:
            break
    else:
        # Previously a too-small cascade was silently used after the last try.
        raise RuntimeError(
            'no cascade with more than {} infected nodes after {} attempts'.format(
                min_cascade_size, max_attempts))

    # The source is the node with the smallest infection time.
    source = min(infected_times, key=infected_times.get)
    infected_minus_source = set(infected_times) - {source}

    # random.sample() rejects sets on Python >= 3.11, so sample from a list.
    n_observed = int(len(infected_minus_source) * percentage)
    observed = {n: infected_times[n]
                for n in random.sample(list(infected_minus_source), n_observed)}
    assert source not in observed
    if not observed:
        raise ValueError(
            'percentage={} leaves no observed nodes out of {} candidates'.format(
                percentage, len(infected_minus_source)))

    # BFS-style searching: always stand on the earliest infection seen so far.
    current_node = min(observed, key=observed.get)
    query_n = 0
    while current_node != source:  # to be fair with pagerank approach
        queries = [n for n in g.neighbors(current_node)
                   if n not in observed]
        query_n += len(queries)

        # Queried neighbours that turn out to be infected.
        newly_found_infections = {q: infected_times[q]
                                  for q in queries
                                  if infected_times.get(q) is not None}
        current_time = infected_times[current_node]
        newly_found_earlier_infections = [q
                                          for q, t in newly_found_infections.items()
                                          if t < current_time]
        if not newly_found_earlier_infections:
            break

        observed.update(newly_found_infections)
        current_node = min(newly_found_earlier_infections, key=observed.get)

    assert current_node == source, '{}({}) !== {}({})'.format(current_node, infected_times[current_node],
                                                              source, infected_times[source])
    return g, infected_times, query_n

In [4]:
from joblib import Parallel, delayed

# Queue 100 independent simulations (k=8, 512 edges) and run them on 8 workers.
# Each entry of `rows` is the (g, infected_times, query_n) tuple of one run.
jobs = [delayed(run_simulation)(P_peri, 8, 512) for _ in range(100)]
rows = Parallel(n_jobs=8)(jobs)


In [5]:
# One record per simulation: (cascade size, number of queries issued).
data = [(len(infected_times), query_n) for _, infected_times, query_n in rows]
df = pd.DataFrame(data, columns=['csize', 'qsize'])

In [6]:
# Descriptive statistics of the cascade size, the query count and their ratio.
cs = df['csize'].describe()
qs = df['qsize'].describe()
rs = (df['qsize'] / df['csize']).describe()

# `Series.as_matrix()` was removed in pandas 1.0; `.values` yields the same
# ndarray and works on both old and current pandas versions.
stat = pd.DataFrame([cs.values, qs.values, rs.values],
                    columns=rs.index,
                    index=['cascade size', 'query size', 'query size / cascade size'])

In [7]:
# Bare last expression: the notebook renders the summary table as cell output.
stat

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cascade size,100.0,66.31,40.758142,11.0,26.0,60.0,107.25,129.0
query size,100.0,11.75,8.068363,1.0,6.0,9.5,16.25,33.0
query size / cascade size,100.0,0.327399,0.375362,0.008621,0.070244,0.173954,0.440994,1.727273
