In [68]:
import numpy as np
import pandas as pd
import pickle as pkl

from tqdm import tqdm
from glob import glob

from graph_tool import load_graph
from paper_experiment import get_tree
from cascade import observe_cascade
from gt_utils import extract_nodes, extract_edges
from evaluate import edge_order_accuracy


In [26]:
g = load_graph('data/digg/cascade_graph.gt')

In [27]:
infection_times = pkl.load(open('data/digg/cascade.pkl', 'rb'))

In [65]:
def run_k_runs(g, q, infection_times, method, 
               k, result_dir, 
               verbose=False):
    for i in range(k):
        obs = observe_cascade(infection_times, source=None, q=q)
        tree = get_tree(g, infection_times, source=None, obs_nodes=obs, method=method, verbose=verbose)

        pred_edges = extract_edges(tree)
        pkl.dump(pred_edges,
                 open(result_dir + '/{}.pkl'.format(i), 'wb'))    

In [66]:
def evaluate(pred_edges, infection_times):
    pred_nodes = set([i for e in pred_edges for i in e ])
    true_nodes = set(np.nonzero(infection_times>=0)[0])

    prec = len(pred_nodes.intersection(true_nodes)) / len(pred_nodes)
    rec = len(pred_nodes.intersection(true_nodes)) / len(true_nodes)

    order_acc = edge_order_accuracy(extract_edges(tree), infection_times)
    return prec, rec, order_acc

In [67]:
def evaluate_from_result_dir(result_dir, qs, infection_times):
    for q in tqdm(qs):
        rows = []
        for p in glob(result_dir + "/{}/*.pkl".format(q)):
            # print(p)
            # TODO: add root
            pred_edges = pkl.load(open(p, 'rb'))

            scores = evaluate(pred_edges, infection_times)
            rows.append(scores)
        path = result_dir + "/{}.pkl".format(q)
        if rows:
            df = pd.DataFrame(rows, columns=['n.prec', 'n.rec',
                                             'order accuracy'])
            yield (path, df)
        else:
            if os.path.exists(path):
                os.remove(path)
            yield None

In [64]:
q=0.005
k=10
method = 'closure'
result_dir = 'outputs/real_cascade'
run_k_runs(g, q, infection_times, method, k, result_dir, verbose=False)

False


KeyboardInterrupt: 

In [63]:
for path, df in evaluate_from_result_dir('outputs/real_cascade/test', qs=[""], infection_times=infection_times):
    summary = df.describe()
    print(summary)
    print('writing to {}'.format(path))
    summary.to_pickle(path)

100%|██████████| 1/1 [00:00<00:00, 12.03it/s]

       n.prec      n.rec  order accuracy
count    10.0  10.000000    1.000000e+01
mean      1.0   0.009072    4.779412e-01
std       0.0   0.000518    5.851389e-17
min       1.0   0.008436    4.779412e-01
25%       1.0   0.008696    4.779412e-01
50%       1.0   0.008899    4.779412e-01
75%       1.0   0.009476    4.779412e-01
max       1.0   0.010054    4.779412e-01
writing to outputs/real_cascade/test/.pkl



