In [1]:
import os
import pickle

import pandas as pd

from graphs import load_dataset_to_graph
from measurements import get_graph_measurements, compare_graph_measurements, \
    print_comparison_results
from recreate_graph import recreate_by_priority_rank_random_rankings, \
    recreate_by_priority_rank, get_trained_model

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# --- Experiment configuration ------------------------------------------------
# Input locations, relative to the notebook's working directory.
prepared_datasets_path = 'prepared_datasets'
prepared_dataframes_path = 'prepared_dataframes'
# Field separator handed to pd.read_csv when the prepared dataframes are loaded.
delimiter = '\t'

# Datasets to process; entries are commented in/out to select what a run covers.
prepared_dataset_names = [
    #'primary_school', 'workplace',
    'highschool_2011', #'highschool_2012', 'hospital',
    #'moreno_blogs', 'moreno_sheep', 'moreno_seventh',
    #'moreno_seventh',
    # big datasets
    #'petster-hamster', 'email-Eu'
]

# Output directory for the pickled comparison results, and how many times each
# graph is recreated per dataset.
comparison_dir = 'results'
number_of_comparisons = 10

# exist_ok=True makes this safe to re-run and avoids the check-then-create race
# of `if not os.path.exists(...): os.mkdir(...)`; makedirs also creates missing
# parent directories should comparison_dir ever become a nested path.
os.makedirs(comparison_dir, exist_ok=True)

# One (result list, pickle path) pair per experiment variant. Each list is
# filled and dumped to its path by the cell that runs that variant.
random_rankings_list = []
random_rankings_path = os.path.join(comparison_dir, 'random_rankings.pkl')

graph_attrs_list = []
graph_attrs_path = os.path.join(comparison_dir, 'graph_attrs.pkl')

real_attrs_list = []
real_attrs_path = os.path.join(comparison_dir, 'real_attrs.pkl')

graph_real_attrs_list = []
graph_real_attrs_path = os.path.join(comparison_dir, 'graph_real_attrs.pkl')

real_y_list = []
real_y_path = os.path.join(comparison_dir, 'real_y.pkl')

In [3]:
# Random Rankings
# For every selected dataset, recreate its graph `number_of_comparisons` times
# with recreate_by_priority_rank_random_rankings and store how each
# recreation's measurements compare with those of the original graph.
print('Random rankings')
# Start from an empty list so that re-running this cell does not append a
# second batch of comparisons to the ones already collected (and pickled).
random_rankings_list = []
for dataset_name in prepared_dataset_names:
    dataset_path = os.path.join(prepared_datasets_path, dataset_name)
    graph = load_dataset_to_graph(dataset_path, node_limit=500)

    print('Processing {}'.format(dataset_name))

    for i in range(number_of_comparisons):
        print('Recreation number {}...'.format(i))
        new_graph = recreate_by_priority_rank_random_rankings(graph)

        # NOTE(review): the original graph is re-measured on every iteration.
        # This could be hoisted above the loop if get_graph_measurements is
        # deterministic and the recreation does not mutate `graph` - confirm.
        graph_measurements = get_graph_measurements(graph)
        new_graph_measurements = get_graph_measurements(new_graph)
        comparison = compare_graph_measurements(graph_measurements, new_graph_measurements)
        random_rankings_list.append(comparison)
    # Written once per dataset, so results gathered so far are on disk even if
    # a later dataset fails.
    with open(random_rankings_path, 'wb') as f:
        pickle.dump(random_rankings_list, f)

Random rankings
Processing highschool_2011
Recreation number 0...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 1...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 2...
Recreation number 3...
Recreation number 4...
Recreation number 5...
Recreation number 6...
Recreation number 7...
Recreation number 8...
Recreation number 9...


In [4]:
# Only graph attributes
# For every selected dataset: fit a model on graph_attrs.csv, predict
# 'num_of_edges' for those same rows, and use the predictions to recreate the
# graph `number_of_comparisons` times, comparing each recreation's
# measurements with the original graph's.
print('Only graph attributes')
# Start from an empty list so that re-running this cell does not append a
# second batch of comparisons to the ones already collected (and pickled).
graph_attrs_list = []
for dataset_name in prepared_dataset_names:
    dataset_path = os.path.join(prepared_datasets_path, dataset_name)
    dataframe_path = os.path.join(prepared_dataframes_path, dataset_name)
    graph = load_dataset_to_graph(dataset_path, node_limit=500)

    print('Processing {}'.format(dataset_name))
    graph_attrs_dataframe_path = os.path.join(dataframe_path, 'graph_attrs.csv')
    df = pd.read_csv(graph_attrs_dataframe_path, delimiter=delimiter)

    model = get_trained_model(df, epochs=4)
    # drop target column
    X_test = df.drop(['num_of_edges'], axis=1)
    # predict num_edges (same rows the model was given, minus the target)
    y_pred = model.predict(X_test)

    for i in range(number_of_comparisons):
        print('Recreation number {}...'.format(i))
        new_graph = recreate_by_priority_rank(graph, y_pred)

        # NOTE(review): the original graph is re-measured on every iteration.
        # This could be hoisted above the loop if get_graph_measurements is
        # deterministic and the recreation does not mutate `graph` - confirm.
        graph_measurements = get_graph_measurements(graph)
        new_graph_measurements = get_graph_measurements(new_graph)
        comparison = compare_graph_measurements(graph_measurements, new_graph_measurements)
        graph_attrs_list.append(comparison)
    # Written once per dataset, so results gathered so far are on disk even if
    # a later dataset fails.
    with open(graph_attrs_path, 'wb') as f:
        pickle.dump(graph_attrs_list, f)

Only graph attributes
Processing highschool_2011
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
loss: 0.0033145744811421325, accuracy: 0.7846434870244394
Recreation number 0...
Recreation number 1...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 2...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 3...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 4...
Recreation number 5...
Recreation number 6...
Recreation number 7...
Recreation number 8...
Recreation number 9...


In [5]:
# Only real attributes
# For every selected dataset: fit a model on real_attrs.csv, predict
# 'num_of_edges' for those same rows, and use the predictions to recreate the
# graph `number_of_comparisons` times, comparing each recreation's
# measurements with the original graph's.
print('Only real attributes')
# Start from an empty list so that re-running this cell does not append a
# second batch of comparisons to the ones already collected (and pickled).
real_attrs_list = []
for dataset_name in prepared_dataset_names:
    dataset_path = os.path.join(prepared_datasets_path, dataset_name)
    dataframe_path = os.path.join(prepared_dataframes_path, dataset_name)
    graph = load_dataset_to_graph(dataset_path, node_limit=500)

    print('Processing {}'.format(dataset_name))
    real_attrs_dataframe_path = os.path.join(dataframe_path, 'real_attrs.csv')
    df = pd.read_csv(real_attrs_dataframe_path, delimiter=delimiter)

    model = get_trained_model(df, epochs=4)
    # drop target column
    X_test = df.drop(['num_of_edges'], axis=1)
    # predict num_edges (same rows the model was given, minus the target)
    y_pred = model.predict(X_test)

    for i in range(number_of_comparisons):
        print('Recreation number {}...'.format(i))
        new_graph = recreate_by_priority_rank(graph, y_pred)

        # NOTE(review): the original graph is re-measured on every iteration.
        # This could be hoisted above the loop if get_graph_measurements is
        # deterministic and the recreation does not mutate `graph` - confirm.
        graph_measurements = get_graph_measurements(graph)
        new_graph_measurements = get_graph_measurements(new_graph)
        comparison = compare_graph_measurements(graph_measurements, new_graph_measurements)
        real_attrs_list.append(comparison)
    # Written once per dataset, so results gathered so far are on disk even if
    # a later dataset fails.
    with open(real_attrs_path, 'wb') as f:
        pickle.dump(real_attrs_list, f)

Only real attributes
Processing highschool_2011
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
loss: 0.003025658960149357, accuracy: 0.7846434870244394
Recreation number 0...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 1...
Recreation number 2...
Recreation number 3...
Recreation number 4...
Recreation number 5...
Recreation number 6...
Recreation number 7...
Recreation number 8...
Recreation number 9...


In [6]:
# Both graph and real attributes
# For every selected dataset: fit a model on graph_real_attrs.csv, predict
# 'num_of_edges' for those same rows, and use the predictions to recreate the
# graph `number_of_comparisons` times, comparing each recreation's
# measurements with the original graph's.
print('Both graph and real attributes')
# Start from an empty list so that re-running this cell does not append a
# second batch of comparisons to the ones already collected (and pickled).
graph_real_attrs_list = []
for dataset_name in prepared_dataset_names:
    dataset_path = os.path.join(prepared_datasets_path, dataset_name)
    dataframe_path = os.path.join(prepared_dataframes_path, dataset_name)
    graph = load_dataset_to_graph(dataset_path, node_limit=500)

    print('Processing {}'.format(dataset_name))
    graph_real_attrs_dataframe_path = os.path.join(dataframe_path, 'graph_real_attrs.csv')
    df = pd.read_csv(graph_real_attrs_dataframe_path, delimiter=delimiter)

    model = get_trained_model(df, epochs=4)
    # drop target column
    X_test = df.drop(['num_of_edges'], axis=1)
    # predict num_edges (same rows the model was given, minus the target)
    y_pred = model.predict(X_test)

    for i in range(number_of_comparisons):
        print('Recreation number {}...'.format(i))
        new_graph = recreate_by_priority_rank(graph, y_pred)

        # NOTE(review): the original graph is re-measured on every iteration.
        # This could be hoisted above the loop if get_graph_measurements is
        # deterministic and the recreation does not mutate `graph` - confirm.
        graph_measurements = get_graph_measurements(graph)
        new_graph_measurements = get_graph_measurements(new_graph)
        comparison = compare_graph_measurements(graph_measurements, new_graph_measurements)
        graph_real_attrs_list.append(comparison)
    # Written once per dataset, so results gathered so far are on disk even if
    # a later dataset fails.
    with open(graph_real_attrs_path, 'wb') as f:
        pickle.dump(graph_real_attrs_list, f)

Both graph and real attributes
Processing highschool_2011
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
loss: 0.0024736529162728963, accuracy: 0.7846434870244394
Recreation number 0...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 1...
Recreation number 2...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 3...
Recreation number 4...
Recreation number 5...
Recreation number 6...
Recreation number 7...
Recreation number 8...
Recreation number 9...


In [7]:
# Real y
# For every selected dataset: skip the model and recreate the graph directly
# from the 'num_of_edges' column of no_attrs.csv, `number_of_comparisons`
# times, comparing each recreation's measurements with the original graph's.
print('Real y')
# Start from an empty list so that re-running this cell does not append a
# second batch of comparisons to the ones already collected (and pickled).
real_y_list = []
for dataset_name in prepared_dataset_names:
    dataset_path = os.path.join(prepared_datasets_path, dataset_name)
    dataframe_path = os.path.join(prepared_dataframes_path, dataset_name)
    graph = load_dataset_to_graph(dataset_path, node_limit=500)

    print('Processing {}'.format(dataset_name))
    real_y_dataframe_path = os.path.join(dataframe_path, 'no_attrs.csv')
    df = pd.read_csv(real_y_dataframe_path, delimiter=delimiter)

    # Series.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # .values yields the same NumPy array on old and new pandas alike. The
    # column is extracted once here instead of on every loop iteration.
    real_y = df['num_of_edges'].values

    for i in range(number_of_comparisons):
        print('Recreation number {}...'.format(i))
        new_graph = recreate_by_priority_rank(graph, real_y)

        # NOTE(review): the original graph is re-measured on every iteration.
        # This could be hoisted above the loop if get_graph_measurements is
        # deterministic and the recreation does not mutate `graph` - confirm.
        graph_measurements = get_graph_measurements(graph)
        new_graph_measurements = get_graph_measurements(new_graph)
        comparison = compare_graph_measurements(graph_measurements, new_graph_measurements)
        real_y_list.append(comparison)
    # Written once per dataset, so results gathered so far are on disk even if
    # a later dataset fails.
    with open(real_y_path, 'wb') as f:
        pickle.dump(real_y_list, f)

Real y
Processing highschool_2011
Recreation number 0...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 1...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 2...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 3...
Recreation number 4...
Recreation number 5...
Recreation number 6...
Recreation number 7...
Recreation number 8...
Recreation number 9...
