In [4]:
import os
import pickle

import pandas as pd

from graphs import load_dataset_to_graph
from measurements import get_graph_measurements, compare_graph_measurements, \
    print_comparison_results
from recreate_graph import recreate_by_priority_rank_random_rankings, \
    recreate_by_priority_rank, get_trained_model

In [5]:
# Root folders: one sub-folder per dataset name under each of them.
prepared_datasets_path = 'prepared_datasets'
prepared_dataframes_path = 'prepared_dataframes'
# Column separator handed to pd.read_csv when loading the feature tables.
delimiter = '\t'

# Every dataset the experiment cells iterate over, smaller ones first.
prepared_dataset_names = [
    'primary_school',
    'workplace',
    'highschool_2011',
    'highschool_2012',
    'hospital',
    'moreno_blogs',
    'moreno_sheep',
    'moreno_seventh',
] + [
    # big datasets
    'petster-hamster',
    'email-Eu',
]

# Pickled comparison results are written to <comparison_dir>/<dataset_name>/.
comparison_dir = 'results'
# How many recreations are produced (and compared) per dataset and experiment.
number_of_comparisons = 10

In [6]:
# Random Rankings
# For every dataset: recreate the graph `number_of_comparisons` times with
# recreate_by_priority_rank_random_rankings, compare each recreation against
# the original graph, and pickle the list of comparisons to
# <comparison_dir>/<dataset_name>/random_rankings.pkl.
print('Random rankings')
for dataset_name in prepared_dataset_names:
    dataset_path = os.path.join(prepared_datasets_path, dataset_name)
    graph = load_dataset_to_graph(dataset_path)

    random_rankings_list = []
    results_dir = os.path.join(comparison_dir, dataset_name)
    random_rankings_path = os.path.join(results_dir, 'random_rankings.pkl')
    # exist_ok=True replaces the exists()/makedirs() pair: no race between the
    # check and the creation, and re-running the cell stays a no-op here.
    os.makedirs(results_dir, exist_ok=True)

    print('Processing {}'.format(dataset_name))

    for i in range(number_of_comparisons):
        print('Recreation number {}...'.format(i))
        new_graph = recreate_by_priority_rank_random_rankings(graph)

        # NOTE(review): `graph` is not reassigned inside this loop, so this
        # call looks loop-invariant; hoist it above the loop once it is
        # confirmed that neither helper mutates `graph` and that the
        # measurements are deterministic.
        graph_measurements = get_graph_measurements(graph)
        new_graph_measurements = get_graph_measurements(new_graph)
        comparison = compare_graph_measurements(graph_measurements, new_graph_measurements)
        random_rankings_list.append(comparison)

    # Results are written once per dataset, after all recreations finished.
    with open(random_rankings_path, 'wb') as f:
        pickle.dump(random_rankings_list, f)

Random rankings
Processing primary_school
Recreation number 0...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 1...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 2...
Recreation number 3...
Recreation number 4...
Recreation number 5...
Recreation number 6...
Recreation number 7...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 8...
Recreation number 9...
Processing workplace
Recreation number 0...
Recreation number 1...
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreation number 2...
Recreation number 3...
Recreation number 4...
Recreation number 5...
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreation number 6...
Recreation number 7...
Recreation number 8...
Recreation number 9...
Processing highschool_2011
Recreation number 0...
Recreation number 1...
Recreation number 2...
Recreation number 3...
Recreation number 4...
Recreation number 5...
Recreation number 6...
Recreation number 7...
Recreation number 8...
Recreation number 9...
Processing highschool_2012
Recreation number 0...
Recreation number 1...
Recreation number 2...
Recreation number 3...
Recreation number 4...
Recreation number 5...
Recreation number 6...
Recreation number 7...
Recreation number 8...
Recreation num

Cannot compute average_shortest_path_length - Graph is not weakly connected.
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreation number 8...
Cannot compute average_shortest_path_length - Graph is not weakly connected.
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreation number 9...
Cannot compute average_shortest_path_length - Graph is not weakly connected.
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cutting nodes up to 600
Processing email-Eu
Recreation number 0...
Cannot compute average_shortest_path_length - Graph is not wea

In [7]:
# Only graph attributes
# For every dataset: fit a model on graph_attrs.csv, predict `num_of_edges`
# for every row, recreate the graph from those predictions
# `number_of_comparisons` times, and pickle the comparisons against the
# original graph to <comparison_dir>/<dataset_name>/graph_attrs.pkl.
print('Only graph attributes')
for dataset_name in prepared_dataset_names:
    dataset_path = os.path.join(prepared_datasets_path, dataset_name)
    dataframe_path = os.path.join(prepared_dataframes_path, dataset_name)
    graph = load_dataset_to_graph(dataset_path)

    graph_attrs_list = []
    results_dir = os.path.join(comparison_dir, dataset_name)
    graph_attrs_path = os.path.join(results_dir, 'graph_attrs.pkl')
    # exist_ok=True replaces the exists()/makedirs() pair: no race between the
    # check and the creation, and re-running the cell stays a no-op here.
    os.makedirs(results_dir, exist_ok=True)

    print('Processing {}'.format(dataset_name))
    graph_attrs_dataframe_path = os.path.join(dataframe_path, 'graph_attrs.csv')
    df = pd.read_csv(graph_attrs_dataframe_path, delimiter=delimiter)

    model = get_trained_model(df, epochs=16)
    # Drop the target column so only the feature columns are fed to predict().
    # NOTE(review): predictions are made for the same rows that were passed to
    # get_trained_model - confirm this is the intended evaluation setup.
    X_test = df.drop(['num_of_edges'], axis=1)
    # Predicted num_of_edges, one prediction per row of df.
    y_pred = model.predict(X_test)

    for i in range(number_of_comparisons):
        print('Recreation number {}...'.format(i))
        new_graph = recreate_by_priority_rank(graph, y_pred)

        # NOTE(review): `graph` is not reassigned inside this loop, so this
        # call looks loop-invariant; hoist it above the loop once it is
        # confirmed that neither helper mutates `graph` and that the
        # measurements are deterministic.
        graph_measurements = get_graph_measurements(graph)
        new_graph_measurements = get_graph_measurements(new_graph)
        comparison = compare_graph_measurements(graph_measurements, new_graph_measurements)
        graph_attrs_list.append(comparison)

    # Results are written once per dataset, after all recreations finished.
    with open(graph_attrs_path, 'wb') as f:
        pickle.dump(graph_attrs_list, f)

Only graph attributes
Processing primary_school
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
loss: 0.0008328252598355457, accuracy: 0.7159688545864353
Recreation number 0...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 1...
Recreation number 2...
Recreation number 3...
Recreation number 4...
Recreation number 5...
Recreation number 6...
Recreation number 7...
Recreation number 8...
Recreation number 9...
Processing workplace
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
loss: 0.0015316372300249622, accuracy: 0.8215973534971645
Recreation number 0...
Recreation number 1...
Recreation number 2...
Recreation number 3...
Recreation number 4...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 5...
Recreation number 6...


  return (xy*(M-ab)).sum()/numpy.sqrt(vara*varb)


Recreation number 7...
Recreation number 8...
Recreation number 9...
Processing highschool_2011
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
loss: 0.0011981737440679941, accuracy: 0.7846434870244394
Recreation number 0...
Recreation number 1...
Recreation number 2...
Recreation number 3...
Recreation number 4...
Recreation number 5...
Recreation number 6...
Recreation number 7...
Recreation number 8...
Recreation number 9...
Processing highschool_2012
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
loss: 0.0006409419702121913, accuracy: 0.8629629629629629
Recreation number 0...
Recreation number 1...
Recreation number 2...
Recreation number 3...
Recreation number 4...
Recreation number 5...
Recreation number 6...
Recr

Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreation number 2...
Cannot compute average_shortest_path_length - Graph is not weakly connected.
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreation number 3...
Cannot compute average_shortest_path_length - Graph is not weakly connected.
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreation number 4...
Cannot compute average_shortest_path_length - Graph is not weakly connected.
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreat

Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
loss: 0.00519362663935553, accuracy: 0.9948111111111111
Recreation number 0...
Cannot compute average_shortest_path_length - Graph is not weakly connected.
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreation number 1...
Cannot compute average_shortest_path_length - Graph is not weakly connected.
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreation number 2...
Cannot compute average_shortest_path_length - Graph is not weakly connected.
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cannot compute diameter - Found infinite pa

Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreation number 8...
Cannot compute average_shortest_path_length - Graph is not weakly connected.
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Recreation number 9...
Cannot compute average_shortest_path_length - Graph is not weakly connected.
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected
Cannot compute diameter - Found infinite path length because the digraph is not strongly connected


In [None]:
# Only real attributes
# For every dataset: fit a model on real_attrs.csv, predict `num_of_edges`
# for every row, recreate the graph from those predictions
# `number_of_comparisons` times, and pickle the comparisons against the
# original graph to <comparison_dir>/<dataset_name>/real_attrs.pkl.
print('Only real attributes')
for dataset_name in prepared_dataset_names:
    dataset_path = os.path.join(prepared_datasets_path, dataset_name)
    dataframe_path = os.path.join(prepared_dataframes_path, dataset_name)
    graph = load_dataset_to_graph(dataset_path)

    real_attrs_list = []
    results_dir = os.path.join(comparison_dir, dataset_name)
    real_attrs_path = os.path.join(results_dir, 'real_attrs.pkl')
    # exist_ok=True replaces the exists()/makedirs() pair: no race between the
    # check and the creation, and re-running the cell stays a no-op here.
    os.makedirs(results_dir, exist_ok=True)

    print('Processing {}'.format(dataset_name))
    real_attrs_dataframe_path = os.path.join(dataframe_path, 'real_attrs.csv')
    df = pd.read_csv(real_attrs_dataframe_path, delimiter=delimiter)

    model = get_trained_model(df, epochs=16)
    # Drop the target column so only the feature columns are fed to predict().
    # NOTE(review): predictions are made for the same rows that were passed to
    # get_trained_model - confirm this is the intended evaluation setup.
    X_test = df.drop(['num_of_edges'], axis=1)
    # Predicted num_of_edges, one prediction per row of df.
    y_pred = model.predict(X_test)

    for i in range(number_of_comparisons):
        print('Recreation number {}...'.format(i))
        new_graph = recreate_by_priority_rank(graph, y_pred)

        # NOTE(review): `graph` is not reassigned inside this loop, so this
        # call looks loop-invariant; hoist it above the loop once it is
        # confirmed that neither helper mutates `graph` and that the
        # measurements are deterministic.
        graph_measurements = get_graph_measurements(graph)
        new_graph_measurements = get_graph_measurements(new_graph)
        comparison = compare_graph_measurements(graph_measurements, new_graph_measurements)
        real_attrs_list.append(comparison)

    # Results are written once per dataset, after all recreations finished.
    with open(real_attrs_path, 'wb') as f:
        pickle.dump(real_attrs_list, f)

In [None]:
# Both graph and real attributes
# For every dataset: fit a model on graph_real_attrs.csv, predict
# `num_of_edges` for every row, recreate the graph from those predictions
# `number_of_comparisons` times, and pickle the comparisons against the
# original graph to <comparison_dir>/<dataset_name>/graph_real_attrs.pkl.
print('Both graph and real attributes')
for dataset_name in prepared_dataset_names:
    dataset_path = os.path.join(prepared_datasets_path, dataset_name)
    dataframe_path = os.path.join(prepared_dataframes_path, dataset_name)
    graph = load_dataset_to_graph(dataset_path)

    graph_real_attrs_list = []
    results_dir = os.path.join(comparison_dir, dataset_name)
    graph_real_attrs_path = os.path.join(results_dir, 'graph_real_attrs.pkl')
    # exist_ok=True replaces the exists()/makedirs() pair: no race between the
    # check and the creation, and re-running the cell stays a no-op here.
    os.makedirs(results_dir, exist_ok=True)

    print('Processing {}'.format(dataset_name))
    graph_real_attrs_dataframe_path = os.path.join(dataframe_path, 'graph_real_attrs.csv')
    df = pd.read_csv(graph_real_attrs_dataframe_path, delimiter=delimiter)

    model = get_trained_model(df, epochs=16)
    # Drop the target column so only the feature columns are fed to predict().
    # NOTE(review): predictions are made for the same rows that were passed to
    # get_trained_model - confirm this is the intended evaluation setup.
    X_test = df.drop(['num_of_edges'], axis=1)
    # Predicted num_of_edges, one prediction per row of df.
    y_pred = model.predict(X_test)

    for i in range(number_of_comparisons):
        print('Recreation number {}...'.format(i))
        new_graph = recreate_by_priority_rank(graph, y_pred)

        # NOTE(review): `graph` is not reassigned inside this loop, so this
        # call looks loop-invariant; hoist it above the loop once it is
        # confirmed that neither helper mutates `graph` and that the
        # measurements are deterministic.
        graph_measurements = get_graph_measurements(graph)
        new_graph_measurements = get_graph_measurements(new_graph)
        comparison = compare_graph_measurements(graph_measurements, new_graph_measurements)
        graph_real_attrs_list.append(comparison)

    # Results are written once per dataset, after all recreations finished.
    with open(graph_real_attrs_path, 'wb') as f:
        pickle.dump(graph_real_attrs_list, f)

In [None]:
# Real y
# For every dataset: recreate the graph `number_of_comparisons` times from
# the `num_of_edges` column stored in no_attrs.csv (no model involved),
# compare each recreation against the original graph, and pickle the
# comparisons to <comparison_dir>/<dataset_name>/real_y.pkl.
print('Real y')
for dataset_name in prepared_dataset_names:
    dataset_path = os.path.join(prepared_datasets_path, dataset_name)
    dataframe_path = os.path.join(prepared_dataframes_path, dataset_name)
    graph = load_dataset_to_graph(dataset_path)

    real_y_list = []
    results_dir = os.path.join(comparison_dir, dataset_name)
    real_y_path = os.path.join(results_dir, 'real_y.pkl')
    # exist_ok=True replaces the exists()/makedirs() pair: no race between the
    # check and the creation, and re-running the cell stays a no-op here.
    os.makedirs(results_dir, exist_ok=True)

    print('Processing {}'.format(dataset_name))
    real_y_dataframe_path = os.path.join(dataframe_path, 'no_attrs.csv')
    df = pd.read_csv(real_y_dataframe_path, delimiter=delimiter)

    # Series.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # .values yields the same ndarray on old and new pandas alike. The column
    # does not change between iterations, so it is extracted once.
    real_y = df['num_of_edges'].values

    for i in range(number_of_comparisons):
        print('Recreation number {}...'.format(i))
        new_graph = recreate_by_priority_rank(graph, real_y)

        # NOTE(review): `graph` is not reassigned inside this loop, so this
        # call looks loop-invariant; hoist it above the loop once it is
        # confirmed that neither helper mutates `graph` and that the
        # measurements are deterministic.
        graph_measurements = get_graph_measurements(graph)
        new_graph_measurements = get_graph_measurements(new_graph)
        comparison = compare_graph_measurements(graph_measurements, new_graph_measurements)
        real_y_list.append(comparison)

    # Results are written once per dataset, after all recreations finished.
    with open(real_y_path, 'wb') as f:
        pickle.dump(real_y_list, f)