## Run experiment to synthesize dataset without labelled nodes
This notebook runs the experiment to synthesize a dataset. The synthesized dataset doesn't have labels; therefore the performance is only measured on 'intrinsic' graph properties. The experiment consists of the following steps and is performed 10 times to account for stochasticity:

- train and calculate the graphsage embedding
- train DDPM network and create synthetic nodes.
- for the LSTM, MLP and bi-MLP variants:
    - train model and create synthetic edges.
    - create synthetic graph
    - measure performance
    - copy relevant data (synthetic graph, embedding) to experiment folder
    - save performance.

In [1]:
# Move one directory up so the rest of the notebook runs from the root dir.
import os

os.chdir(os.pardir)
print(os.getcwd())

/Users/tonpoppe/workspace/tigger_adj_rep/tigger_adj


In [2]:
# import modules
import pickle
import pathlib
from tigger_package.orchestrator import Orchestrator
from tigger_package.metrics.distribution_metrics import compare_metrics
from tigger_package.tools import plot_adj_matrix, plot_hist
from datetime import datetime, date
import networkx as nx 
import pandas as pd
import time
import glob


loaded


In [4]:
def copy_data(folder, variant_name, run_nr):
    """Store the artefacts of one variant/run in the experiment results folder.

    Copies the synthetic node attributes, the synthetic adjacency and the
    synthetic walks from their working locations under ``folder`` to
    ``<folder>exp_results/no_labels/``, tagging every file name with the
    variant name and the run number.

    Args:
        folder: Dataset folder as a string; it is concatenated directly with
            relative paths, so it must end with a path separator.
        variant_name: Name of the edge-synthesizer variant, used in file names.
        run_nr: Run number, used in file names.
    """
    target = folder + "exp_results/no_labels"
    pathlib.Path(target).mkdir(parents=True, exist_ok=True)
    tag = f"{variant_name}_run_{run_nr}"

    # copy synthetic nodes
    tmp = pd.read_parquet(folder + 'synth_graph/node_attributes.parquet')
    tmp.to_parquet(f"{target}/node_attributes_{tag}.parquet")

    # copy synthetic edges of synth graph
    tmp = pd.read_parquet(folder + 'synth_graph/adjacency.parquet')
    tmp.to_parquet(f"{target}/adjacency_{tag}.parquet")

    # copy synthetic walks; context managers make sure both file handles are
    # closed (and the output flushed) even when (un)pickling fails.
    # NOTE: pickle.load must only be used on files produced by this pipeline.
    with open(folder + 'synth_walks.pickle', 'rb') as src:
        obj = pickle.load(src)
    with open(f"{target}/synth_walks_{tag}.pickle", 'wb') as dst:
        pickle.dump(obj, dst)
    
def measure_performance(orchestrator, variant_name, folder=None):
    """Compare the original graph with the synthetic graph stored on disk.

    Args:
        orchestrator: Orchestrator used to load the original nodes and edges.
        variant_name: Name of the edge-synthesizer variant; passed on to
            ``compare_metrics`` (presumably used to label the result column).
        folder: Dataset folder (ending with a path separator) that contains
            ``synth_graph/``. When omitted, the module-level ``folder``
            variable is used, which is what this function relied on before
            the parameter existed.

    Returns:
        The result of ``compare_metrics`` for this variant.
    """
    if folder is None:
        # Backward compatibility with callers that rely on the notebook global.
        folder = globals()["folder"]
    nodes = orchestrator._load_nodes()
    edges = orchestrator._load_edges()
    synth_nodes = pd.read_parquet(folder + 'synth_graph/node_attributes.parquet')
    synth_edges = pd.read_parquet(folder + 'synth_graph/adjacency.parquet')
    return compare_metrics(nodes, edges, synth_nodes, synth_edges, variant_name)


def run_single_experiment(run_nr, folder):
    """Run one experiment round for the LSTM, MLP and Bi-MLP variants.

    The embedding and the synthetic nodes are created once; after that each
    variant is trained, used to sample synthetic walks and a synthetic graph,
    archived with ``copy_data`` and scored with ``measure_performance``.

    Args:
        run_nr: Run number; also used as seed for the graph synthesizer.
        folder: Dataset folder (ending with a path separator).

    Returns:
        DataFrame with one row per metric and the columns ``name``, ``type``,
        ``metric``, ``LSTM``, ``MLP``, ``Bi-MLP``, ``run_id`` and ``run_tijd``.
    """
    # create synthetic nodes and embedding
    orchestrator = Orchestrator(folder)
    nodes_sampled = 0
    # Repeat the node synthesis until the sampler returns at least one node.
    while nodes_sampled == 0:
        orchestrator.create_graphsage_embedding()
        orchestrator.train_node_synthesizer()
        orchestrator.sample_node_synthesizer()
        nodes_sampled = orchestrator._load_synthetic_nodes().shape[0]

    res = []
    for variant_name in ['LSTM', 'MLP', 'Bi-MLP']:
        # train variant and sample edges (twice the number of original edges)
        target_cnt = 2 * orchestrator._load_edges().shape[0]
        orchestrator.init_graphsynthesizer(variant_name, seed=run_nr)
        orchestrator.train_graphsyntesizer()
        orchestrator.create_synthetic_walks(
            synthesizer=orchestrator.graphsynthesizer,
            target_cnt=target_cnt
        )

        # create synth graph
        orchestrator.generate_synth_graph()

        # copy relevant data
        copy_data(folder, variant_name, run_nr)

        # measure performance on the same folder the graph was written to,
        # instead of on whatever the global `folder` happens to be
        res.append(measure_performance(orchestrator, variant_name, folder))

    # One column per variant, aligned on the metric identifiers.
    res = (res[0]
           .merge(res[1], on=['name', 'type', 'metric'])
           .merge(res[2], on=['name', 'type', 'metric'])
    )
    res['run_id'] = run_nr
    res['run_tijd'] = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
    return res[['name', 'type', 'metric', 'LSTM', 'MLP', 'Bi-MLP','run_id', 'run_tijd']]
  
def init_res_dataset(folder):
    """Load the most recent stored results so an experiment can be resumed.

    Looks for ``combined_raw_results_<YYYYMMDD>`` files under
    ``<folder>exp_results/no_labels/`` and reads the parquet file with the
    highest date stamp.

    Args:
        folder: Dataset folder as a string, ending with a path separator.

    Returns:
        Tuple ``(start_run, res)``: the first unused run number and a list
        holding the stored results DataFrame, or ``(0, [])`` when no stored
        results are found.
    """
    matching_files = glob.glob(folder + "exp_results/no_labels/combined_raw_results_*")
    # Parse the date from the file name only: splitting the full path on "."
    # breaks as soon as a directory contains a dot (e.g. "./data/" or "v1.0/").
    stamps = [pathlib.Path(f).stem[-8:] for f in matching_files]
    # Ignore files that match the glob but do not end in an 8-digit date.
    dates = [int(s) for s in stamps if len(s) == 8 and s.isdecimal()]
    if not dates:
        return 0, []

    res_path = f"{folder}exp_results/no_labels/combined_raw_results_{max(dates)}.parquet"
    res = [pd.read_parquet(res_path)]
    # Plain int so the run number behaves the same as in the fresh-start case.
    start_run = int(res[0]['run_id'].max()) + 1
    return start_run, res

def run_experiment(folder, runs=10):
    """Run the experiment ``runs`` times, continuing after any stored runs.

    After every run the combined results (stored plus new) are written to
    ``<folder>exp_results/no_labels/combined_raw_results_<YYYYMMDD>.parquet``
    so that an interrupted experiment keeps its finished runs.

    Args:
        folder: Dataset folder as a string, ending with a path separator.
        runs: Number of additional runs to perform.

    Returns:
        DataFrame with the results of all runs, stored and new. With
        ``runs=0`` these are just the stored results (an empty DataFrame when
        there are none).
    """
    start_run, res = init_res_dataset(folder)
    # Defined before the loop so that runs=0 returns the stored results
    # instead of failing on an unbound variable.
    total_df = pd.concat(res, axis=0) if res else pd.DataFrame()
    for run_nr in range(start_run, start_run + runs):
        print(f"start processing run {run_nr}")
        res.append(run_single_experiment(run_nr, folder))
        total_df = pd.concat(res, axis=0)
        # The date is taken per run, so the file name follows the current day.
        date_str = date.today().strftime("%Y%m%d")
        total_df.to_parquet(folder + f"exp_results/no_labels/combined_raw_results_{date_str}.parquet")
    return total_df

def calculate_Stats(res):
    """Summarise the raw experiment results into one row per metric group.

    Args:
        res: Raw results with the columns ``name``, ``type``, ``run_id`` and
            one value column per variant (``LSTM``, ``MLP``, ``Bi-MLP``).

    Returns:
        DataFrame indexed by ``type`` holding the mean over all collected
        rows of ``run_id`` and of the three variant columns. The groups are
        the per-run means of the node and edge attribute metrics, the
        ``dif_cluster_coef`` and ``mean_delta_widget`` rows, the difference
        between synthetic and original edge count (``Delta_edge_count``) and
        every row whose name starts with ``dif_``.
    """
    variants = ['LSTM', 'MLP', 'Bi-MLP']

    def mean_per_run(type_name):
        # Average all metrics of one type within each run.
        spec = {col: 'mean' for col in variants}
        spec['type'] = 'max'
        subset = res.loc[res['type'] == type_name]
        return subset.groupby('run_id').agg(spec).reset_index()

    def rows_as_type(mask):
        # Keep the selected rows and let their name act as the group label.
        picked = res.loc[mask, ['name'] + variants + ['run_id']]
        return picked.rename(columns={'name': 'type'})

    def edge_counts(name):
        mask = (res['type'] == 'edge_cnt') & (res['name'] == name)
        return res.loc[mask].set_index('run_id')[variants]

    # Synthetic minus original edge count, aligned on run_id.
    delta_edges = edge_counts('edge_count') - edge_counts('orig_edge_count')
    delta_edges = delta_edges.assign(type="Delta_edge_count").reset_index(names='run_id')

    parts = [
        mean_per_run('node_attributes'),
        mean_per_run('edge_attributes'),
        rows_as_type(res['name'] == 'dif_cluster_coef'),
        rows_as_type(res['name'] == 'mean_delta_widget'),
        delta_edges,
        rows_as_type(res['name'].str.startswith('dif_')),
    ]
    return pd.concat(parts, axis=0).groupby('type').mean()

# ENRON

In [None]:
# Dataset folder; run the experiment 8 more times on it.
folder = "data/enron/"
res = run_experiment(folder, runs=8)


In [None]:
# Reload the stored results and summarise them, leaving out run 0.
_, res = init_res_dataset(folder)
res = res[0]
calculate_Stats(res.loc[res['run_id'].ne(0)])

In [5]:
# Reload the stored Enron results and summarise them, leaving out run 0.
folder = "data/enron/"
_, res = init_res_dataset(folder)
res = res[0]
calculate_Stats(res.loc[res['run_id'].ne(0)])

Unnamed: 0_level_0,run_id,LSTM,MLP,Bi-MLP
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Delta_edge_count,5.5,-1971.5,-1904.7,-1404.4
dif_cluster_coef,5.5,0.283033,0.311139,0.307681
edge_attributes,5.5,0.10261,0.120324,0.124587
mean_delta_widget,5.5,0.119216,0.137549,0.067745
node_attributes,5.5,0.019775,0.019775,0.019775


In [6]:
# Load the edges of the dataset in `folder` through the orchestrator.
orchestrator = Orchestrator(folder)
edges = orchestrator._load_edges()

In [8]:
# Show (number of rows, number of columns) of the loaded edges.
edges.shape

(2689, 10)