## Run experiment to synthesize dataset without labelled nodes
This notebook runs the experiment to synthesize a dataset. The synthesized dataset doesn't have labels, therefore the performance is only measured on 'intrinsic' graph properties. The experiment consists of the following steps and is performed 10 times to account for stochasticity:

- train and calculate the graphsage embedding
- train DDPM network and create synthetic nodes.
- for the LSTM, MLP and bi-MLP variants:
    - train model and create synthetic edges.
    - create synthetic graph
    - measure performance
    - copy relevant data (synthetic graph, embedding) to the experiment folder
    - save performance.

In [1]:
# Set root dir
import os

# Remember the directory the kernel started in. Without this anchor, every
# re-run of this cell would climb one directory higher (`os.chdir('..')` is
# relative to the *current* directory), silently breaking all relative paths.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Always move to the parent of the start directory, no matter how often the
# cell is executed within one kernel session.
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
print(os.getcwd())

/Users/tonpoppe/workspace/tigger_adj_rep/tigger_adj


In [23]:
# import modules
import pickle
import pathlib
from tigger_package.orchestrator import Orchestrator
from tigger_package.metrics.distribution_metrics import compare_metrics
from tigger_package.tools import plot_adj_matrix, plot_hist
from datetime import datetime, date
import networkx as nx 
import pandas as pd
import time


In [36]:
def copy_data(folder, variant_name, run_nr):
    """Snapshot the current synthetic artefacts into the experiment results folder.

    Reads the synthetic node attributes, the synthetic adjacency and the
    synthetic walks from `folder` and writes them to
    `<folder>exp_results/no_labels/`, tagging every file name with the model
    variant and the run number so later runs do not overwrite earlier ones.

    Args:
        folder: dataset folder as a string. Paths are built by plain string
            concatenation, so it must end with a path separator
            (e.g. "data/erdos/").
        variant_name: name of the model variant, used in the file names.
        run_nr: number of the experiment run, used in the file names.

    Returns:
        None. The function only writes files.
    """
    out_dir = folder + "exp_results/no_labels"
    pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
    tag = f"{variant_name}_run_{run_nr}"

    # copy synthetic nodes
    synth_nodes = pd.read_parquet(folder + 'synth_graph/node_attributes.parquet')
    synth_nodes.to_parquet(f'{out_dir}/node_attributes_{tag}.parquet')

    # copy synthetic edges of synth graph
    synth_edges = pd.read_parquet(folder + 'synth_graph/adjacency.parquet')
    synth_edges.to_parquet(f'{out_dir}/adjacency_{tag}.parquet')

    # copy synthetic walks
    # Context managers guarantee both handles are closed (and the copy is
    # flushed to disk) even if (un)pickling raises.
    # NOTE: pickle.load can execute arbitrary code; only use this on files
    # produced by this pipeline.
    with open(folder + 'synth_walks.pickle', 'rb') as src:
        synth_walks = pickle.load(src)
    with open(f'{out_dir}/synth_walks_{tag}.pickle', 'wb') as dst:
        pickle.dump(synth_walks, dst)
    
def measure_performance(orchestrator, variant_name, folder=None):
    """Compare the original graph with the synthetic graph currently on disk.

    Args:
        orchestrator: object exposing `_load_nodes()` and `_load_edges()`,
            which provide the original nodes and edges.
        variant_name: name of the model variant; forwarded to
            `compare_metrics` as last argument.
        folder: dataset folder (string ending with a path separator) that
            contains `synth_graph/`. When omitted, the notebook-level variable
            `folder` is used, which keeps the old two-argument call working.

    Returns:
        Whatever `compare_metrics` returns for the original and synthetic
        nodes/edges.

    Raises:
        ValueError: if `folder` is not given and no notebook-level `folder`
            variable exists.
    """
    if folder is None:
        # Legacy behaviour: this function used to read the global `folder`
        # implicitly, which measured the wrong dataset whenever the caller
        # was working on a different folder than the global one.
        folder = globals().get("folder")
        if folder is None:
            raise ValueError(
                "measure_performance needs a dataset folder: pass `folder` explicitly"
            )

    nodes = orchestrator._load_nodes()
    edges = orchestrator._load_edges()
    synth_nodes = pd.read_parquet(folder + 'synth_graph/node_attributes.parquet')
    synth_edges = pd.read_parquet(folder + 'synth_graph/adjacency.parquet')
    return compare_metrics(nodes, edges, synth_nodes, synth_edges, variant_name)


def run_single_experiment(run_nr, folder):
    """Train every edge-synthesizer variant once and collect its metrics.

    For each variant the synthesizer is initialised with `seed=run_nr`,
    trained, used to create synthetic walks and a synthetic graph; the
    artefacts are copied to the experiment folder and the metrics measured.

    Args:
        run_nr: number of this run; used as seed, in the copied file names and
            as value of the `run_id` column.
        folder: dataset folder (string ending with a path separator).

    Returns:
        DataFrame with the columns `name`, `type`, `metric`, one column per
        variant, `run_id` and `run_tijd` (timestamp of completion).
    """
    variants = ['LSTM', 'MLP', 'Bi-MLP']
    merge_keys = ['name', 'type', 'metric']

    # create synthetic nodes and embedding
    orchestrator = Orchestrator(folder)
    # The embedding and node-synthesis steps are currently disabled.
    # NOTE(review): presumably artefacts of an earlier run are reused; confirm.
    # orchestrator.create_graphsage_embedding()
    # orchestrator.train_node_synthesizer()
    # orchestrator.sample_node_synthesizer()

    # NOTE(review): the saved output of this notebook shows identical metric
    # values for all three variants. Verify that the synthetic graph is really
    # rebuilt from the walks of the freshly trained synthesizer
    # (`graphsynthesizer2`) - TODO confirm.
    per_variant = []
    for variant_name in variants:
        # train variant and sample edges
        # target is twice the number of rows in the original edge table
        target_cnt = 2 * orchestrator._load_edges().shape[0]
        orchestrator.init_graphsynthesizer(variant_name, seed=run_nr)
        orchestrator.train_graphsyntesizer()
        orchestrator.create_synthetic_walks(
            synthesizer=orchestrator.graphsynthesizer2,
            target_cnt=target_cnt
        )

        # create synth graph
        orchestrator.generate_synth_graph()

        # copy relevant data
        copy_data(folder, variant_name, run_nr)

        # measure performance on the dataset folder of *this* experiment
        per_variant.append(measure_performance(orchestrator, variant_name, folder))

    # Join the per-variant frames on the metric identifiers; driven by the
    # variant list so adding or removing a variant needs no further changes.
    combined = per_variant[0]
    for frame in per_variant[1:]:
        combined = combined.merge(frame, on=merge_keys)

    combined['run_id'] = run_nr
    combined['run_tijd'] = datetime.now().strftime("%d-%m-%Y %H:%M:%S")
    return combined[merge_keys + variants + ['run_id', 'run_tijd']]
  
def run_experiment(folder, runs=10):
    """Repeat the single experiment `runs` times and persist the results.

    After every run the results collected so far are concatenated and written
    to `<folder>exp_results/no_labels/combined_raw_results_<YYYYMMDD>.parquet`,
    so a crash in a later run does not lose the earlier ones.

    Args:
        folder: dataset folder (string ending with a path separator).
        runs: number of repetitions; the run number is passed on to
            `run_single_experiment`.

    Returns:
        DataFrame with the concatenated results of all runs. An empty
        DataFrame when `runs` is zero or negative.
    """
    if runs <= 0:
        # Nothing to run; previously this ended in an UnboundLocalError.
        return pd.DataFrame()

    # Make sure the target directory exists instead of relying on a callee
    # having created it as a side effect.
    pathlib.Path(folder + "exp_results/no_labels").mkdir(parents=True, exist_ok=True)

    # Fix the file name once, so a series of runs that crosses midnight keeps
    # writing to one checkpoint file instead of starting a second one.
    date_str = date.today().strftime("%Y%m%d")
    out_file = folder + f"exp_results/no_labels/combined_raw_results_{date_str}.parquet"

    run_results = []
    total_df = pd.DataFrame()
    for run_nr in range(runs):
        run_results.append(run_single_experiment(run_nr, folder))
        # checkpoint: rewrite the combined file after every completed run
        total_df = pd.concat(run_results, axis=0)
        total_df.to_parquet(out_file)
    return total_df
    
          
   

# ERDOS

In [37]:
# Dataset folder for the Erdos graph. The trailing slash is required because
# the helper functions build paths by plain string concatenation.
# Run the experiment with 2 repetitions (run_experiment defaults to 10).
folder = "data/erdos/"
res = run_experiment(folder, 2)


  super()._check_params_vs_input(X, default_n_init=10)


  0%|          | 0/9995 [00:00<?, ?it/s]

Number of components, 22


  super()._check_params_vs_input(X, default_n_init=10)


  0%|          | 0/9995 [00:00<?, ?it/s]

Number of components, 22




Unnamed: 0,name,type,metric,LSTM,MLP,Bi-MLP,run_id,run_tijd
0,attr0,node_attributes,Wasserstein_distance,0.345531,0.345531,0.345531,0,28-12-2023 15:28:47
1,attr1,node_attributes,Wasserstein_distance,0.241310,0.241310,0.241310,0,28-12-2023 15:28:47
2,attr2,node_attributes,Wasserstein_distance,0.024972,0.024972,0.024972,0,28-12-2023 15:28:47
3,attr3,node_attributes,Wasserstein_distance,0.022558,0.022558,0.022558,0,28-12-2023 15:28:47
4,attr4,node_attributes,Wasserstein_distance,0.038248,0.038248,0.038248,0,28-12-2023 15:28:47
...,...,...,...,...,...,...,...,...
59,mean_delta_widget,widget_count,fraction,0.096716,0.096716,0.096716,1,28-12-2023 15:29:08
60,edge_count,edge_cnt,count,96560.000000,96560.000000,96560.000000,1,28-12-2023 15:29:08
61,orig_edge_count,edge_cnt,count,9995.000000,9995.000000,9995.000000,1,28-12-2023 15:29:08
62,node_count,edge_cnt,count,1000.000000,1000.000000,1000.000000,1,28-12-2023 15:29:08


In [38]:
# Display the combined per-run metrics returned by run_experiment.
res

Unnamed: 0,name,type,metric,LSTM,MLP,Bi-MLP,run_id,run_tijd
0,attr0,node_attributes,Wasserstein_distance,0.345531,0.345531,0.345531,0,28-12-2023 15:28:47
1,attr1,node_attributes,Wasserstein_distance,0.241310,0.241310,0.241310,0,28-12-2023 15:28:47
2,attr2,node_attributes,Wasserstein_distance,0.024972,0.024972,0.024972,0,28-12-2023 15:28:47
3,attr3,node_attributes,Wasserstein_distance,0.022558,0.022558,0.022558,0,28-12-2023 15:28:47
4,attr4,node_attributes,Wasserstein_distance,0.038248,0.038248,0.038248,0,28-12-2023 15:28:47
...,...,...,...,...,...,...,...,...
59,mean_delta_widget,widget_count,fraction,0.096716,0.096716,0.096716,1,28-12-2023 15:29:08
60,edge_count,edge_cnt,count,96560.000000,96560.000000,96560.000000,1,28-12-2023 15:29:08
61,orig_edge_count,edge_cnt,count,9995.000000,9995.000000,9995.000000,1,28-12-2023 15:29:08
62,node_count,edge_cnt,count,1000.000000,1000.000000,1000.000000,1,28-12-2023 15:29:08
