### DCSBM
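Fits a DCSBM (degree-corrected stochastic block model) to each of the 4 small synthetic datasets, samples a synthetic graph from the fitted model, and reports how well the samples match the original data.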


In [1]:
# imports
import glob
import os
import numpy as np
import pandas as pd
from datetime import datetime, date
from graspologic.datasets import load_drosophila_right
from graspologic.plot import heatmap
from graspologic.utils import binarize, symmetrize
from graspologic.models import DCSBMEstimator
%matplotlib inline

# Switch the working directory two levels up from wherever the kernel started.
# NOTE(review): this is a relative chdir, so re-running this cell in the same
# kernel moves up two MORE levels; restart the kernel before re-running.
# Presumably this makes `tigger_package` importable and lets the relative
# "data/..." paths used further down resolve — confirm against the repo layout.
os.chdir("../..")
print(os.getcwd())  # /Users/tonpoppe/workspace/tigger_adj_rep/tigger_adj

from tigger_package.metrics.distribution_metrics import compare_metrics, compare_metrics2, calculate_model_performance, model_smry_performance

/Users/tonpoppe/workspace/tigger_adj_rep/tigger_adj


  _edge_swap_numba = nb.jit(_edge_swap, nopython=False)


In [3]:

def create_adj_matrix(nodes, edges):
    """Build a dense 0/1 adjacency matrix from a node and an edge dataframe.

    Args:
        nodes: dataframe with an ``id`` column; the position of an id in this
            column is the row/column index of that node in the matrix.
        edges: dataframe with ``start`` and ``end`` columns holding node ids.

    Returns:
        An integer ``numpy`` array of shape ``(len(nodes), len(nodes))`` where
        cell ``[i, j]`` is 1 when an edge from node ``i`` to node ``j`` exists.
        Repeated edges collapse into a single 1.

    Raises:
        KeyError: when an edge refers to an id that is not in ``nodes['id']``.
    """
    # Look-up table: node id -> position in the nodes frame.
    position_of = dict(zip(nodes['id'], range(len(nodes))))

    size = len(nodes)
    adj = np.zeros((size, size), dtype=int)

    # Mark each (start, end) pair; assignment (not increment) keeps the matrix binary.
    for start_id, end_id in edges[['start', 'end']].to_numpy():
        adj[position_of[start_id], position_of[end_id]] = 1

    print(f"len nodes {len(nodes)}, adj shape: {adj.shape} --- len edge {len(edges)}, sum adj {np.sum(adj)}")
    return adj
 
def train_dcsbm(adj, verbose=2):
    """Fit a directed, degree-directed DCSBM without self-loops to ``adj``.

    Args:
        adj: adjacency matrix handed to ``DCSBMEstimator.fit``.
        verbose: when greater than 1, print the fitted block matrix and plot
            the fitted probability matrix as a heatmap.

    Returns:
        The fitted ``DCSBMEstimator``.
    """
    model = DCSBMEstimator(directed=True, degree_directed=True, loops=False)
    model.fit(adj, y=None)

    # Quiet mode: skip the diagnostics and hand back the fitted model.
    if not verbose > 1:
        return model

    print("DCSBM \"B\" matrix:")
    print(model.block_p_)
    heatmap(
        model.p_mat_,
        inner_hier_labels=model.vertex_assignments_,
        font_scale=0.5,
        title="DCSBM probability matrix",
        vmin=0,
        vmax=1,
        sort_nodes=True,
    )
    return model

def sample_dcsbm(dcsbme, verbose=2):
    """Draw a sample from a fitted DCSBM model.

    Args:
        dcsbme: fitted model exposing ``sample()`` and ``vertex_assignments_``.
        verbose: when greater than 1, plot the first sampled graph as a heatmap.

    Returns:
        Whatever ``dcsbme.sample()`` returns; element ``[0]`` is used as the
        sampled adjacency matrix.
    """
    sampled = dcsbme.sample()

    # Quiet mode: no plot.
    if not verbose > 1:
        return sampled

    heatmap(
        sampled[0],
        inner_hier_labels=dcsbme.vertex_assignments_,
        font_scale=0.5,
        title="Sampled adj probability matrix",
        vmin=0,
        vmax=1,
        sort_nodes=True,
    )
    return sampled

def adjacency_to_edge_list(adj_matrix, edges):
    """Convert an adjacency matrix to an edge dataframe with default attribute values.

    Args:
        adj_matrix: 2-D adjacency matrix; every non-zero cell ``[i, j]`` becomes
            one edge ``i -> j``.
        edges: the original edge dataframe. Only its column names are used:
            every column other than ``start`` and ``end`` is treated as an edge
            attribute and recreated with the constant value 0.5.

    Returns:
        A dataframe with columns ``src`` and ``dst`` (row and column index of
        each non-zero cell, in row-major order) followed by one constant 0.5
        column per edge attribute.

    NOTE(review): ``src``/``dst`` are positional matrix indices, not the
    original node ids. They only line up with node ids when the ids equal the
    node positions 0..n-1 — confirm against the caller's data.
    """
    # np.argwhere returns the (row, col) index of every non-zero cell in
    # row-major order, i.e. the same order the nested Python loops produced,
    # but without the O(n^2) interpreter overhead.
    edge_index = np.argwhere(np.asarray(adj_matrix) != 0)
    edges_sampled = pd.DataFrame(edge_index, columns=['src', 'dst'])

    # Add default values for the edge attributes.
    edge_cols = [c for c in edges.columns if c not in ('start', 'end')]
    for col in edge_cols:
        edges_sampled[col] = 0.5

    return edges_sampled
   
def get_stats(res):
    """Summarise raw experiment results.

    Feeds ``res`` through ``calculate_model_performance`` and then through
    ``model_smry_performance`` and returns the resulting summary.
    """
    return model_smry_performance(calculate_model_performance(res))

def run_single_experiment(config_dict, run_nr, verbose=2):
    """Run one fit / sample / evaluate cycle of the DCSBM experiment.

    Args:
        config_dict: needs the keys ``base_path``, ``nodes_path``,
            ``edges_path`` (parquet inputs) and ``res_folder`` (output folder).
        run_nr: run number; used in the name of the stored sample and written
            to the ``run_id`` entry of the result.
        verbose: forwarded to ``train_dcsbm`` and ``sample_dcsbm``.

    Returns:
        The output of ``compare_metrics2`` with ``run_id`` set to ``run_nr``.

    Side effects:
        Writes the sampled edges to ``<res_folder>synth_edges_run_<run_nr>``.
    """
    base_path = config_dict['base_path']
    edges = pd.read_parquet(base_path + config_dict['edges_path'])
    nodes = pd.read_parquet(base_path + config_dict['nodes_path'])

    # original graph -> adjacency matrix -> fitted model -> sampled adjacency
    adjacency = create_adj_matrix(nodes, edges)
    model = train_dcsbm(adjacency, verbose=verbose)
    sampled = sample_dcsbm(model, verbose=verbose)

    # Only the first sampled matrix is converted back into an edge dataframe.
    edges_sampled = adjacency_to_edge_list(sampled[0], edges)
    edges_sampled.to_parquet(config_dict['res_folder'] + f"synth_edges_run_{run_nr}")

    # The original nodes are passed on both sides; only the edges differ.
    comparison = compare_metrics2(nodes, edges, nodes, edges_sampled, name='DCSBM')
    comparison['run_id'] = run_nr
    return comparison
 
def init_res_dataset(config_dict):
    """Prepare the result folder and load earlier results so a run can be resumed.

    Args:
        config_dict: needs the key ``res_folder``; files are located by plain
            string concatenation, so the value must end with a path separator.

    Returns:
        A tuple ``(start_run, res)``. When one or more
        ``combined_raw_results_<YYYYMMDD>.parquet`` files exist in the folder,
        ``res`` is a one-element list holding the dataframe with the most
        recent date stamp and ``start_run`` is its highest ``run_id`` plus one.
        Otherwise ``res`` is an empty list and ``start_run`` is 0.

    Side effects:
        Creates the result folder when it does not exist yet.
    """
    folder = config_dict['res_folder']

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)

    matching_files = glob.glob(folder + "combined_raw_results_*")
    if not matching_files:
        return 0, []

    def _date_stamp(path):
        # Parse the stamp from the file name only: splitting the full path on
        # "." breaks as soon as a directory name contains a dot.
        return int(os.path.basename(path).split(".")[0][-8:])

    max_date = max(_date_stamp(f) for f in matching_files)
    res_path = f"{folder}combined_raw_results_{max_date}.parquet"
    res = [pd.read_parquet(res_path)]
    # Plain int so the value can be used in range() whatever the column dtype is.
    start_run = int(res[0]['run_id'].max()) + 1
    return start_run, res
 
def run_experiment(config_dict, runs=10, verbose=1):
    """Run the DCSBM experiment ``runs`` times, continuing after any stored runs.

    Args:
        config_dict: configuration passed to ``init_res_dataset`` and
            ``run_single_experiment``; ``res_folder`` is where results are stored.
        runs: number of additional runs to execute.
        verbose: verbosity forwarded to every ``run_single_experiment`` call.

    Returns:
        A dataframe concatenating the previously stored results (if any) with
        the results of the new runs. With ``runs=0`` this is just the stored
        results, or an empty dataframe when nothing was stored.

    Side effects:
        After every run the combined results are written to
        ``<res_folder>combined_raw_results_<YYYYMMDD>.parquet`` (today's date),
        so an interrupted experiment keeps its completed runs.
    """
    start_run, res = init_res_dataset(config_dict)

    # Bind the result up front so runs=0 returns the stored results instead of
    # failing on an unbound local.
    total_df = pd.concat(res, axis=0) if res else pd.DataFrame()

    for run_nr in range(start_run, start_run + runs):
        print(f"start processing run {run_nr}")
        # Forward the caller's verbosity; it used to be hard-coded to 1 here.
        res.append(run_single_experiment(config_dict, run_nr=run_nr, verbose=verbose))
        total_df = pd.concat(res, axis=0)
        date_str = date.today().strftime("%Y%m%d")
        total_df.to_parquet(config_dict['res_folder'] + f"combined_raw_results_{date_str}.parquet")
    return total_df

 
# stats = run_single_experiment(config_dict, verbose=1)
# stats 


### Enron

In [3]:

# Enron dataset: input parquet files and the folder collecting the DCSBM results.
config_dict = dict(
    base_path="data/enron/",
    nodes_path="enron_nodes.parquet",
    edges_path="enron_edges.parquet",
    res_folder="data/enron/exp_results/no_labels/dcsbm/",
)
# Ten additional runs, then summarise the combined raw results.
res = run_experiment(config_dict, runs=10, verbose=1)
stats = get_stats(res)

start processing run 0
len nodes 150, adj shape: (150, 150) --- len edge 2689, sum adj 2689




~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 
start processing run 1
len nodes 150, adj shape: (150, 150) --- len edge 2689, sum adj 2689




~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 
start processing run 2
len nodes 150, adj shape: (150, 150) --- len edge 2689, sum adj 2689




~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 
start processing run 3
len nodes 150, adj shape: (150, 150) --- len edge 2689, sum adj 2689




~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 
start processing run 4
len nodes 150, adj shape: (150, 150) --- len edge 2689, sum adj 2689




~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 
start processing run 5
len nodes 150, adj shape: (150, 150) --- len edge 2689, sum adj 2689




~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 
start processing run 6
len nodes 150, adj shape: (150, 150) --- len edge 2689, sum adj 2689




~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 
start processing run 7
len nodes 150, adj shape: (150, 150) --- len edge 2689, sum adj 2689




~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 
start processing run 8
len nodes 150, adj shape: (150, 150) --- len edge 2689, sum adj 2689




~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 
start processing run 9
len nodes 150, adj shape: (150, 150) --- len edge 2689, sum adj 2689




~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 


In [4]:
stats

Unnamed: 0_level_0,value,std
model,DCSBM,DCSBM
type,Unnamed: 1_level_2,Unnamed: 2_level_2
node_attributes,0.0,0.0
edge_attributes,0.293875,2.3935640000000003e-17
dif_cluster_coef,0.053448,0.01099341
Delta_edge_fraction,-0.056266,0.01315637
mean_delta_widget,0.051153,0.001962641
cluster_coef,0.368439,0.01099341


### Erdos

In [4]:
# Erdos dataset: input parquet files and the folder collecting the DCSBM results.
config_dict = dict(
    base_path="data/erdos/",
    nodes_path="erdos_nodes2.parquet",
    edges_path="erdos_edges2.parquet",
    res_folder="data/erdos/exp_results/no_labels/dcsbm/",
)
# Ten additional runs, then summarise the combined raw results.
res = run_experiment(config_dict, runs=10, verbose=1)
stats = get_stats(res)




start processing run 0
len nodes 1000, adj shape: (1000, 1000) --- len edge 9995, sum adj 9995
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 
start processing run 1
len nodes 1000, adj shape: (1000, 1000) --- len edge 9995, sum adj 9995
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/edges_adj.csv -o temp/dir3_edges.html 
~/Downloads/gtrieScanner_src_01/gtrieScanner -s 3 -m gtrie ~/Downloads/gtrieScanner_src_01/gtries/dir3.gt -d -t html -g temp/synth_edges_adj.csv -o temp/dir3_synth_edges.html 
start processing run 2
len nodes 1000, adj shape: (1000, 1000) --- len edge 9995, sum adj 9995
~/Downloads/gtrieScanner_src_01/gtr

In [5]:
stats

Unnamed: 0_level_0,value,std
model,DCSBM,DCSBM
type,Unnamed: 1_level_2,Unnamed: 2_level_2
node_attributes,0.0,0.0
edge_attributes,0.10103,4.488483e-18
dif_cluster_coef,-0.002631,0.0006804661
Delta_edge_fraction,0.00064,0.01010403
mean_delta_widget,0.003783,0.000264625
cluster_coef,0.021981,0.0006804661
