# Imports

In [1]:
import json
import pandas as pd
import torch as th
import networkx as nx
from matplotlib import pyplot as plt
import seaborn as sns
import re
import networkx as nx
import numpy as np
import os
import dgl

from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.dummy import DummyClassifier, DummyRegressor


from ddagl import graph_level_nn, graph_feature_extraction, visualization, evaluation

import karateclub

Using backend: pytorch


# Data Acquisition

In [2]:
# Location of the ODDS dataset: remote source and local cache file.
ODDS_URL = 'https://zenodo.org/record/4633704/files/ODDS.json'
ODDS_PATH = 'datasets/odds/ODDS.json'

# Download the dataset only if no local copy exists yet.
if not os.path.exists(ODDS_PATH):
    from urllib import request
    # urlretrieve does not create missing parent directories, so a fresh
    # checkout without 'datasets/odds/' would otherwise fail here.
    os.makedirs(os.path.dirname(ODDS_PATH), exist_ok=True)
    request.urlretrieve(ODDS_URL, ODDS_PATH)

# Each entry of the JSON 'graphs' list is a node-link dict that is turned
# into a networkx graph; dtype=object keeps the graphs as opaque Series items.
with open(ODDS_PATH, 'r') as fp:
    graphs = pd.Series(list(map(nx.node_link_graph, json.load(fp)['graphs'])), dtype=object)

In [3]:
# Read the graph-level 'id' and 'version' attributes of every graph into
# Series aligned with `graphs`; `groups` is later passed as the grouping key.
groups = graphs.apply(lambda nx_graph: nx_graph.graph['id'])
versions = graphs.apply(lambda nx_graph: nx_graph.graph['version'])

# Setup

## Evaluating Embeddings

We can evaluate how well a feature space or embedding represents workflows that exist in different versions.

Generally speaking, we assume neighboring versions of a workflow to have high similarity and other versions to have low similarity. 

We divide an instance's distances to similar workflows by the distance to neighbors.

In [4]:
def evaluate_embeddings(est, X=graphs, groups=groups, nb_trials:int = 5):
    """Fit ``est`` with several seeds and summarise the embedding scores.

    For every seed in ``range(nb_trials)`` the estimator's ``random_state``
    is overwritten (on the last step if ``est`` is a ``Pipeline``),
    ``est.fit_transform(X, groups)`` is called, and the resulting embeddings
    are scored with ``group_cluster_score`` and ``triplet_ratio_score``.

    Parameters
    ----------
    est : estimator or sklearn Pipeline
        Object exposing ``fit_transform(X, groups)``. It is mutated in place:
        its ``random_state`` is left at the last seed and it stays fitted.
    X : default: the module-level ``graphs`` (bound when this cell is run)
        Input handed to ``fit_transform``.
    groups : default: the module-level ``groups`` (bound when this cell is run)
        Group labels handed to ``fit_transform`` and to both score functions.
    nb_trials : int
        Number of seeds to evaluate.

    Returns
    -------
    pandas.DataFrame
        Rows ``mean`` and ``std`` over the trials for the columns ``gcs``,
        ``trs`` and ``random_seed`` (the seed column is aggregated as well).

    NOTE(review): ``group_cluster_score`` and ``triplet_ratio_score`` are not
    defined or imported in the visible cells - confirm where they come from.
    """
    trial_records = []
    for seed in range(nb_trials):
        # For a pipeline only the final step is seeded.
        seeded_step = est.steps[-1][-1] if isinstance(est, Pipeline) else est
        seeded_step.random_state = seed

        embeddings = est.fit_transform(X, groups)
        trial_records.append({
            'gcs': group_cluster_score(groups, embeddings),
            'trs': triplet_ratio_score(groups, embeddings),
            'random_seed': seed,
        })

    return pd.DataFrame(trial_records).aggregate(('mean', 'std'))

## GCN Default Parameters

In [5]:
# Keyword arguments shared by every GroupedGraphEmbedder built in this notebook.
graph_emb_default_kwargs = {
    'batch_size': 1000,
    'nb_epochs': 50,
    'random_state': 42,
    'negatives_from_batch': True,
    'conv_cls': dgl.nn.TAGConv,
}

# Raw Graph Features

In [7]:
# Fit the graph-level feature extractor (4 parallel jobs) on all graphs and
# keep its output; these raw features are scored directly in the next cell.
glfe = graph_feature_extraction.GraphLevelFeatureExtractor(n_jobs=4)
raw_features = glfe.fit_transform(graphs)

In [8]:
# Score the raw features with both metrics; displayed as a (gcs, trs) tuple.
# NOTE(review): group_cluster_score / triplet_ratio_score are not defined or
# imported in the visible cells - confirm where they come from.
group_cluster_score(groups, raw_features), triplet_ratio_score(groups, raw_features)

(0.7911558472972228, 0.33886920294643724)

# SoTA Approaches

In [14]:
def score_kc_model(emb_fun, nb_trials: int = 5):
    """Print mean and standard deviation of both embedding scores over several seeds.

    Parameters
    ----------
    emb_fun : callable
        Called as ``emb_fun(random_seed)`` for each seed in
        ``range(nb_trials)``. Its return value is scored against the
        module-level ``groups``, so it presumably has to yield one embedding
        per entry of ``groups`` - confirm against the score functions.
    nb_trials : int, default 5
        Number of seeds to evaluate. The default reproduces the previously
        hard-coded five runs.

    Returns
    -------
    None
        The result is printed as
        ``<gcs mean> +- <gcs std> <trs mean> +- <trs std>``, rounded to five
        decimals.

    NOTE(review): ``group_cluster_score`` and ``triplet_ratio_score`` are not
    defined or imported in the visible cells - confirm where they come from.
    """
    gcs_scores, trs_scores = [], []
    for random_seed in range(nb_trials):
        embs = emb_fun(random_seed)
        gcs_scores.append(group_cluster_score(groups, embs))
        trs_scores.append(triplet_ratio_score(groups, embs))

    print(round(np.mean(gcs_scores), 5), '+-', round(np.std(gcs_scores), 5),
          round(np.mean(trs_scores), 5), '+-', round(np.std(trs_scores), 5))

## Graph2Vec

In [15]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from karateclub.utils.treefeatures import WeisfeilerLehmanHashing

def get_g2v_embs(random_seed = None):
    """Compute one Graph2Vec-style embedding per graph in ``relevant_graphs``.

    A karateclub ``Graph2Vec`` instance is created only to reuse its
    hyperparameters and its seeding routine. The graphs are turned into
    Weisfeiler-Lehman feature documents and a gensim ``Doc2Vec`` model is
    trained on them directly.

    Parameters
    ----------
    random_seed : optional
        Seed handed to ``Graph2Vec`` and, through it, to ``Doc2Vec``.

    Returns
    -------
    list
        Document vectors in the order of ``relevant_graphs``.

    NOTE(review): ``relevant_graphs`` is not defined in the visible cells -
    confirm it is set (and aligned with ``groups``) before this cell runs.
    """
    g2v = karateclub.graph_embedding.graph2vec.Graph2Vec(seed = random_seed)
    g2v._set_seed()

    # Phase 1: Weisfeiler-Lehman hashing of every graph.
    wl_hashes = []
    for graph in relevant_graphs:
        wl_hashes.append(WeisfeilerLehmanHashing(graph, g2v.wl_iterations, g2v.attributed,
                                                 g2v.erase_base_features))

    # Phase 2: wrap the WL features as documents tagged with their position.
    tagged_docs = [TaggedDocument(words=wl.get_graph_features(), tags=[str(idx)])
                   for idx, wl in enumerate(wl_hashes)]

    doc2vec_kwargs = dict(vector_size=g2v.dimensions, window=0, min_count=g2v.min_count,
                          dm=0, sample=g2v.down_sampling, workers=g2v.workers,
                          epochs=g2v.epochs, alpha=g2v.learning_rate, seed=g2v.seed)
    model = Doc2Vec(tagged_docs, **doc2vec_kwargs)

    return [model.docvecs[str(idx)] for idx in range(len(tagged_docs))]

score_kc_model(get_g2v_embs)

0.5964 +- 0.00451 0.45299 +- 0.00388


## FeatherGraph

In [16]:
def get_fg_embs(random_seed):
    """Return one FeatherGraph embedding per graph in ``relevant_graphs``.

    ``random_seed`` is passed to karateclub's ``FeatherGraph`` and applied via
    its ``_set_seed``; each graph is then embedded with ``_calculate_feather``.

    NOTE(review): ``relevant_graphs`` is not defined in the visible cells -
    confirm it is set before this cell runs.
    """
    embedder = karateclub.graph_embedding.feathergraph.FeatherGraph(seed = random_seed)
    embedder._set_seed()

    embeddings = []
    for graph in relevant_graphs:
        embeddings.append(embedder._calculate_feather(graph))
    return embeddings

score_kc_model(get_fg_embs)

0.76015 +- 0.0 0.35072 +- 0.0


## GeoScattering

In [17]:
def get_gs_embs(random_seed):
    """Return one GeoScattering embedding per graph in ``relevant_graphs``.

    ``random_seed`` is passed to karateclub's ``GeoScattering`` and applied
    via its ``_set_seed``. All graphs are converted with ``nx.to_undirected``
    before ``_calculate_geoscattering`` is run on each of them.

    NOTE(review): ``relevant_graphs`` is not defined in the visible cells -
    confirm it is set before this cell runs.
    """
    embedder = karateclub.graph_embedding.geoscattering.GeoScattering(seed = random_seed)
    embedder._set_seed()

    # Convert every graph first, then embed, as two separate passes.
    undirected_graphs = [nx.to_undirected(graph) for graph in relevant_graphs]

    embeddings = []
    for graph in undirected_graphs:
        embeddings.append(embedder._calculate_geoscattering(graph))
    return embeddings

score_kc_model(get_gs_embs)

0.6316 +- 0.0 0.33706 +- 0.0


## SF

In [18]:
def get_sf_embs(random_seed):
    """Return one SF embedding per graph in ``relevant_graphs``.

    ``random_seed`` is passed to karateclub's ``SF`` and applied via its
    ``_set_seed``. All graphs are converted with ``nx.to_undirected`` before
    ``_calculate_sf`` is run on each of them.

    NOTE(review): ``relevant_graphs`` is not defined in the visible cells -
    confirm it is set before this cell runs.
    """
    embedder = karateclub.graph_embedding.sf.SF(seed = random_seed)
    embedder._set_seed()

    # Convert every graph first, then embed, as two separate passes.
    undirected_graphs = [nx.to_undirected(graph) for graph in relevant_graphs]

    embeddings = []
    for graph in undirected_graphs:
        embeddings.append(embedder._calculate_sf(graph))
    return embeddings

score_kc_model(get_sf_embs)

0.58091 +- 0.0 0.48793 +- 0.0


## FGSD

In [19]:
def get_fgsd_embs(random_seed):
    """Return one FGSD embedding per graph in ``relevant_graphs``.

    ``random_seed`` is passed to karateclub's ``FGSD`` and applied via its
    ``_set_seed``. All graphs are converted with ``nx.to_undirected`` before
    ``_calculate_fgsd`` is run on each of them.

    NOTE(review): ``relevant_graphs`` is not defined in the visible cells -
    confirm it is set before this cell runs.
    """
    embedder = karateclub.graph_embedding.fgsd.FGSD(seed = random_seed)
    embedder._set_seed()

    # Convert every graph first, then embed, as two separate passes.
    undirected_graphs = [nx.to_undirected(graph) for graph in relevant_graphs]

    embeddings = []
    for graph in undirected_graphs:
        embeddings.append(embedder._calculate_fgsd(graph))
    return embeddings

score_kc_model(get_fgsd_embs)

0.68237 +- 0.0 0.40762 +- 0.0


# Untrained P-GCN embeddings

In [13]:
# Build a two-step pipeline: node-level feature transformation followed by
# the graph embedder configured with the shared default kwargs.
feature_transformer = graph_feature_extraction.NodeLevelFeatureTransformer(use_configs=False)
graph_emb = graph_level_nn.GroupedGraphEmbedder(**graph_emb_default_kwargs, verbose=True)
# Override the nb_epochs=50 default after construction so no training epochs
# are requested (presumably leaving the network at its initial weights -
# confirm in GroupedGraphEmbedder).
graph_emb.nb_epochs = 0
graph_emb_pipeline = make_pipeline(feature_transformer, graph_emb)

In [14]:
# Uses the defaults X=graphs and groups=groups; since a Pipeline is passed,
# only its last step has random_state overwritten for each trial.
evaluate_embeddings(graph_emb_pipeline)

Unnamed: 0,gcs,trs,random_seed
mean,0.883996,0.399522,2.0
std,0.001323,0.006897,1.581139


# Triplet Loss P-GCN Embedding

## Feature Extraction

In [6]:
# Transform the graphs once up front; the embedder cells below reuse X
# instead of running the transformer inside a pipeline.
feature_transformer = graph_feature_extraction.NodeLevelFeatureTransformer(use_configs=False)

X = feature_transformer.fit_transform(graphs)
# Shape of element [1] of the first sample (presumably a per-node feature
# matrix - confirm against NodeLevelFeatureTransformer).
X[0][1].shape

(6, 110)

## Default parameters

In [7]:
# Fresh embedder with the shared default kwargs, evaluated on the precomputed
# features X. evaluate_embeddings refits graph_emb in place for each trial.
graph_emb = graph_level_nn.GroupedGraphEmbedder(verbose=True, **graph_emb_default_kwargs)
evaluate_embeddings(graph_emb, X = X)

epoch 000 | lr 0.01000 | loss 0.09313 | loss_std 0.03746
epoch 001 | lr 0.00900 | loss 0.05785 | loss_std 0.01225
epoch 002 | lr 0.00810 | loss 0.03562 | loss_std 0.01007
epoch 003 | lr 0.00729 | loss 0.04028 | loss_std 0.00430
epoch 004 | lr 0.00656 | loss 0.03493 | loss_std 0.00973
epoch 005 | lr 0.00590 | loss 0.03200 | loss_std 0.01168
epoch 006 | lr 0.00531 | loss 0.03427 | loss_std 0.01402
epoch 007 | lr 0.00478 | loss 0.03210 | loss_std 0.00721
epoch 008 | lr 0.00430 | loss 0.02668 | loss_std 0.00681
epoch 009 | lr 0.00387 | loss 0.02262 | loss_std 0.00480
epoch 010 | lr 0.00349 | loss 0.02329 | loss_std 0.00573
epoch 011 | lr 0.00314 | loss 0.02424 | loss_std 0.00463
epoch 012 | lr 0.00282 | loss 0.02138 | loss_std 0.00435
epoch 013 | lr 0.00254 | loss 0.02298 | loss_std 0.00603
epoch 014 | lr 0.00229 | loss 0.02230 | loss_std 0.00552
epoch 015 | lr 0.00206 | loss 0.01514 | loss_std 0.00294
epoch 016 | lr 0.00185 | loss 0.01564 | loss_std 0.00506
epoch 017 | lr 0.00167 | loss 0

epoch 044 | lr 0.00010 | loss 0.00663 | loss_std 0.00232
epoch 045 | lr 0.00009 | loss 0.00818 | loss_std 0.00209
epoch 046 | lr 0.00008 | loss 0.00596 | loss_std 0.00160
epoch 047 | lr 0.00007 | loss 0.00609 | loss_std 0.00183
epoch 048 | lr 0.00006 | loss 0.00540 | loss_std 0.00200
epoch 049 | lr 0.00006 | loss 0.00550 | loss_std 0.00150
epoch 000 | lr 0.01000 | loss 0.08973 | loss_std 0.03185
epoch 001 | lr 0.00900 | loss 0.04058 | loss_std 0.01663
epoch 002 | lr 0.00810 | loss 0.05346 | loss_std 0.01764
epoch 003 | lr 0.00729 | loss 0.04820 | loss_std 0.01481
epoch 004 | lr 0.00656 | loss 0.03598 | loss_std 0.00674
epoch 005 | lr 0.00590 | loss 0.03181 | loss_std 0.00792
epoch 006 | lr 0.00531 | loss 0.03249 | loss_std 0.01141
epoch 007 | lr 0.00478 | loss 0.02990 | loss_std 0.00581
epoch 008 | lr 0.00430 | loss 0.02617 | loss_std 0.00607
epoch 009 | lr 0.00387 | loss 0.02376 | loss_std 0.00421
epoch 010 | lr 0.00349 | loss 0.02131 | loss_std 0.00701
epoch 011 | lr 0.00314 | loss 0

Unnamed: 0,gcs,trs,random_seed
mean,0.901183,0.113135,2.0
std,0.003752,0.003236,1.581139


## Network Architecture

In [8]:
# Display the network held by the embedder. NOTE(review): relies on graph_emb
# having been fitted in place by the preceding evaluate_embeddings call
# (presumably fit sets model_ - confirm in GroupedGraphEmbedder).
graph_emb.model_

ResGCN(
  (conv_layers_): ModuleList(
    (0): TAGConv(
      (lin): Linear(in_features=330, out_features=128, bias=False)
    )
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): TAGConv(
      (lin): Linear(in_features=384, out_features=128, bias=False)
    )
    (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): TAGConv(
      (lin): Linear(in_features=384, out_features=128, bias=False)
    )
    (7): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU()
    (9): TAGConv(
      (lin): Linear(in_features=384, out_features=128, bias=False)
    )
    (10): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): ReLU()
    (12): TAGConv(
      (lin): Linear(in_features=384, out_features=128, bias=False)
    )
    (13): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_s

## Negatives Sampling Method

In [7]:
# Same setup as the default run, but with negatives_from_batch switched from
# the default True to False after construction, then evaluated on X.
graph_emb = graph_level_nn.GroupedGraphEmbedder(verbose=True, **graph_emb_default_kwargs)
graph_emb.negatives_from_batch = False
evaluate_embeddings(graph_emb, X=X)

epoch 000 | lr 0.01000 | loss 0.10299 | loss_std 0.03922
epoch 001 | lr 0.00900 | loss 0.04560 | loss_std 0.01143
epoch 002 | lr 0.00810 | loss 0.04043 | loss_std 0.00675
epoch 003 | lr 0.00729 | loss 0.02970 | loss_std 0.00896
epoch 004 | lr 0.00656 | loss 0.02855 | loss_std 0.00581
epoch 005 | lr 0.00590 | loss 0.02846 | loss_std 0.00659
epoch 006 | lr 0.00531 | loss 0.02987 | loss_std 0.00628
epoch 007 | lr 0.00478 | loss 0.02410 | loss_std 0.00514
epoch 008 | lr 0.00430 | loss 0.02650 | loss_std 0.00848
epoch 009 | lr 0.00387 | loss 0.02209 | loss_std 0.00533
epoch 010 | lr 0.00349 | loss 0.02587 | loss_std 0.00565
epoch 011 | lr 0.00314 | loss 0.02425 | loss_std 0.00550
epoch 012 | lr 0.00282 | loss 0.02472 | loss_std 0.00829
epoch 013 | lr 0.00254 | loss 0.02039 | loss_std 0.00882
epoch 014 | lr 0.00229 | loss 0.01608 | loss_std 0.00585
epoch 015 | lr 0.00206 | loss 0.01518 | loss_std 0.00422
epoch 016 | lr 0.00185 | loss 0.01362 | loss_std 0.00342
epoch 017 | lr 0.00167 | loss 0

epoch 044 | lr 0.00010 | loss 0.00567 | loss_std 0.00277
epoch 045 | lr 0.00009 | loss 0.00692 | loss_std 0.00281
epoch 046 | lr 0.00008 | loss 0.00496 | loss_std 0.00146
epoch 047 | lr 0.00007 | loss 0.00632 | loss_std 0.00228
epoch 048 | lr 0.00006 | loss 0.00561 | loss_std 0.00230
epoch 049 | lr 0.00006 | loss 0.00500 | loss_std 0.00147
epoch 000 | lr 0.01000 | loss 0.08042 | loss_std 0.02995
epoch 001 | lr 0.00900 | loss 0.04843 | loss_std 0.01860
epoch 002 | lr 0.00810 | loss 0.04790 | loss_std 0.00865
epoch 003 | lr 0.00729 | loss 0.03274 | loss_std 0.00600
epoch 004 | lr 0.00656 | loss 0.03004 | loss_std 0.00260
epoch 005 | lr 0.00590 | loss 0.02439 | loss_std 0.00449
epoch 006 | lr 0.00531 | loss 0.03426 | loss_std 0.00620
epoch 007 | lr 0.00478 | loss 0.03335 | loss_std 0.00952
epoch 008 | lr 0.00430 | loss 0.03178 | loss_std 0.00796
epoch 009 | lr 0.00387 | loss 0.03084 | loss_std 0.00805
epoch 010 | lr 0.00349 | loss 0.02753 | loss_std 0.01121
epoch 011 | lr 0.00314 | loss 0

Unnamed: 0,gcs,trs,random_seed
mean,0.902468,0.110187,2.0
std,0.003663,0.005564,1.581139
