## Source Data

In [None]:
import gdown
import tensorflow as tf
import numpy as np
import os
import random
import networkx as nx
import csv

!mkdir datasets
!pip install pyvis
from pyvis.network import Network

#############################################################################################
# Facebook DATASET DOWNLOADS
#############################################################################################

# Download facebook dataset edgelist in txt format (extracted from mtx)
!gdown https://drive.google.com/uc?id=1v03XWRternGLDpRfKbRGoMiVX3dpOW3G -O datasets/facebook_edges.txt


# Download pretrained facebook HARP embeddings
!gdown https://drive.google.com/uc?id=1415v_BZOgVijs40gPAoIA6j4JZ6f0ogM -O datasets/Facebook_HARP_deepwalk.npy
!gdown https://drive.google.com/uc?id=1V9I72BFDYBv3LNF6DLyK5Coo0XKkU5ex -O datasets/Facebook_HARP_node2vec.npy
!gdown https://drive.google.com/uc?id=1Ziaz4wWqcWPHqgc9gLwxbP6RptO5eSZy -O datasets/Facebook_HARP_line.npy

#############################################################################################
# Youtube DATASET DOWNLOADS
#############################################################################################
# Download youtube dataset edgelist in csv format
!gdown https://drive.google.com/uc?id=12aGrbOZqVMfOP46X8lj5qwqQui4kbMjZ -O datasets/youtube_edges.csv

#############################################################################################
# Douban DATASET DOWNLOADS source: http://datasets.syr.edu/pages/datasets.html
#############################################################################################
# Download douban dataset edgelist in csv format
!gdown https://drive.google.com/uc?id=1ssjgKF5WpiXcIk7DfF6BXwPoWkqr5rOS -O datasets/douban_edges.csv

# FRAMEWORK SETUPS
#############################################################################################
# Graph embeddings framework @see: https://github.com/shenweichen/GraphEmbedding
!gdown https://drive.google.com/uc?id=1QwaC2pz6wC8QGAA1N7208SxEzdPfkn3S -O GraphEmbedding.zip
!unzip GraphEmbedding.zip
# HARP
!gdown https://drive.google.com/uc?id=174k2qDmDhXrKFivGD00jBWAvJp9b1kiq -O HARP.zip
!unzip HARP.zip

Collecting pyvis
  Downloading pyvis-0.1.9-py3-none-any.whl (23 kB)
Collecting jsonpickle>=1.4.1
  Downloading jsonpickle-2.0.0-py2.py3-none-any.whl (37 kB)
Installing collected packages: jsonpickle, pyvis
Successfully installed jsonpickle-2.0.0 pyvis-0.1.9
Downloading...
From: https://drive.google.com/uc?id=1v03XWRternGLDpRfKbRGoMiVX3dpOW3G
To: /content/datasets/facebook_edges.txt
100% 854k/854k [00:00<00:00, 5.22MB/s]
Downloading...
From: https://drive.google.com/uc?id=1415v_BZOgVijs40gPAoIA6j4JZ6f0ogM
To: /content/datasets/Facebook_HARP_deepwalk.npy
100% 2.07M/2.07M [00:00<00:00, 9.66MB/s]
Downloading...
From: https://drive.google.com/uc?id=1V9I72BFDYBv3LNF6DLyK5Coo0XKkU5ex
To: /content/datasets/Facebook_HARP_node2vec.npy
100% 2.07M/2.07M [00:00<00:00, 9.69MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Ziaz4wWqcWPHqgc9gLwxbP6RptO5eSZy
To: /content/datasets/Facebook_HARP_line.npy
100% 2.07M/2.07M [00:00<00:00, 9.66MB/s]
Downloading...
From: https://drive.google.com/uc?id=

## Set Flags and Seed

In [None]:
# Notebook-level configuration flags.
dataset = "facebook" # possible values: facebook, douban, youtube
is_new_data_split = True # sets whether to create new data and train set or rely on existing files which were downloaded before
landmark_technique = "random" # possible values: random, coarsening, community_detection
embedding_method = "node2vec" # possible values: node2vec, struc2vec, deepwalk, line, sdne, harp_node2vec, harp_deepwalk, harp_line

# Setting different seeds for reproducibility
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so assigning it
# here presumably only affects child processes, not this kernel — confirm if hash-order
# determinism matters.
seed_value= 122
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

## Preprocess Data

### Create graph

In [None]:
def read_data(dataset):
  """Load the edge list of the selected dataset into a NetworkX graph.

  The facebook edge list is read as-is (node labels stay strings). The csv
  edge lists (douban, youtube) are read with integer labels, renumbered to
  0..n-1 in sorted order and then converted to strings, so every dataset ends
  up with string node labels.

  Args:
    dataset: One of "facebook", "douban" or "youtube".

  Returns:
    Tuple (G, nodes, edges, num_nodes, num_edges).

  Raises:
    ValueError: If `dataset` is not one of the supported names.
  """
  csv_edge_lists = {
      "douban": './datasets/douban_edges.csv',
      "youtube": './datasets/youtube_edges.csv',
  }

  if dataset == "facebook":
      G = nx.read_edgelist("./datasets/facebook_edges.txt")
  elif dataset in csv_edge_lists:
      G = nx.read_edgelist(csv_edge_lists[dataset], delimiter=',', nodetype=int, encoding="utf-8")
      G = nx.relabel.convert_node_labels_to_integers(G, first_label=0, ordering="sorted")
      G = nx.relabel.relabel_nodes(G, {v: str(v) for v in G.nodes()})
  else:
      # Previously an unknown name fell through and crashed later on an unbound G.
      raise ValueError(
          "Unknown dataset %r; expected one of: facebook, douban, youtube" % (dataset,))

  nodes = list(G.nodes())
  edges = list(G.edges())
  num_nodes = len(nodes)
  num_edges = len(edges)
  print("Number of nodes", num_nodes)
  print("Number of edges", num_edges)
  return G, nodes, edges, num_nodes, num_edges

G, nodes, edges, num_nodes, num_edges = read_data(dataset)

Number of nodes 4039
Number of edges 88234


### Split Train-Test

#### Landmark Selection

In [None]:
from networkx.algorithms.community.modularity_max import greedy_modularity_communities
from HARP.src.harp import run_coarsening, train_embedding

def select_landmarks(num_nodes, landmark_technique,  nodes, is_new_data_split, graph=None,
                     edgelist_path="../datasets/facebook_edges.txt"):
  """Pick training (k1) and testing (k2) landmark nodes.

  Args:
    num_nodes: Number of nodes in the graph. Graphs with more than 10000 nodes
      use 3 training / 1 testing landmarks, smaller ones 100 / 11.
    landmark_technique: "random", "coarsening" or "community_detection".
    nodes: List of node labels to draw landmarks from.
    is_new_data_split: When falsy nothing is selected and None is returned.
    graph: Graph used for community detection; defaults to the notebook-level G.
    edgelist_path: Edge list handed to HARP's run_coarsening. It is resolved
      from inside the HARP directory, hence the leading "..".

  Returns:
    (k1_nodes, k2_nodes, remaining_nodes_train, remaining_nodes_test), or None
    when `is_new_data_split` is falsy.

  Raises:
    ValueError: For an unknown technique, or when no coarsening level contains
      enough nodes to draw the training landmarks from.
  """
  if not is_new_data_split:
    # An existing split is reused, so there is nothing to select.
    return None

  print("#### LANDMARK SELECTION ####")

  # Select number of landmark nodes
  if (num_nodes>10000):
    k1, k2 = 3, 1
  else:
    k1, k2 = 100, 11

  # Select training landmarks
  if landmark_technique == "random":
    print("Selecting landmarks randomly...")
    k1_nodes = random.sample(nodes,k1)
  elif landmark_technique == "coarsening":
    print("Coarsening graph...")
    # The original cell used `%cd HARP` / `%cd ..`; os.chdir with try/finally does the
    # same in plain Python and restores the working directory even if coarsening fails.
    previous_dir = os.getcwd()
    os.chdir("HARP")
    try:
      recursive_node_associations = run_coarsening(edgelist_path, None, "edgelist")[1]
    finally:
      os.chdir(previous_dir)

    k1_nodes = None
    # Walk the levels in reverse and take the first one that is large enough.
    for rec_level in recursive_node_associations[::-1]:
      rec_l = list(rec_level.keys())
      if len(rec_l) >= k1:
        print("Selecting landmarks as coarsened graph...")
        k1_nodes = [str(node) for node in random.sample(rec_l, k1)]
        break
    if k1_nodes is None:
      raise ValueError("No coarsening level contains at least %d nodes" % k1)
  elif landmark_technique == "community_detection":
    if graph is None:
      graph = G  # notebook-level graph, as in the original implementation
    communities = list(greedy_modularity_communities(graph))
    k1_nodes = []
    print("Clustering into communities...")
    print("number of communities: ", len(communities))
    for community in communities:
      # Each community contributes its share of the graph in percent, at least one node.
      percentage = int((len(community)/num_nodes)*100)
      if percentage==0:
        percentage = 1
      # random.sample no longer accepts sets (Python 3.11+); sorting also makes the
      # draw independent of set iteration order.
      k1_nodes.extend(random.sample(sorted(community), percentage))
  else:
    raise ValueError(
        "Unknown landmark_technique %r; expected random, coarsening or community_detection"
        % (landmark_technique,))

  print("Number of training landmark nodes:",len(k1_nodes))
  # Keep the order of `nodes` so the seeded sampling below is reproducible.
  k1_set = set(k1_nodes)
  remaining_nodes_train = [node for node in nodes if node not in k1_set]
  print("number of nodes except training (landmark) nodes", len(remaining_nodes_train))

  # Select testing landmarks
  k2_nodes = random.sample(remaining_nodes_train,k2)

  print("Number of testing landmark nodes:",len(k2_nodes))
  k2_set = set(k2_nodes)
  remaining_nodes_test = [node for node in nodes if node not in k2_set]
  print("number of nodes except testing (landmark) nodes", len(remaining_nodes_test))
  return k1_nodes, k2_nodes, remaining_nodes_train, remaining_nodes_test

#k1_nodes, k2_nodes, remaining_nodes_train, remaining_nodes_test = select_landmarks(num_nodes, "coarsening", nodes, is_new_data_split) 

#### Create train set

In [None]:
# Create and save new train set
def generate_and_save_train_data(G, k1_nodes, is_new_data_split, remaining_nodes=None,
                                 output_path='./datasets/train.txt', min_length=2, max_length=6):
  """Build the landmark-to-node distance pairs and write them to the train file.

  For every landmark u and every target v that is reachable from u, each node
  on the shortest path u -> v is recorded as (u, node, distance from u). Only
  pairs whose distance lies in [min_length, max_length] are written, one
  "source target length" triple per line.

  Args:
    G: Graph the shortest paths are computed on.
    k1_nodes: Training landmark nodes.
    is_new_data_split: When falsy the function does nothing.
    remaining_nodes: Target nodes; defaults to the notebook-level
      `remaining_nodes_train`.
    output_path: File the pairs are written to.
    min_length: Smallest path length kept in the file.
    max_length: Largest path length kept in the file.
  """
  if not is_new_data_split:
    return
  if remaining_nodes is None:
    remaining_nodes = remaining_nodes_train  # notebook-level variable, as before

  train_set = []
  for u in k1_nodes:
    for v in remaining_nodes:
      # One search per pair: shortest_path itself signals unreachable targets,
      # so a separate has_path call is not needed.
      try:
        shortest_path = nx.shortest_path(G, u, v)
      except nx.NetworkXNoPath:
        continue
      for length, target in enumerate(shortest_path[1:], start=1):
        train_set.append((shortest_path[0], target, length))

  print("Size of total training set before omission:",len(train_set))

  # The context manager closes the file even if a write fails.
  with open(output_path, 'w') as f_train:
    for source, target, length in train_set:
      if min_length <= length <= max_length:
        f_train.write(str(source)+' '+str(target)+' '+str(length))
        f_train.write('\n')
  print("Train file written")

# Example call, left commented out like the other usage hints in this notebook: k1_nodes
# only exists after select_landmarks has run (the "Running tests" cell does both).
#generate_and_save_train_data(G, k1_nodes, is_new_data_split)
# Load train set from txt
def load_training_data(path="./datasets/train.txt"):
  """Read the training pairs written by generate_and_save_train_data.

  Each non-empty line holds three whitespace-separated integers:
  source node, target node and shortest-path length.

  Args:
    path: Location of the train file.

  Returns:
    Tuple (train_list1, train_list2, y_train) of equally long lists holding the
    source nodes, target nodes and path lengths.
  """
  train_list1 = []
  train_list2 = []
  y_train = []
  # The context manager closes the file even if a line fails to parse.
  with open(path, 'r') as f_train:
    for line in f_train:
      fields = line.split()
      if not fields:
        continue  # tolerate blank lines, e.g. a trailing newline
      train_list1.append(int(fields[0]))
      train_list2.append(int(fields[1]))
      y_train.append(int(fields[2]))
  print("Number of training pairs:", len(y_train))
  lengths, counts = np.unique(np.array(y_train), return_counts=True)
  print("Path Lengths in train set: ",lengths," Size of each length: ", counts)
  return train_list1, train_list2, y_train
#train_list1, train_list2, y_train = load_training_data()

NameError: ignored

#### Create test set

In [None]:
test_set = []
# Create and save new test split
def generate_and_save_test_data(G, k2_nodes, is_new_data_split, remaining_nodes=None,
                                output_path="./datasets/test.txt", min_length=2, max_length=6):
  """Build the landmark-to-node distance pairs and write them to the test file.

  For every testing landmark u and every target v reachable from u, each node
  on the shortest path u -> v is recorded as (u, node, distance from u). Only
  pairs whose distance lies in [min_length, max_length] are written, one
  "source target length" triple per line. The notebook-level `test_set` is
  rebound to the pairs of this call.

  Args:
    G: Graph the shortest paths are computed on.
    k2_nodes: Testing landmark nodes.
    is_new_data_split: When falsy the function does nothing.
    remaining_nodes: Target nodes; defaults to the notebook-level
      `remaining_nodes_test`.
    output_path: File the pairs are written to.
    min_length: Smallest path length kept in the file.
    max_length: Largest path length kept in the file.
  """
  global test_set
  if not is_new_data_split:
    return
  if remaining_nodes is None:
    remaining_nodes = remaining_nodes_test  # notebook-level variable, as before

  # Start from an empty list on every call: appending to the module-level list made
  # each later landmark run also write the pairs of all earlier runs.
  pairs = []
  for u in k2_nodes:
    for v in remaining_nodes:
      # One search per pair: shortest_path itself signals unreachable targets.
      try:
        shortest_path = nx.shortest_path(G,u,v)
      except nx.NetworkXNoPath:
        continue
      for length, target in enumerate(shortest_path[1:], start=1):
        pairs.append((shortest_path[0], target, length))
  test_set = pairs
  print("Size of total test set before omission:",len(test_set))

  # The context manager closes the file even if a write fails.
  with open(output_path, 'w') as f_test:
    for source, target, length in test_set:
      if min_length <= length <= max_length:
        f_test.write(str(source)+' '+ str(target) +' '+ str(length))
        f_test.write('\n')
  print("Test file written")
#generate_and_save_test_data(G, k2_nodes, is_new_data_split)

def load_testing_data(path='./datasets/test.txt'):
  """Read the testing pairs written by generate_and_save_test_data.

  Each non-empty line holds three whitespace-separated integers:
  source node, target node and shortest-path length.

  Args:
    path: Location of the test file.

  Returns:
    Tuple (test_list1, test_list2, y_test) of equally long lists holding the
    source nodes, target nodes and path lengths.
  """
  test_list1 = []
  test_list2 = []
  y_test = []
  # The context manager closes the file even if a line fails to parse.
  with open(path, 'r') as f_test:
    for line in f_test:
      fields = line.split()
      if not fields:
        continue  # tolerate blank lines, e.g. a trailing newline
      test_list1.append(int(fields[0]))
      test_list2.append(int(fields[1]))
      y_test.append(int(fields[2]))
  print("Number of testing pairs:", len(y_test))
  lengths, counts = np.unique(np.array(y_test), return_counts=True)
  print("Path Lengths in test set: ",lengths," Size of each length: ", counts)
  return test_list1, test_list2, y_test
#test_list1, test_list2, y_test = load_testing_data()

### Visualize graph with landmarks

In [None]:
# Needs download from colab and reopen local
def visualise_graph_with_landmarks(G, k1_nodes, landmark_technique):
  """Save an HTML rendering of G in which the landmark nodes are red boxes.

  The output file is named 'facebook_landmarks_128_<landmark_technique>.html'.
  """
  net = Network(notebook=True)
  net.from_nx(G)
  # Coloring landmark nodes
  for landmark in k1_nodes:
    node_attributes = net.node_map[str(landmark)]
    node_attributes['shape'] = 'box'
    node_attributes['color'] = 'red'
  output_file = 'facebook_landmarks_128_'+landmark_technique+'.html'
  net.save_graph(output_file)

## Create Embeddings

#### Train embeddings

In [None]:
from GraphEmbedding.ge.models import Node2Vec, DeepWalk, SDNE, Struc2Vec, LINE
from collections import OrderedDict

def _embedding_dict_to_matrix(embedding_vectors):
  """Turn a {node label: vector} dict into a matrix whose row i belongs to node str(i)."""
  max_id = max(int(node_id) for node_id in embedding_vectors.keys())
  sample = np.asarray(next(iter(embedding_vectors.values())))
  # Rows of ids that have no embedding stay zero.
  matrix = np.zeros((max_id + 1,) + sample.shape, dtype=sample.dtype)
  for node_id, vector in embedding_vectors.items():
    matrix[int(node_id)] = vector
  return matrix


def train_embedding(G, params=None, embedding_method="node2vec"):
  """Train (or load) node embeddings and return them as a matrix indexed by node id.

  Args:
    G: Graph to embed; ignored by the pretrained "harp_*" methods.
    params: Optional overrides. Recognised keys: 'p', 'q', 'window_size'
      (node2vec), 'layer_config' (sdne), 'order' (line). Missing keys fall
      back to the defaults used below.
    embedding_method: One of node2vec, deepwalk, sdne, struc2vec, line,
      harp_deepwalk, harp_node2vec, harp_line.

  Returns:
    numpy array of embeddings. For the trained methods row i holds the
    embedding of node str(i); the harp_* arrays are returned as loaded.

  Raises:
    ValueError: If `embedding_method` is not supported.
  """
  params = params or {}
  print("Params for Embeddings: ", params, " with embedding type: ", embedding_method)

  pretrained_harp_files = {
      "harp_deepwalk": "./datasets/Facebook_HARP_deepwalk.npy",
      "harp_node2vec": "./datasets/Facebook_HARP_node2vec.npy",
      "harp_line": "./datasets/Facebook_HARP_line.npy",
  }
  if embedding_method in pretrained_harp_files:
    return np.load(pretrained_harp_files[embedding_method])

  if embedding_method == "node2vec":
    q = params.get('q', 1)
    p = params.get('p', 1)
    window_size = params.get('window_size', 5)
    model = Node2Vec(G, 80, 10, q=q, p=p) # default params from paper implementation
    model.train(window_size = window_size)
  elif embedding_method == "deepwalk":
    model = DeepWalk(G, 80, 40) # parameters as in the paper
    model.train()
  elif embedding_method == "sdne":
    # same hidden sizes as in paper for arxiv GR-QC as it has similar number of nodes as facebook
    layer_config = params.get("layer_config", [400, 128])
    model = SDNE(G, hidden_size=layer_config)
    model.train( epochs=40, verbose=0)
  elif embedding_method == "struc2vec":
    model = Struc2Vec(G)
    model.train()
  elif embedding_method == "line":
    order = params.get("order", "second")
    model = LINE(G, embedding_size= 128, order=order)
    model.train(epochs=50, verbose=0)
  else:
    # Previously an unknown name crashed later on an unbound variable.
    raise ValueError("Unknown embedding_method %r" % (embedding_method,))

  # The old loop ran over range(max_id) and so never realigned the row of the
  # highest node id; the helper places every vector at its own index.
  return _embedding_dict_to_matrix(model.get_embeddings())


### Preprocess embeddings

In [None]:
def _combine_pairs(embedding_vectors, nodes1, nodes2, con_type):
  """Combine the embeddings of every (nodes1[i], nodes2[i]) pair with the chosen operator."""
  combined = []
  for i in range(len(nodes1)):
    first = embedding_vectors[nodes1[i]]
    second = embedding_vectors[nodes2[i]]
    if con_type == "avg":
      con_x = (first + second)/2
    elif con_type == "concat":
      con_x = np.concatenate([first, second], axis=0)
    elif con_type == "hadamard":
      con_x = first * second
    elif con_type == "sub":
      con_x = first - second
    else:  # "conv": keep both vectors, they are stacked and reshaped by the caller
      con_x = [first, second]
    combined.append(con_x)
  return combined


def preprocess_embeddings(embedding_vectors, train_list1, train_list2, test_list1, test_list2, emb_size = 128, con_type = "avg",
                          train_labels=None, test_labels=None):
  """Turn node pairs into model inputs by combining their embeddings.

  Args:
    embedding_vectors: Embeddings indexable by the node ids in the lists.
    train_list1, train_list2: Source / target nodes of the training pairs.
    test_list1, test_list2: Source / target nodes of the testing pairs.
    emb_size: Embedding width, used to reshape the "conv" inputs.
    con_type: Pair operator: "avg", "concat", "hadamard", "sub" or "conv".
    train_labels: Training targets; defaults to the notebook-level `y_train`.
    test_labels: Testing targets; defaults to the notebook-level `y_test`.

  Returns:
    List [X_train, y_train, X_test, y_test] of numpy arrays. For "conv" the
    X arrays are reshaped to (-1, 2, emb_size, 1).

  Raises:
    ValueError: If `con_type` is not supported.
  """
  if con_type not in ("avg", "concat", "hadamard", "sub", "conv"):
    # Previously an unknown operator reused a stale value or crashed on an unbound one.
    raise ValueError("Unknown con_type %r" % (con_type,))
  if train_labels is None:
    train_labels = y_train  # notebook-level variable, as before
  if test_labels is None:
    test_labels = y_test  # notebook-level variable, as before

  X_train_binary = _combine_pairs(embedding_vectors, train_list1, train_list2, con_type)
  X_test_binary = _combine_pairs(embedding_vectors, test_list1, test_list2, con_type)

  print("Embedded training set size:", len(X_train_binary))
  print("Embedded test set size:", len(X_test_binary))

  train_test_split = [np.array(X_train_binary), np.array(train_labels), np.array(X_test_binary), np.array(test_labels)]
  if con_type == "conv":
    train_test_split[0] = train_test_split[0].reshape(-1, 2, emb_size, 1)
    train_test_split[2] = train_test_split[2].reshape(-1, 2, emb_size, 1)

  print("Train test split summary: \n", "Train X shape:", train_test_split[0].shape\
        ,"Train y shape:", train_test_split[1].shape,
        "\n Test X shape:", train_test_split[2].shape,
        "Test y shape:", train_test_split[3].shape)
  return train_test_split

# Shuffle for train data before training
def shuffle_in_unison(a, b):
    """Shuffle `a` and `b` in place using the same NumPy global RNG state for both.

    The RNG state is captured before shuffling `a` and restored before
    shuffling `b`, so both shuffles draw from the same random stream.
    Nothing is returned.
    """
    # Remember the generator state that the first shuffle starts from.
    rng_state = np.random.get_state()
    np.random.shuffle(a)
    # Rewind the generator so the second shuffle replays the same random draws.
    # NOTE(review): rows stay paired only if a and b have the same length — confirm at call sites.
    np.random.set_state(rng_state)
    np.random.shuffle(b)

# Models

## MLP Regression

In [None]:
import tensorflow.keras as keras 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D,BatchNormalization,Dense, Dropout, Activation, Flatten
from tensorflow.keras.optimizers import SGD

def create_regression_model(input_dim, emb_size=128):
  """Build and compile the MLP that regresses shortest-path length.

  Architecture: Dense(emb_size) -> ReLU -> Dropout(0.4) ->
  Dense(int(0.2 * input_dim)) -> ReLU -> Dropout(0.4) -> Dense(1) -> softplus.

  Args:
    input_dim: Width of the input feature vector.
    emb_size: Width of the first hidden layer.

  Returns:
    The compiled model (MSE loss, MAE metric, SGD with learning rate 0.05
    and momentum 0.2).
  """
  model = Sequential()
  model.add(Dense(emb_size, input_dim=input_dim))
  model.add(Activation('relu'))
  model.add(Dropout(0.4))

  dense_size= int(0.2*input_dim)
  model.add(Dense(dense_size))
  model.add(Activation('relu'))
  model.add(Dropout(0.4))

  model.add(Dense(1))
  model.add(Activation('softplus'))

  opt=SGD(learning_rate=0.05, momentum=0.2)
  # Pass the configured optimizer: the string 'sgd' silently used Keras' default SGD
  # and left `opt` unused, unlike create_regression_conv_model.
  model.compile(loss='mse', optimizer=opt, metrics=['mae'])
  #model.summary()
  return model

## MLP Regression with Conv Concat

In [None]:
def create_regression_conv_model(emb_size=128):
  """Build and compile the regressor that merges an embedding pair with a (2,1) convolution.

  Input shape is (2, emb_size, 1). A single non-negative, bias-free Conv2D
  filter combines the two embeddings; the flattened result feeds the same MLP
  head as the plain regression model. Compiled with MSE loss, MAE metric and
  SGD (learning rate 0.05, momentum 0.01).
  """
  hidden_size = int(0.2*emb_size)

  layer_stack = [
      Conv2D(1, (2,1),kernel_constraint="non_neg",use_bias=False,input_shape=(2,emb_size,1)),
      Flatten(),
      # first hidden block
      Dense(emb_size),
      Activation('relu'),
      Dropout(0.4),
      # second hidden block
      Dense(hidden_size),
      Activation('relu'),
      Dropout(0.4),
      # output: one non-negative value
      Dense(1),
      Activation('softplus'),
  ]

  model = Sequential()
  for layer in layer_stack:
    model.add(layer)

  optimizer = SGD(learning_rate=0.05, momentum=0.01)
  model.compile(loss='mse', optimizer=optimizer, metrics=['mae'])
  return model

# Train Model

In [None]:
import keras.callbacks as cb
from keras.callbacks import LambdaCallback

def train_regressor(model, X_train, y_train, epochs=15, batch_size = 32, validation_split=0.3, verbose=2):
  """Fit `model` and return the callback that recorded the per-batch losses.

  Args:
    model: Compiled model exposing `fit`.
    X_train: Training inputs.
    y_train: Training targets.
    epochs: Number of training epochs.
    batch_size: Mini-batch size.
    validation_split: Fraction of the training data passed to `fit` as validation split.
    verbose: Verbosity passed to `fit`.

  Returns:
    The LossHistory callback; its `losses` list holds the 'loss' value logged
    at the end of every batch. (Previously the history was collected but thrown away.)
  """
  class LossHistory(cb.Callback):
      def on_train_begin(self, logs=None):
          self.losses = []

      def on_batch_end(self, batch, logs=None):
          # logs may be None; avoid a shared mutable default argument.
          batch_loss = (logs or {}).get('loss')
          self.losses.append(batch_loss)
  history = LossHistory()
  model.fit(X_train, y_train, epochs= epochs, batch_size= batch_size, callbacks=[history],
            validation_split= validation_split, verbose=verbose)
  return history

# Test Model

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import matplotlib.pyplot as plt

def test_model(model,X_train, X_test, y_test):
  preds = model.predict(X_test)
  pred = []

  for i in range(len(preds)):
    pred.append(round(float(preds[i][0])))

  rmse = (mean_squared_error(y_test, pred))**0.5
  mae = mean_absolute_error(y_test, pred)
  print("RMSE:", rmse, "MAE:", mae, "\nSummary:", "\nNodes:", len(G.nodes()),
        "Edges:", len(G.edges()), "Train size:", len(X_train), "Test size:", len(X_test))
  
  accuracy = accuracy_score(y_test,pred)
  print('Accuracy: %f' % accuracy)
  print('Paths: \t\t',np.unique(y_test))
  precision = precision_score(y_test,pred,average=None)
  print('Precision: \t' , precision)
  recall = recall_score(y_test,pred,average=None)
  print('Recall: \t' , recall)
  f1 = f1_score(y_test,pred,average=None)
  print('F1 score: \t', f1)

  return rmse, mae, accuracy, precision, recall, f1

def test_model_detailed(model, X_test, y_test):
  preds = model.predict(X_test)
  pred = []

  indexes_2 = []
  indexes_3 = []
  indexes_4 = []
  indexes_5 = []
  indexes_6 = []
  y_test_2 = []
  y_test_3 = []
  y_test_4 = []
  y_test_5 = []
  y_test_6 = []

  pred_2 = []
  pred_3 = []
  pred_4 = []
  pred_5 = []
  pred_6 = []

  for i in range(len(y_test)):
      if y_test[i]==2:
          indexes_2.append(i)
          y_test_2.append(y_test[i])
      if y_test[i]==3:
          indexes_3.append(i)
          y_test_3.append(y_test[i])
      if y_test[i]==4:
          indexes_4.append(i)
          y_test_4.append(y_test[i])
      if y_test[i]==5:
          indexes_5.append(i)
          y_test_5.append(y_test[i])
      if y_test[i]==6:
          indexes_6.append(i)
          y_test_6.append(y_test[i])

  for i in range(len(y_test_2)):
    pred_2.append(round(float(preds[indexes_2[i]][0])))

  for i in range(len(y_test_3)):
    pred_3.append(round(float(preds[indexes_3[i]][0])))

  for i in range(len(y_test_4)):
    pred_4.append(round(float(preds[indexes_4[i]][0])))

  for i in range(len(y_test_5)):
    pred_5.append(round(float(preds[indexes_5[i]][0])))

  for i in range(len(y_test_6)):
    pred_6.append(round(float(preds[indexes_6[i]][0])))

  rmse2 = (mean_squared_error(y_test_2, pred_2))**0.5
  mae2 = mean_absolute_error(y_test_2, pred_2)

  rmse3 = (mean_squared_error(y_test_3, pred_3))**0.5
  mae3 = mean_absolute_error(y_test_3, pred_3)

  rmse4 = (mean_squared_error(y_test_4, pred_4))**0.5
  mae4 = mean_absolute_error(y_test_4, pred_4)

  rmse5 = (mean_squared_error(y_test_5, pred_5))**0.5
  mae5 = mean_absolute_error(y_test_5, pred_5)

  rmse6 = (mean_squared_error(y_test_6, pred_6))**0.5
  mae6 = mean_absolute_error(y_test_6, pred_6)

  print("RMSE at length 2:", rmse2, "MAE at length 2:", mae2)
  print("RMSE at length 3:", rmse3, "MAE at length 3:", mae3)
  print("RMSE at length 4:", rmse4, "MAE at length 4:", mae4)
  print("RMSE at length 5:", rmse5, "MAE at length 5:", mae5)
  print("RMSE at length 6:", rmse6, "MAE at length 6:", mae6)
  return rmse2, mae2, rmse3, mae3, rmse4, mae4, rmse5, mae5, rmse6, mae6

def plot_test_losses(model, X_test, y_test):
    """Draw a grouped bar chart of RMSE and MAE for path lengths 2 to 6.

    The per-length metrics come from test_model_detailed.
    """
    (rmse2, mae2, rmse3, mae3, rmse4,
     mae4, rmse5, mae5, rmse6, mae6) = test_model_detailed(model, X_test, y_test)

    tick_labels = ['SP2', 'SP3', 'SP4', 'SP5', 'SP6']
    bar_width = 0.35  # the width of the bars
    positions = np.arange(len(tick_labels))  # the label locations

    fig, ax = plt.subplots()
    # One bar series per metric, placed left and right of each tick.
    bar_series = (
        (-bar_width/2, [rmse2, rmse3, rmse4, rmse5, rmse6], 'RMSE'),
        (bar_width/2, [mae2, mae3, mae4, mae5, mae6], 'MAE'),
    )
    for offset, values, series_name in bar_series:
        ax.bar(positions + offset, values, bar_width, label=series_name)

    # Axis label, title and custom x-axis tick labels.
    ax.set_ylabel('Loss values')
    ax.set_title('Losses by shortest path distance')
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)
    ax.legend()

    fig.tight_layout()

    plt.show()

### Train test loop

In [None]:
def train_test_loop_optim(emb_method, emb_size, epochs, emb_params):
  """Train embeddings with the given parameters and evaluate them with the conv regressor.

  Relies on the notebook-level G, train_list1, train_list2, test_list1 and
  test_list2.

  Args:
    emb_method: Embedding method name passed to train_embedding.
    emb_size: Embedding width, used to shape the inputs and the model.
    epochs: Number of epochs the regressor is trained for.
    emb_params: Parameter dict passed to train_embedding.
  """
  embedding_vectors = train_embedding(G, embedding_method= emb_method, params = emb_params)

  # Test with the convolution binary operator
  print("Embedding method: ", emb_method, "Binary operator: conv")
  X_train, y_train, X_test, y_test = preprocess_embeddings(embedding_vectors, train_list1, train_list2, test_list1, test_list2, emb_size,"conv")
  shuffle_in_unison(X_train, y_train)
  print("\n")
  # Use the requested width: it was hard-coded to 128 here, while the inputs above are
  # already reshaped with emb_size.
  model = create_regression_conv_model(emb_size =emb_size)
  print("\n")
  train_regressor(model, X_train, y_train, epochs=epochs)
  print("\n")
  test_model(model, X_train, X_test, y_test)


In [None]:
def train_test_loop(emb_method, emb_size, epochs, emb_params ):
  """Train one embedding and evaluate it with the avg, concat and conv pair operators.

  Each operator gets its own freshly built regressor, which is trained,
  tested and plotted. Relies on the notebook-level G, train_list1,
  train_list2, test_list1 and test_list2.
  """
  print("#### TRAIN TEST LOOP")
  # Create Embeddings
  embedding_vectors = train_embedding(G, embedding_method= emb_method, params = emb_params)

  # (operator, banner) in evaluation order; conv uses the convolutional regressor
  stages = (
      ("avg", "Binary operator: avg"),
      ("concat", "Binary operator: concat"),
      ("conv", "Binary operator: conv"),
  )
  for operator, banner in stages:
    print("Embedding method: ", emb_method, banner)
    X_train, y_train, X_test, y_test = preprocess_embeddings(embedding_vectors, train_list1, train_list2, test_list1, test_list2, emb_size, operator)
    shuffle_in_unison(X_train, y_train)
    print("\n")
    if operator == "conv":
      model = create_regression_conv_model(emb_size =emb_size)
    else:
      model = create_regression_model(X_train.shape[1])
    print("\n")
    train_regressor(model, X_train, y_train, epochs=epochs)
    print("\n")
    test_model(model, X_train, X_test, y_test)
    plot_test_losses(model, X_test, y_test)

### Running tests

In [None]:
# Run every landmark technique against every embedding technique.
landmark_techniques = ["random", "community_detection", "coarsening"]
embedding_techniques = ["node2vec", "harp_node2vec","sdne", "line", "harp_line"]
for landmark_technique in landmark_techniques:
  # The "random" run reuses an existing split, but only if both files are really there.
  # On a fresh kernel nothing has created them yet, so fall back to generating a new
  # split instead of failing in load_training_data.
  split_files_exist = os.path.exists("./datasets/train.txt") and os.path.exists("./datasets/test.txt")
  is_new_data_split = landmark_technique != "random" or not split_files_exist
  print("### LANDMARK RUN", landmark_technique, "###")
  G, nodes, edges, num_nodes, num_edges = read_data(dataset)
  if is_new_data_split:
    k1_nodes, k2_nodes, remaining_nodes_train, remaining_nodes_test = select_landmarks(num_nodes, landmark_technique, nodes, is_new_data_split)
    visualise_graph_with_landmarks(G, k1_nodes, landmark_technique)
    generate_and_save_train_data(G, k1_nodes, is_new_data_split)
    generate_and_save_test_data(G, k2_nodes, is_new_data_split)

  train_list1, train_list2, y_train = load_training_data()
  test_list1, test_list2, y_test = load_testing_data()
  
  for embedding_technique in embedding_techniques:
    print("### EMBEDDING RUN", embedding_technique, "###")
    train_test_loop(emb_method = embedding_technique, emb_size = 128, epochs= 15, emb_params = {}) # No embeddings params set as default values are the optimised ones


### Embedding Optimization

In [None]:
def optimize_embeddings(): # requires to have a num_nodes variable before and also a train_test_loop method as defined above
  """Run train_test_loop_optim over a small parameter grid for sdne, line and node2vec.

  Order of the runs: the three sdne layer configurations, then the two line
  orders, then every (p, q) combination for node2vec.
  """
  runs = []  # (embedding method, embedding size, embedding params)

  # sdne: conservative as in paper vs deeper vs large vertically and deep
  sdne_layer_configs = [
      [int((num_nodes*0.1)), 128],
      [int((num_nodes*0.1)), int((num_nodes*0.05)), 128],
      [int(num_nodes), int(num_nodes*0.5), 128],
  ]
  for layer_config in sdne_layer_configs:
    runs.append(("sdne", layer_config[-1], {"layer_config": layer_config}))

  # line: first- and second-order proximity
  for proximity_order in ["first", "second"]:
    runs.append(("line", 128, {"order": proximity_order}))

  # node2vec: full grid over the return (p) and in-out (q) parameters
  grid_values = [0.1, 0.75, 1, 1.25, 2]
  for p in grid_values:
    for q in grid_values:
      runs.append(("node2vec", 128, {"p": p, "q": q}))

  for method, size, method_params in runs:
    train_test_loop_optim(emb_method=method, emb_size= size, epochs = 15, emb_params = method_params)

#optimize_embeddings()