In [12]:
import sys
# Add the directory two levels up ('../..') to the sys.path list
sys.path.append('../..')

import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

from node2vec.edges import HadamardEmbedder
from utils import read_graph, remove_random_edges

## Create data files required by SNE

In [13]:
# Input edge lists; both paths are later passed to read_graph(..., temporal=True).
alpha_data_file = '../../db/BTCAlphaNet-T.csv'
otc_data_file = '../../db/OTCNet-T.csv'

# Output directory prefix for the binarized edge lists and the walk files.
# It is joined to file names by plain string concatenation, so the trailing '/' is required.
data_out_dir = './data/'

In [14]:
def file_name_base(data_file):
    '''
    Return the last '/'-separated component of data_file, truncated at its
    first '.' (so 'dir/name.tar.gz' gives 'name').
    '''
    last_component = data_file.rsplit('/', 1)[-1]
    stem, _dot, _rest = last_component.partition('.')
    return stem

def file_name_binarized(data_file):
    '''
    Build the output path of the binarized edge list for data_file:
    data_out_dir, then the base name of data_file, then '-binarized.txt'.
    '''
    pieces = [data_out_dir, file_name_base(data_file), '-binarized.txt']
    return ''.join(pieces)

def transform_data_to_binary(data_in_file, data_out_file, in_sep = ',', out_sep = '\t'):
    '''
    Convert a weighted edge list into an edge list with binary edge signs.

    Every non-blank line of data_in_file must hold at least three columns
    separated by in_sep: source node, target node and a numeric edge weight.
    Any further columns on the line are ignored. Blank (whitespace-only)
    lines are skipped.

    For each edge one line "u<out_sep>v<out_sep>sign" is written to
    data_out_file, where sign is 1 when the weight is >= 0 (zero counts as
    positive) and 0 otherwise. The node columns are copied through unchanged.

    Raises ValueError if a non-blank line has fewer than three columns or its
    weight cannot be parsed as a float.
    '''
    with open(data_in_file, 'r') as f_in, open(data_out_file, 'w') as f_out:
        for line in f_in:
            # A trailing empty line would otherwise break the 3-way unpack below.
            if not line.strip():
                continue
            # Only the first three columns are used; float() tolerates the
            # trailing newline when the weight is the last column.
            u, v, weight = line.split(in_sep)[:3]
            weight_binary = 1 if float(weight) >= 0 else 0
            f_out.write(out_sep.join((u, v, str(weight_binary))) + '\n')

In [15]:
#for data_in_file in [alpha_data_file, otc_data_file]:
#    data_out_file = file_name_binarized(data_in_file)
#    transform_data_to_binary(data_in_file, data_out_file)

In [16]:
#!tail ./data/OTCNet-binarized.txt

## Train SNE

In [122]:
# Prepare graphs with removed test/validation edges
def _graph_with_holdouts(data_file, test_fraction=0.2, val_fraction=0.125):
    '''
    Read the graph stored in data_file (temporal=True) and draw two random
    edge samples from it via remove_random_edges.

    The test sample has int(test_fraction * edge_count) edges. The validation
    sample has int(val_fraction * edge_count) edges, where edge_count is
    re-read from the graph after the test draw.
    NOTE(review): presumably remove_random_edges deletes the drawn edges from
    the graph in place, so the defaults amount to roughly 20% / 10% of the
    original edges -- confirm in utils.

    Returns (graph, test_edges, test_weights, val_edges, val_weights).
    '''
    graph = read_graph(data_file, temporal=True)
    n_test = int(nx.number_of_edges(graph) * test_fraction)
    test_edges, test_weights = remove_random_edges(graph, n_test)
    n_val = int(nx.number_of_edges(graph) * val_fraction)
    val_edges, val_weights = remove_random_edges(graph, n_val)
    return graph, test_edges, test_weights, val_edges, val_weights

(G_alpha, test_edges_alpha, test_weights_alpha,
 val_edges_alpha, val_weights_alpha) = _graph_with_holdouts(alpha_data_file)

(G_otc, test_edges_otc, test_weights_otc,
 val_edges_otc, val_weights_otc) = _graph_with_holdouts(otc_data_file)

In [167]:
# Draw a further 3/5 of the edges currently in G_otc and append them to the
# OTC test set.
# NOTE(review): re-running this cell draws again from the already reduced graph.
n_extra_test_otc = int(nx.number_of_edges(G_otc) * (3.0 / 5))
removed_edges, removed_weights = remove_random_edges(G_otc, n_extra_test_otc)
test_edges_otc.extend(removed_edges)
test_weights_otc.extend(removed_weights)

In [168]:
import walk as sne_walk

# Look-up from data file path to the graph object built from it in an earlier cell.
G_by_data_file = {alpha_data_file: G_alpha, otc_data_file: G_otc}
# Generate walks and train on them for each of the graphs
# NOTE(review): the list below contains only the OTC file, so nothing is
# generated or trained for the alpha graph in this cell.
for data_file in [otc_data_file]:
    # Keep the path up to its first '-' and re-append '.csv'
    # ('../../db/OTCNet-T.csv' -> '../../db/OTCNet.csv', as printed below).
    # NOTE(review): a '-' earlier in the path would cut it short there.
    untemporal_file = data_file.split('-')[0] + '.csv'
    print(untemporal_file)
    # Walks are written under data_out_dir; the model file name has no
    # directory prefix, so it is resolved against the working directory.
    walk_file = data_out_dir + file_name_base(data_file) + '.walks'
    save_file = file_name_base(data_file) + '-model.pkl'
    G = G_by_data_file[data_file]
    sne_walk.write_walks_to_disk(G, f = walk_file,
                        num_paths = 40,         # Default value based on paper = 20
                        path_length = 80,       # Default value based on paper = 40
                        temporal = True)
    # IPython shell escape: each $name is replaced by the value of the Python
    # variable of that name before the command is run.
    !python3 SNE.py --train_data $untemporal_file --walks_data $walk_file --save_path $save_file --context_size 3

../../db/OTCNet.csv
  return f(*args, **kwds)
2018-12-08 13:39:06.472025: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
INFO:root:Edge number : 35592
INFO:root:Vertex number : 5881
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
INFO:root:number of data 19671
Total batch number 393
loss:  1.57 average loss: 1.523965 edges/sec:   28865%
Done training


## Testing SNE

In [169]:
from test import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR

In [170]:
# Unpack everything load_model returns for the trained OTC model file.
(emb_vertex, sign_w, emb_context, id2vertex, vertex2id,
 edge_source_id, edge_target_id, edge_sign) = load_model('OTCNet-T-model.pkl')

In [171]:
G = G_otc
test_edges = test_edges_otc
test_weights = test_weights_otc

def _edge_dataset(edges, weight_of):
    '''
    Build a feature matrix and a target vector for the given (u, v) edges.

    Row k of the matrix is the element-wise product of the vertex embedding
    of u and the context embedding of v for the k-th edge, with both nodes
    mapped to embedding rows through vertex2id. Entry k of the target vector
    is weight_of(k, u, v).

    Returns (X, y) as numpy arrays with one row / entry per edge.
    '''
    edges = list(edges)
    X = np.zeros((len(edges), emb_vertex[0].shape[0]))
    y = np.zeros(len(edges))
    for row, (src, dst) in enumerate(edges):
        X[row] = emb_vertex[vertex2id[src]] * emb_context[vertex2id[dst]]
        y[row] = weight_of(row, src, dst)
    return X, y

m = nx.number_of_edges(G)

# Training set: every edge still present in G, labelled with its stored weight.
X_train, y_train = _edge_dataset(
    G.edges,
    lambda _row, src, dst: G.get_edge_data(src, dst)['weight'])

# Test set: the held-out edges, labelled with the weight at the same position.
X_test, y_test = _edge_dataset(
    test_edges,
    lambda row, _src, _dst: test_weights[row])

In [172]:
# Support vector regressor on the edge features; fit() trains the estimator in place.
reg = SVR(gamma='scale')
reg.fit(X_train, y_train)

In [173]:
from sklearn.metrics import mean_squared_error
# Predict the weights of the held-out test edges
y_pred = reg.predict(X_test)

# Report the root of the mean squared error between true and predicted weights
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root mean squared error: {}".format(rmse))

Root mean squared error: 0.34993928608959163
