In [1]:
import networkx as nx
import numpy as np
import pandas as pd
from stellargraph import StellarGraph, datasets
from stellargraph.data import EdgeSplitter
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
from gensim.models import KeyedVectors
from sklearn import metrics
from tqdm import tqdm

from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.layers import concatenate

In [2]:
graph_train_pos_nodes = np.load('data/graph_train_pos_nodes.npy')
graph_train_pos_edges = np.load('data/graph_train_pos_edges.npy')
edge_test = np.load('data/edge_test.npy')
label_test = np.load('data/label_test.npy')
edge_val = np.load('data/edge_val.npy')
label_val = np.load('data/label_val.npy')
edge_train = np.load('data/edge_train.npy')
label_train = np.load('data/label_train.npy')
graph_train_pos = nx.Graph()
graph_train_pos.add_nodes_from(graph_train_pos_nodes)
graph_train_pos.add_edges_from(graph_train_pos_edges)

In [3]:
edge_test_coef = np.load('data/edge_test_coef.npy')
edge_val_coef = np.load('data/edge_val_coef.npy')
edge_train_coef = np.load('data/edge_train_coef.npy')

In [4]:
node_embedding = np.load('data/features_preprocessed_128.npy')

def get_edge_embedding(edges, node_embedding, binaryoperator):
    if binaryoperator=='multiplication':
        def binary_operator(u,v):
            return u*v
    elif binaryoperator=='absolute':
        def binary_operator(u,v):
            return np.abs(u-v)
    elif binaryoperator=='average':
        def binary_operator(u,v):
            return (u+v)/2 
    elif binaryoperator=='square difference':
        def binary_operator(u,v):
            return (u-v)**2
        
    edge_embedding = []
    for i in range(len(edges)):
        u = node_embedding[edges[i][0]]
        v = node_embedding[edges[i][1]]
        uv = binary_operator(u,v)
        edge_embedding.append(uv)
    edge_embedding = np.array(edge_embedding)
    return edge_embedding


edge_train_embedding = get_edge_embedding(edge_train, node_embedding, 'multiplication')
edge_val_embedding = get_edge_embedding(edge_val, node_embedding, 'multiplication')
edge_test_embedding = get_edge_embedding(edge_test, node_embedding, 'multiplication')

edge_train_embedding_abs = get_edge_embedding(edge_train, node_embedding, 'absolute')
edge_val_embedding_abs = get_edge_embedding(edge_val, node_embedding, 'absolute')
edge_test_embedding_abs = get_edge_embedding(edge_test, node_embedding, 'absolute')

edge_train_embedding_avg = get_edge_embedding(edge_train, node_embedding, 'average')
edge_val_embedding_avg = get_edge_embedding(edge_val, node_embedding, 'average')
edge_test_embedding_avg = get_edge_embedding(edge_test, node_embedding, 'average')

edge_train_embedding_sqdiff = get_edge_embedding(edge_train, node_embedding, 'square difference')
edge_val_embedding_sqdiff = get_edge_embedding(edge_val, node_embedding, 'square difference')
edge_test_embedding_sqdiff = get_edge_embedding(edge_test, node_embedding, 'square difference')

In [5]:
def get_edge_ml(edges, node_ml):
    edge_ml = []
    for i in range(len(edges)):
        u = node_ml[edges[i][0]]
        v = node_ml[edges[i][1]]
        uv = np.abs(u-v)
        edge_ml.append(uv)
    edge_ml = np.array(edge_ml)
    return edge_ml

all_ml = pd.read_csv('data/musae_git_target.csv')
node_ml = all_ml['ml_target'].values
edge_train_ml = get_edge_ml(edge_train, node_ml)
edge_val_ml = get_edge_ml(edge_val, node_ml)
edge_test_ml = get_edge_ml(edge_test, node_ml)

In [6]:
predictions_test = pd.read_csv('data/predictions_test_APPNP_feature.csv', index_col=0)
all_nodes = np.array(range(len(graph_train_pos_nodes)))
nodeclassification_test_node = predictions_test['node_id'].values
nodeclassification_train_node = np.delete(all_nodes, nodeclassification_test_node)

def get_observable_edges(edge, edge_label, observable_node):    
    observable_idx = []
    nonobservable_idx = []
    
    for i in range(len(edge)):
        x = edge[i]
        if x[0] in observable_node and x[1] in observable_node:
            observable_idx.append(i)
        else:
            nonobservable_idx.append(i)

    return observable_idx, nonobservable_idx

observable_edge_test_idx, nonobservable_edge_test_idx = get_observable_edges(edge_test, label_test, nodeclassification_train_node)
observable_edge_val_idx, nonobservable_edge_val_idx = get_observable_edges(edge_val, label_val, nodeclassification_train_node)
observable_edge_train_idx, nonobservable_edge_train_idx = get_observable_edges(edge_train, label_train, nodeclassification_train_node)

In [7]:
observable_edge_test_coef = edge_test_coef[observable_edge_test_idx]
observable_edge_val_coef = edge_val_coef[observable_edge_val_idx]
observable_edge_train_coef = edge_train_coef[observable_edge_train_idx]

observable_edge_test_embedding = edge_test_embedding[observable_edge_test_idx]
observable_edge_val_embedding = edge_val_embedding[observable_edge_val_idx]
observable_edge_train_embedding = edge_train_embedding[observable_edge_train_idx]

observable_edge_test_embedding_abs = edge_test_embedding_abs[observable_edge_test_idx]
observable_edge_val_embedding_abs = edge_val_embedding_abs[observable_edge_val_idx]
observable_edge_train_embedding_abs = edge_train_embedding_abs[observable_edge_train_idx]

observable_edge_test_embedding_avg = edge_test_embedding_avg[observable_edge_test_idx]
observable_edge_val_embedding_avg = edge_val_embedding_avg[observable_edge_val_idx]
observable_edge_train_embedding_avg = edge_train_embedding_avg[observable_edge_train_idx]

observable_edge_test_embedding_sqdiff = edge_test_embedding_sqdiff[observable_edge_test_idx]
observable_edge_val_embedding_sqdiff = edge_val_embedding_sqdiff[observable_edge_val_idx]
observable_edge_train_embedding_sqdiff = edge_train_embedding_sqdiff[observable_edge_train_idx]

observable_edge_test_ml = edge_test_ml[observable_edge_test_idx]
observable_edge_val_ml = edge_val_ml[observable_edge_val_idx]
observable_edge_train_ml = edge_train_ml[observable_edge_train_idx]

observable_edge_test_label = label_test[observable_edge_test_idx]
observable_edge_val_label = label_val[observable_edge_val_idx]
observable_edge_train_label = label_train[observable_edge_train_idx]

In [8]:
#train link prediction model
inputA = Input(shape=(6))
inputB = Input(shape=(128))
inputC = Input(shape=(1))

x = Dense(128, activation="relu")(inputA)
x = Dense(64, activation="relu")(x)
x = Dense(32, activation="relu")(x)
x = Dense(16, activation="relu")(x)
x = Dense(4, activation="relu")(x)
x = Dense(1)(x)
x = Model(inputs=inputA, outputs=x)

y = Dense(64, activation="relu")(inputB)
y = Dense(16, activation="relu")(y)
y = Dense(4, activation="relu")(y)
y = Dense(1)(y)
y = Model(inputs=inputB, outputs=y)

z = Dense(1, activation="relu")(inputC)
z = Model(inputs=inputC, outputs=z)

#combine the above models
combined = concatenate([x.output, y.output, z.output])
w = Dense(8, activation="relu")(combined)
w = Dense(4, activation="relu")(w)
w = Dense(1)(w)

model = Model(inputs=[x.input, y.input, z.input], outputs=w)
model.compile('adam', 'mean_squared_error', metrics=['accuracy'])
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 6)]          0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 128)          896         input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 64)           8256        dense[0][0]                      
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 128)]        0                                            
____________________________________________________________________________________________

In [9]:
model.fit([observable_edge_train_coef, observable_edge_train_embedding, observable_edge_train_ml], observable_edge_train_label, batch_size=512, epochs=40)
model.save('model/link_prediction_mult_givenfeatures.h5')
model.fit([observable_edge_train_coef, observable_edge_train_embedding_abs, observable_edge_train_ml], observable_edge_train_label, batch_size=512, epochs=40)
model.save('model/link_prediction_abs_givenfeatures.h5')
model.fit([observable_edge_train_coef, observable_edge_train_embedding_avg, observable_edge_train_ml], observable_edge_train_label, batch_size=512, epochs=40)
model.save('model/link_prediction_avg_givenfeatures.h5')
model.fit([observable_edge_train_coef, observable_edge_train_embedding_sqdiff, observable_edge_train_ml], observable_edge_train_label, batch_size=512, epochs=40)
model.save('model/link_prediction_sqdiff_givenfeatures.h5')

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epo

In [10]:
def evaluate(y_test, y_predict):
    accuracy = metrics.accuracy_score(y_test, y_predict)
    precision = metrics.precision_score(y_test, y_predict, average = 'macro')
    recall = metrics.recall_score(y_test, y_predict, average = 'macro')
    f1 = metrics.f1_score(y_test, y_predict, average = 'macro')
    return accuracy, precision, recall, f1

def get_val_result(model, edge_embed):
    prediction = model.predict([observable_edge_test_coef, edge_embed, observable_edge_test_ml])

    y_predict = []
    for i in range(len(prediction)):
        y_predict.append(round(prediction[i][0]))
    accuracy, precision, recall, f1 = evaluate(observable_edge_test_label, y_predict)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1: {f1}')

In [11]:
model = keras.models.load_model('model/link_prediction_mult_givenfeatures.h5')
print('Model with multiplication in edge embedding')
get_val_result(model, observable_edge_test_embedding)
print('')

model = keras.models.load_model('model/link_prediction_abs_givenfeatures.h5')
print('Model with absolute in edge embedding')
get_val_result(model, observable_edge_test_embedding_abs)
print('')

model = keras.models.load_model('model/link_prediction_avg_givenfeatures.h5')
print('Model with average in edge embedding')
get_val_result(model, observable_edge_test_embedding_avg)
print('')

model = keras.models.load_model('model/link_prediction_sqdiff_givenfeatures.h5')
print('Model with sqaure difference in edge embedding')
get_val_result(model, observable_edge_test_embedding_sqdiff)
print('')

Model with multiplication in edge embedding
Accuracy: 0.8530538795384778
Precision: 0.8531107945539147
Recall: 0.8529506668175078
F1: 0.8530005125126426

Model with absolute in edge embedding
Accuracy: 0.8568011008785857
Precision: 0.8568330315355064
Recall: 0.8568886080895806
F1: 0.8567981340137754

Model with average in edge embedding
Accuracy: 0.8959034614163226
Precision: 0.8959389341173594
Recall: 0.8959996273273545
F1: 0.8959014428475955

Model with sqaure difference in edge embedding
Accuracy: 0.8584524187572774
Precision: 0.8584720211703797
Recall: 0.8583770492163076
F1: 0.8584114991709555



In [12]:
#final test results

#we test on the edges for nodes that the label for ML developers are hidden 
nonobservable_edge_test_coef = edge_test_coef[nonobservable_edge_test_idx]
nonobservable_edge_test_embedding_avg = edge_test_embedding_avg[nonobservable_edge_test_idx]
nonobservable_edge_test_label = label_test[nonobservable_edge_test_idx]
model = keras.models.load_model('model/link_prediction_avg_givenfeatures.h5')

#get results from the node classification model
output_ml = predictions_test['predicted_label'].values
ml = pd.read_csv('data/musae_git_target.csv')
ml_target = ml['ml_target'].values
nonobservable_edge_test = edge_test[nonobservable_edge_test_idx]

#combine the output results from the node classification model
nonobservable_edge_test_ml = []
for x in nonobservable_edge_test:   
    if x[0] in nodeclassification_test_node:
        idx = np.where(nodeclassification_test_node==x[0])
        u = output_ml[idx]
    else:
        u = ml_target[x[0]]       
        
    if x[1] in nodeclassification_test_node:
        idx = np.where(nodeclassification_test_node==x[1])
        v = output_ml[idx]   
    else:
        v = ml_target[x[1]]
       
    nonobservable_edge_test_ml.append(np.abs(u-v))    
nonobservable_edge_test_ml=np.array(nonobservable_edge_test_ml)


#test the link prediction model
prediction = model.predict([nonobservable_edge_test_coef, nonobservable_edge_test_embedding_avg, nonobservable_edge_test_ml])
y_predict = []
for i in range(len(prediction)):
    y_predict.append(round(prediction[i][0]))
accuracy, precision, recall, f1 = evaluate(nonobservable_edge_test_label, y_predict)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')

Accuracy: 0.8909607193563653
Precision: 0.8912452682948799
Recall: 0.8898505793948265
F1: 0.8904069564525843


In [16]:
#Model not using the node label ML developer
inputA = Input(shape=(6))
inputB = Input(shape=(128))

x = Dense(128, activation="relu")(inputA)
x = Dense(64, activation="relu")(x)
x = Dense(32, activation="relu")(x)
x = Dense(16, activation="relu")(x)
x = Dense(4, activation="relu")(x)
x = Dense(1)(x)
x = Model(inputs=inputA, outputs=x)

y = Dense(64, activation="relu")(inputB)
y = Dense(16, activation="relu")(y)
y = Dense(4, activation="relu")(y)
y = Dense(1)(y)
y = Model(inputs=inputB, outputs=y)


#combine the above models
combined = concatenate([x.output, y.output])
w = Dense(8, activation="relu")(combined)
w = Dense(4, activation="relu")(w)
w = Dense(1)(w)

model = Model(inputs=[x.input, y.input], outputs=w)
model.compile('adam', 'mean_squared_error', metrics=['accuracy'])
#model.summary()

#train and save model
model.fit([observable_edge_train_coef, observable_edge_train_embedding_avg], observable_edge_train_label, batch_size=512, epochs=40)
model.save('model/link_prediction_avg_noml_givenfeatures.h5')

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [21]:
model = keras.models.load_model('model/link_prediction_avg_noml_givenfeatures.h5')
print('Model without ML label and with average in edge embedding')
print('')
prediction = model.predict([nonobservable_edge_test_coef,nonobservable_edge_test_embedding_avg])

y_predict = []
for i in range(len(prediction)):
    y_predict.append(round(prediction[i][0]))
accuracy, precision, recall, f1 = evaluate(nonobservable_edge_test_label, y_predict)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')

Model without ML label and with average in edge embedding

Accuracy: 0.8872692853762423
Precision: 0.8878934975059478
Recall: 0.8858737576331359
F1: 0.8866191848068862
