### Imports

In [1]:
%run utils.ipynb
%run SDNE+.ipynb
%run LINE+.ipynb

### Link Prediction

#### Read In Graph

In [None]:
# Create Graph
# Load the pickled dataset bundle, then pull the full graph out of it.
# NOTE(review): pickle executes arbitrary code on load; only open files you produced yourself.
print ("Reading Graph...", end = '\r')
with open('/scratch/shafi.z/foursquare.pkl', 'rb') as file:
    graph_dict = pkl.load(file)

graph = graph_dict['graph']

#### Create Splits

In [None]:
# Hide Edges
# Build three copies of the graph, each with a different third of a random edge sample removed.
# Only edges outside a spanning tree are eligible for removal, so removing them cannot
# disconnect anything the spanning tree connects.
print ("Creating Graph Copies...", end = '\r')
graph_hidden_one, graph_hidden_two, graph_hidden_three = graph.copy(), graph.copy(), graph.copy()

print ("Computing MST...", end = '\r')
ig = igraph.Graph([[edge[0], edge[1]] for edge in nx.to_edgelist(graph)])
mst = ig.spanning_tree()

print ("Computing Edges To Remove...", end = '\r')
all_edges = ig.get_edgelist()
edge_no_remove = mst.get_edgelist()
# Everything that is not a spanning-tree edge may be hidden.
removable_edges = list(set(all_edges).difference(set(edge_no_remove)))

# The sample size is 60% of ALL edges, but the sample is drawn from the removable edges only.
num_hidden_edges = int(0.6 * len(all_edges))
hidden_rows = np.random.choice(len(removable_edges), num_hidden_edges, replace = False)
hidden_edges = np.array(removable_edges)[hidden_rows]

print ("Creating Train Graphs...", end = '\r')
# Folds one and two hold exactly `fold_size` edges; fold three also takes the remainder.
fold_size = int(hidden_edges.shape[0] / 3)
fold_one_rows = hidden_edges[:fold_size, :]
fold_two_rows = hidden_edges[fold_size : 2 * fold_size, :]
fold_three_rows = hidden_edges[2 * fold_size : hidden_edges.shape[0], :]

graph_hidden_one.remove_edges_from(fold_one_rows)
graph_hidden_two.remove_edges_from(fold_two_rows)
graph_hidden_three.remove_edges_from(fold_three_rows)

print ("Running Sanity Checks...", end = '\r')
# Plain (u, v) tuples so the held-out edges can be hashed, concatenated and pickled.
hidden_edges_one = [(u, v) for u, v in fold_one_rows]
hidden_edges_two = [(u, v) for u, v in fold_two_rows]
hidden_edges_three = [(u, v) for u, v in fold_three_rows]

# Each count should be 0: a fold's held-out edges must be absent from its training graph.
print ("Set One Intersection : ", len(set(graph_hidden_one.edges) & set(hidden_edges_one)))
print ("Set Two Intersection : ", len(set(graph_hidden_two.edges) & set(hidden_edges_two)))
print ("Set Three Intersection : ", len(set(graph_hidden_three.edges) & set(hidden_edges_three)))


# Negative Sampling And Train / Test Sets
# Draws non-edges of the full graph as negatives, samples training positives from each
# hidden graph, and removes from the training sets anything that also appears in the
# matching test set.

def _sample_train_edges(hidden_graph, num_edges):
    """Return `num_edges` randomly chosen edges of `hidden_graph`, as given by igraph's get_edgelist()."""
    edges = igraph.Graph([[e[0], e[1]] for e in nx.to_edgelist(hidden_graph)]).get_edgelist()
    np.random.shuffle(edges)
    return edges[:num_edges]


def _drop_leaked(train_pos, train_neg, test_pos, test_neg):
    """Return (train_pos, train_neg) without any pair that also occurs in the test positives/negatives."""
    leaked = set(train_pos + train_neg).intersection(test_neg + test_pos)
    kept_pos = [(x, y) for (x, y) in train_pos if (x, y) not in leaked]
    kept_neg = [(x, y) for (x, y) in train_neg if (x, y) not in leaked]
    return kept_pos, kept_neg


all_test_edges_pos = hidden_edges_one + hidden_edges_two + hidden_edges_three
# `ig` is the igraph copy of the full graph; there is no need to rebuild it here.
all_edges = set(ig.get_edgelist())

# Candidate negatives are the zero entries strictly above the diagonal of the adjacency matrix.
# The strict upper triangle excludes self-pairs (i, i) and keeps one orientation per unordered
# pair, so (i, j) and (j, i) can no longer both be drawn and end up on both sides of a split.
# NOTE(review): the indices are positions in graph.nodes() order; this assumes node labels
# coincide with those positions, as the igraph conversion also assumes — confirm.
neg_indices = np.where(np.triu(nx.to_numpy_array(graph) == 0, k = 1))

# Draw without replacement so the pool has exactly the requested number of distinct pairs.
# Sampling with replacement into a set silently shrank the pool, and the shortfall fell on
# the test negatives, which are sliced last. The request is clamped so a very dense graph
# yields a short pool instead of raising.
num_neg_edges = min(9 * len(all_test_edges_pos), neg_indices[0].shape[0])
neg_choice = np.random.choice(neg_indices[0].shape[0], num_neg_edges, replace = False)
neg_edges = list(zip(neg_indices[0][neg_choice], neg_indices[1][neg_choice]))

# Negative slices are laid out in units of the first fold's size:
# [0, 6) units -> training negatives (2 units per fold), [6, 9) units -> test negatives (1 per fold).
neg_unit = len(hidden_edges_one)

train_1 = _sample_train_edges(graph_hidden_one, 2 * len(hidden_edges_one))
train_1_neg = neg_edges[:2 * neg_unit]

train_2 = _sample_train_edges(graph_hidden_two, 2 * len(hidden_edges_two))
train_2_neg = neg_edges[2 * neg_unit : 4 * neg_unit]

train_3 = _sample_train_edges(graph_hidden_three, 2 * len(hidden_edges_three))
train_3_neg = neg_edges[4 * neg_unit : 6 * neg_unit]

test_1_neg = neg_edges[6 * neg_unit : 7 * neg_unit]
test_2_neg = neg_edges[7 * neg_unit : 8 * neg_unit]
test_3_neg = neg_edges[8 * neg_unit : 9 * neg_unit]

train_1, train_1_neg = _drop_leaked(train_1, train_1_neg, hidden_edges_one, test_1_neg)
train_2, train_2_neg = _drop_leaked(train_2, train_2_neg, hidden_edges_two, test_2_neg)
train_3, train_3_neg = _drop_leaked(train_3, train_3_neg, hidden_edges_three, test_3_neg)

# Report the size of every split; each printed intersection should be the empty set.
fold_report = [(train_1, train_1_neg, hidden_edges_one, test_1_neg),
               (train_2, train_2_neg, hidden_edges_two, test_2_neg),
               (train_3, train_3_neg, hidden_edges_three, test_3_neg)]

for fold, (fold_train, fold_train_neg, fold_test, fold_test_neg) in enumerate(fold_report, start = 1):
    if fold > 1:
        print ()
    print ("Train Set " + str(fold) + " - Positive : ", len(fold_train))
    print ("Train Set " + str(fold) + " - Negative : ", len(fold_train_neg))
    print ("Test Set " + str(fold) + " - Positive : ", len(fold_test))
    print ("Test Set " + str(fold) + " - Negative : ", len(fold_test_neg))
    print ("Intersection : ", set(fold_train + fold_train_neg).intersection(fold_test_neg + fold_test))

# Save Splits And Sense Features
# The bundle is written once with the splits alone and again after each fold's sense
# features are computed, so everything finished so far is on disk before the next fold starts.

def _save_graph_dict(payload, path = '/scratch/shafi.z/foursquare.pkl'):
    """Pickle `payload` to `path`, overwriting whatever is there."""
    with open(path, 'wb') as file:
        pkl.dump(payload, file)


graph_dict = {'graph' : graph,
              'graph_hidden_1' : graph_hidden_one,
              'graph_hidden_2' : graph_hidden_two,
              'graph_hidden_3' : graph_hidden_three,
              'train_1' : train_1,
              'train_2' : train_2,
              'train_3' : train_3,
              'train_1_neg' : train_1_neg,
              'train_2_neg' : train_2_neg,
              'train_3_neg' : train_3_neg,
              'test_1' : hidden_edges_one,
              'test_2' : hidden_edges_two,
              'test_3' : hidden_edges_three,
              'test_1_neg' : test_1_neg,
              'test_2_neg' : test_2_neg,
              'test_3_neg' : test_3_neg,
        }
_save_graph_dict(graph_dict)

print ("Computing Sense Features For Fold 1...", end = '\r')
sense_feat_dict_one, sense_features_one = get_sense_features(graph_hidden_one, ppr_flag = 'std')

# Only the first fold's feature dictionary is stored under 'sense_feat_dict'.
# NOTE(review): presumably the dictionary is identical across folds — confirm.
graph_dict['sense_feat_dict'] = sense_feat_dict_one
graph_dict['sense_features_1'] = sense_features_one
_save_graph_dict(graph_dict)

print ("Computing Sense Features For Fold 2...", end = '\r')
sense_feat_dict_two, sense_features_two = get_sense_features(graph_hidden_two, ppr_flag = 'std')

# Checkpoints for folds 2 and 3 previously went to '/scratch/shafi.z/email_pos.pkl', so the
# file this notebook reads never received these features and another dataset's file was
# overwritten. All checkpoints now go to the foursquare bundle.
graph_dict['sense_features_2'] = sense_features_two
_save_graph_dict(graph_dict)

print ("Computing Sense Features For Fold 3...", end = '\r')
sense_feat_dict_three, sense_features_three = get_sense_features(graph_hidden_three, ppr_flag = 'std')

graph_dict['sense_features_3'] = sense_features_three
_save_graph_dict(graph_dict)

#### Embed and Predict - SDNE

In [None]:
# For every fold: train SDNE, train SDNE+ starting from the SDNE weights, score both
# embeddings on link prediction and on the explanation-matrix metrics, and pickle one
# result bundle per fold.

def _explain_metrics(explain):
    """Return (sum of per-row variances, fraction of exactly-zero entries, sum of all entries of explain @ explain.T)."""
    variance = np.sum(np.square(np.std(explain, axis = 1)))
    # np.prod replaces np.product, which was removed in NumPy 2.0.
    perc_zero = len(np.where(explain == 0)[0]) / np.prod(explain.shape)
    ortho = np.sum(explain @ explain.T)
    return variance, perc_zero, ortho


for idx in tqdm(range(1, 4)):

    # Named `graph_hidden` so the loop does not overwrite the global `graph` (the full graph);
    # it is also the name the result bundle below refers to.
    graph_hidden = graph_dict['graph_hidden_' + str(idx)]
    sense_feat_dict = graph_dict['sense_feat_dict']
    sense_features = graph_dict['sense_features_' + str(idx)]
    sense_features_f32 = sense_features.astype(np.float32)

    train = graph_dict['train_' + str(idx)]
    train_neg = graph_dict['train_' + str(idx) + '_neg']
    test_pos = graph_dict['test_' + str(idx)]
    test_neg = graph_dict['test_' + str(idx) + '_neg']

    # Run SDNE (gamma = 0 and delta = 0)
    sdne = SDNE_plus(graph_hidden,
                     hidden_size = [64, 128],
                     lr = 1e-3,
                     sense_features = sense_features_f32,
                     alpha = 0.1,
                     beta = 10,
                     gamma = 0,
                     delta = 0)

    history = sdne.train(epochs = 50, batch_size = 10000)
    print ("Training Complete")
    embed = sdne.get_embeddings()
    # Rows follow graph_hidden.nodes() order.
    embedding = np.array([embed[node_name] for node_name in graph_hidden.nodes()])
    print ("Embeddings Generated")

    sdne.model.save_weights('/scratch/shafi.z/foursquare_' + str(idx) + '_.model')
    with open('/scratch/shafi.z/foursquare_embed_' + str(idx) + '_.pkl', 'wb') as file:
        pkl.dump(embedding, file)

    # Run SDNE+ Init: same architecture, initialised from the SDNE weights saved just above.
    sdne_plus_init = SDNE_plus(graph_hidden,
                               hidden_size = [64, 128],
                               lr = 1e-3,
                               sense_features = sense_features_f32,
                               alpha = 5,
                               beta = 5,
                               gamma = 10,
                               delta = 5)

    sdne_plus_init.model.load_weights('/scratch/shafi.z/foursquare_' + str(idx) + '_.model')

    history_plus_init = sdne_plus_init.train(epochs = 50, batch_size = 10000)
    embed_plus = sdne_plus_init.get_embeddings()
    embedding_plus_init = np.array([embed_plus[node_name] for node_name in graph_hidden.nodes()])

    # NOTE(review): 'plsu' in the file name looks like a typo for 'plus'; left unchanged
    # because other code may already load weights from this exact path.
    sdne_plus_init.model.save_weights('/scratch/shafi.z/foursquare_plsu_' + str(idx) + '_.model')
    with open('/scratch/shafi.z/foursquare_embed_plus_' + str(idx) + '_.pkl', 'wb') as file:
        pkl.dump(embedding_plus_init, file)

    # Read both embeddings back from the files written above.
    # NOTE(review): presumably kept so the training steps can be skipped on a re-run — confirm.
    with open('/scratch/shafi.z/foursquare_embed_' + str(idx) + '_.pkl', 'rb') as file:
        embedding = pkl.load(file)

    with open('/scratch/shafi.z/foursquare_embed_plus_' + str(idx) + '_.pkl', 'rb') as file:
        embedding_plus_init = pkl.load(file)

    # Compare
    feature_dict = find_feature_membership(input_embed = embedding,
                                           embed_name = 'SDNE',
                                           sense_features = sense_features,
                                           sense_feat_dict = sense_feat_dict,
                                           top_k = 8,
                                           solver = 'nmf')

    feature_dict_plus_init = find_feature_membership(input_embed = embedding_plus_init,
                                                     embed_name = 'SDNE+ Init',
                                                     sense_features = sense_features,
                                                     sense_feat_dict = sense_feat_dict,
                                                     top_k = 8,
                                                     solver = 'nmf')

    # Run link prediction models
    sdne_results = get_embed_perf(input_embed = embedding,
                                  input_dict = feature_dict,
                                  data = None,
                                  labels = None,
                                  graph = graph_hidden,
                                  hidden_edges = test_pos,
                                  train_set = train,
                                  train_set_neg = train_neg,
                                  test_set = test_pos,
                                  test_set_neg = test_neg,
                                  epochs = 100)

    sdne_plus_init_results = get_embed_perf(input_embed = embedding_plus_init,
                                            input_dict = feature_dict_plus_init,
                                            data = None,
                                            labels = None,
                                            graph = graph_hidden,
                                            hidden_edges = test_pos,
                                            train_set = train,
                                            train_set_neg = train_neg,
                                            test_set = test_pos,
                                            test_set_neg = test_neg,
                                            epochs = 100)

    # Sum of variances, sparsity and orthogonality of each explanation matrix
    variance, perc_zero, ortho = _explain_metrics(feature_dict['explain_norm'])
    variance_plus_init, perc_zero_plus_init, ortho_plus_init = _explain_metrics(feature_dict_plus_init['explain_norm'])

    # Put into pretty dataframes
    info = pd.DataFrame([[variance_plus_init, variance,], [perc_zero_plus_init, perc_zero], [ortho_plus_init, ortho]])
    info.index = ['Sum Of Variances', 'Percentage of Zero Entries (Sparsity)', 'Orthogonality']
    info.columns = ['SDNE+ Init', 'SDNE']

    results = pd.concat([sdne_plus_init_results.T, sdne_results.T])
    results.index = ['SDNE+ Init', 'SDNE']
    results = results.T
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows.
    results = pd.concat([results, info])

    # 'graph_hidden' used to reference an undefined name (NameError after all training was
    # done), and 'hidden_edges' used the global array spanning all folds. Both now hold
    # this fold's own training graph and held-out edges.
    return_dict = {'graph' : graph_dict['graph'],
                   'graph_hidden' : graph_hidden,
                   'hidden_edges' : test_pos,
                   'results' : results,
                   'embedding' : embedding,
                   'embedding_plus_init' : embedding_plus_init,
                   'sense_feat_dict' : sense_feat_dict,
                   'sense_features' : sense_features,
                   'feature_dict' : feature_dict,
                   'feature_dict_plus_init' : feature_dict_plus_init,
                   'history' : history.history,
                   'history_plus_init' : history_plus_init.history,}

    with open('/scratch/shafi.z/foursquare_cv_linkpred_' + str(idx) + '_.pkl', 'wb') as file:
        pkl.dump(return_dict, file)

#### Embed and Predict - LINE

In [None]:
# for idx in tqdm(range(1, 4)):
    
#     graph = graph_dict['graph_hidden_' + str(idx)]
#     sense_feat_dict = graph_dict['sense_feat_dict']
#     sense_features = graph_dict['sense_features_' + str(idx)]
    
    
#     train = graph_dict['train_' + str(idx)] 
#     train_neg = graph_dict['train_' + str(idx) + '_neg'] 
#     test_pos = graph_dict['test_' + str(idx)]
#     test_neg = graph_dict['test_' + str(idx) + '_neg'] 

    
    
#     # Run LINE 
#     sdne = SDNE_plus(graph, 
#                       hidden_size = [64, 128], 
#                       lr = 1e-3,
#                       sense_features = sense_features.astype(np.float32),
#                       alpha = 0.1, 
#                       beta = 10, 
#                       gamma = 0, 
#                       delta = 0)

#     history = sdne.train(epochs = 50, batch_size = 10000)
#     print ("Training Complete")
#     embed = sdne.get_embeddings()
#     embedding = np.array([embed[node_name] for node_name in graph.nodes()])
#     print ("Embeddings Generated")
    
#     sdne.model.save_weights('/scratch/shafi.z/foursquare_' + str(idx) + '_.model')
#     with open('/scratch/shafi.z/foursquare_embed_' + str(idx) + '_.pkl', 'wb') as file: 
#         pkl.dump(embedding, file)

    
    
#     # Run SDNE+ Init
#     sdne_plus_init = SDNE_plus(graph, 
#                               hidden_size = [64, 128], 
#                               lr = 1e-3,
#                               sense_features = sense_features.astype(np.float32),
#                               alpha = 5, 
#                               beta = 5, 
#                               gamma = 10, 
#                               delta = 5)
    
    
#     sdne_plus_init.model.load_weights('/scratch/shafi.z/foursquare_' + str(idx) + '_.model')
    
#     # sdne_plus_init.model.set_weights(sdne.model.get_weights())
#     history_plus_init = sdne_plus_init.train(epochs = 50, batch_size = 10000)
#     embed_plus = sdne_plus_init.get_embeddings()
#     embedding_plus_init = np.array([embed_plus[node_name] for node_name in graph.nodes()])
    
#     sdne_plus_init.model.save_weights('/scratch/shafi.z/foursquare_plsu_' + str(idx) + '_.model')
#     with open('/scratch/shafi.z/foursquare_embed_plus_' + str(idx) + '_.pkl', 'wb') as file: 
#         pkl.dump(embedding_plus_init, file)
    
    
#     with open('/scratch/shafi.z/foursquare_embed_' + str(idx) + '_.pkl', 'rb') as file: 
#         embedding = pkl.load(file)
        
#     with open('/scratch/shafi.z/foursquare_embed_plus_' + str(idx) + '_.pkl', 'rb') as file: 
#         embedding_plus_init = pkl.load(file)
#     # Compare

#     feature_dict = find_feature_membership(input_embed = embedding,
#                                            embed_name = 'SDNE',
#                                            sense_features = sense_features,
#                                            sense_feat_dict = sense_feat_dict,
#                                            top_k = 8,
#                                            solver = 'nmf')

#     feature_dict_plus_init = find_feature_membership(input_embed = embedding_plus_init,
#                                                      embed_name = 'SDNE+ Init',
#                                                      sense_features = sense_features,
#                                                      sense_feat_dict = sense_feat_dict,
#                                                      top_k = 8,
#                                                      solver = 'nmf')
    
#     # Run link prediction models
#     sdne_results = get_embed_perf(input_embed = embedding,
#                              input_dict = feature_dict,
#                              data = None, 
#                              labels = None, 
#                              graph = graph, 
#                              hidden_edges = test_pos,
#                                   train_set = train,
#                                   train_set_neg = train_neg,
#                                   test_set = test_pos,
#                                   test_set_neg = test_neg, 
#                                     epochs = 100)


#     sdne_plus_init_results = get_embed_perf(input_embed = embedding_plus_init,
#                                            input_dict = feature_dict_plus_init,
#                                            data = None, 
#                                            labels = None, 
#                                            graph = graph, 
#                                            hidden_edges = test_pos,
#                                             train_set = train,
#                                             train_set_neg = train_neg,
#                                             test_set = test_pos,
#                                             test_set_neg = test_neg, 
#                                             epochs = 100)
    
    
#     # Sum of Variances
#     variance = np.sum(np.square(np.std(feature_dict['explain_norm'], axis = 1)))
#     variance_plus_init = np.sum(np.square(np.std(feature_dict_plus_init['explain_norm'], axis = 1)))
    
#     # Other Metrics
#     e = feature_dict['explain_norm']
#     e_plus_init = feature_dict_plus_init['explain_norm']

#     perc_zero = len(np.where(e == 0)[0]) / np.product(e.shape)
#     perc_zero_plus_init = len(np.where(e_plus_init == 0)[0]) / np.product(e_plus_init.shape)

#     ortho = np.sum(e @ e.T)
#     ortho_plus_init = np.sum(e_plus_init @ e_plus_init.T)

#     # Put into pretty dataframes
#     info = pd.DataFrame([[variance_plus_init, variance,], [perc_zero_plus_init, perc_zero], [ortho_plus_init, ortho]])
#     info.index = ['Sum Of Variances', 'Percentage of Zero Entries (Sparsity)', 'Orthogonality']
#     info.columns = ['SDNE+ Init', 'SDNE']

#     results = pd.concat([sdne_plus_init_results.T, sdne_results.T])
#     results.index = ['SDNE+ Init', 'SDNE']
#     results = results.T
#     results = results.append(info)

#     return_dict = {'graph' : graph_dict['graph'], 
#                    'graph_hidden' : graph_hidden,
#                    'hidden_edges' : hidden_edges,
#                    'results' : results, 
#                    'embedding' : embedding,
#                    'embedding_plus_init' : embedding_plus_init,
#                    'sense_feat_dict' : sense_feat_dict,
#                    'sense_features' : sense_features,
#                    'feature_dict' : feature_dict, 
#                    'feature_dict_plus_init' : feature_dict_plus_init,
#                    'history' : history.history, 
#                    'history_plus_init' : history_plus_init.history,}

#     with open('/scratch/shafi.z/foursquare_cv_linkpred_' + str(idx) + '_.pkl', 'wb') as file:
#         pkl.dump(return_dict, file)