### Imports

In [1]:
%run utils.ipynb
%run SDNE+.ipynb
%run LINE+.ipynb
%run Hyperparameters.ipynb

### SDNE vs SDNE+

In [None]:
# Experiment: on the e-mail graph, train SDNE and then SDNE+ (a second
# SDNE_plus model warm-started from the SDNE weights and trained with the
# 'sdne+ init' hyperparameters) for every embedding size in `hyp`, 40 times.
# Per run we keep the nuclear norm of the min-max normalised explanation
# matrix and the summed reconstruction error of the sense features.

def _generalized_kl(target, recon, eps = 1e-10):
    """Element-wise target * log(target / recon) - target + recon.

    `eps` is added to numerator and denominator inside the log so that zero
    entries do not produce log(0) or a division by zero.
    """
    return target * np.log((target + eps) / (recon + eps)) - target + recon


# NOTE(review): unpickling executes arbitrary code -- only load a trusted file.
with open('./email.pkl', 'rb') as file:
    graph_dict = pkl.load(file)

# The pickle may hold the graph directly or wrapped in a dict under 'graph'.
# Trying both forms unconditionally would always raise for the form that does
# not match, so select the one that fits the loaded object.
if isinstance(graph_dict, dict) and 'graph' in graph_dict:
    raw_graph = graph_dict['graph']
else:
    raw_graph = graph_dict
graph = nx.Graph(nx.to_numpy_array(raw_graph))

sense_feat_dict, sense_features = get_sense_features(graph, ppr_flag = 'std')

nuc_list = []
error_list = []

hyp = sdne_email_hyp

for run_idx in tqdm(range(40)):

    # Per-dimension metrics for this run: plain SDNE and SDNE+ respectively.
    models = {x : {} for x in hyp}
    models_p_init = {x : {} for x in hyp}

    for d in hyp:

        # ---- Baseline SDNE -------------------------------------------------
        sdne = SDNE_plus(graph,
                         hidden_size = [32, d],
                         lr = hyp[d]['sdne']['lr'],
                         sense_features = sense_features.astype(np.float32),
                         alpha = hyp[d]['sdne']['alpha'],
                         beta = hyp[d]['sdne']['beta'],
                         gamma = hyp[d]['sdne']['gamma'],
                         delta = hyp[d]['sdne']['delta'])
        history = sdne.train(epochs = 200, batch_size = 1024)
        e = sdne.get_embeddings()
        # Order rows like graph.nodes(), then min-max scale over the whole matrix.
        embed_og = np.array([e[node_name] for node_name in graph.nodes()])
        embed_og = (embed_og - np.min(embed_og)) / np.ptp(embed_og)

        feature_dict_og = find_feature_membership(input_embed = embed_og,
                                                  embed_name = 'SDNE',
                                                  sense_features = sense_features,
                                                  sense_feat_dict = sense_feat_dict,
                                                  top_k = 8,
                                                  solver = 'nmf')

        explain_og = feature_dict_og['explain_norm']
        # The error uses the un-rescaled explanation matrix; build the
        # reconstruction once rather than once per occurrence in the formula.
        recon_og = embed_og @ explain_og
        error_og = _generalized_kl(sense_features, recon_og)

        explain_og = (explain_og - np.min(explain_og)) / np.ptp(explain_og)

        models[d]['e_norm_nuc'] = np.linalg.norm(explain_og, ord = 'nuc')
        models[d]['e_norm_l1'] = np.linalg.norm(explain_og, ord = 1)
        models[d]['e_norm_l2'] = np.linalg.norm(explain_og, ord = 2)
        # Stored under 'entropy', but the value is the per-row standard deviation.
        models[d]['entropy'] = np.std(explain_og, axis = 1)
        models[d]['error'] = np.sum(error_og)

        # ---- SDNE+ warm-started from the SDNE weights -----------------------
        sdne_plus_init = SDNE_plus(graph,
                                   hidden_size = [32, d],
                                   lr = hyp[d]['sdne+ init']['lr'],
                                   sense_features = sense_features.astype(np.float32),
                                   alpha = hyp[d]['sdne+ init']['alpha'],
                                   beta = hyp[d]['sdne+ init']['beta'],
                                   gamma = hyp[d]['sdne+ init']['gamma'],
                                   delta = hyp[d]['sdne+ init']['delta'])

        sdne_plus_init.model.set_weights(sdne.model.get_weights())
        history = sdne_plus_init.train(epochs = hyp[d]['sdne+ init']['epochs'], batch_size = 1024)
        e = sdne_plus_init.get_embeddings()
        embed_plus_init = np.array([e[node_name] for node_name in graph.nodes()])
        embed_plus_init = (embed_plus_init - np.min(embed_plus_init)) / np.ptp(embed_plus_init)

        feature_dict_plus_init = find_feature_membership(input_embed = embed_plus_init,
                                                         embed_name = 'SDNE+ Init',
                                                         sense_features = sense_features,
                                                         sense_feat_dict = sense_feat_dict,
                                                         top_k = 8,
                                                         solver = 'nmf')

        explain_plus_init = feature_dict_plus_init['explain_norm']
        recon_plus_init = embed_plus_init @ explain_plus_init
        error_plus_init = _generalized_kl(sense_features, recon_plus_init)

        explain_plus_init = (explain_plus_init - np.min(explain_plus_init)) / np.ptp(explain_plus_init)

        models_p_init[d]['e_norm_nuc'] = np.linalg.norm(explain_plus_init, ord = 'nuc')
        models_p_init[d]['e_norm_l1'] = np.linalg.norm(explain_plus_init, ord = 1)
        models_p_init[d]['e_norm_l2'] = np.linalg.norm(explain_plus_init, ord = 2)
        models_p_init[d]['entropy'] = np.std(explain_plus_init, axis = 1)
        models_p_init[d]['error'] = np.sum(error_plus_init)

        # Drop the references to both models before the next dimension is trained.
        del sdne
        del sdne_plus_init

    # Row 0 = SDNE, row 1 = SDNE+; columns follow the iteration order of `hyp`.
    nuc_norm = np.array([[models[d]['e_norm_nuc'] for d in hyp],
                         [models_p_init[d]['e_norm_nuc'] for d in hyp]])
    error = np.array([[models[d]['error'] for d in hyp],
                      [models_p_init[d]['error'] for d in hyp]])

    nuc_list.append(nuc_norm)
    error_list.append(error)

    # Rewrite the results file after every run so partial results are kept
    # if a later run is interrupted.
    with open('./email_runs_40_sdne.pkl', 'wb') as file:
        pkl.dump([nuc_list, error_list], file)
        

In [None]:
# Aggregate the SDNE / SDNE+ runs and plot them.
# Arrays are indexed [run, model, dimension] with model 0 = SDNE, 1 = SDNE+.

# Nuclear norm: mean over runs and std / sqrt(number of runs).
nuc_runs = np.array(nuc_list)
nuc_mean = nuc_runs.mean(axis = 0)
nuc_std = nuc_runs.std(axis = 0) / np.sqrt(nuc_runs.shape[0])

# Model cost per embedding size, min-max scaled across the sizes in `hyp`.
n_nodes = len(graph)
model_cost = np.array([32 * d * (n_nodes + 15) for d in hyp])
model_cost = (model_cost - model_cost.min()) / np.ptp(model_cost)

# Description length: min-max scale each (run, model) error row across the
# dimensions, then add the scaled model cost. Rows are views, so the
# assignment updates `error_array` in place.
error_array = np.array(error_list)
for per_run in error_array:
    for row in per_run:
        row[:] = (row - row.min()) / np.ptp(row) + model_cost

error_mean = error_array.mean(axis = 0)
error_std = error_array.std(axis = 0) / np.sqrt(error_array.shape[0])

dims = list(hyp.keys())

# Figure 1: description length, SDNE+ only.
fig = go.Figure()
fig.add_trace(go.Scatter(x = dims,
                         y = error_mean[1],
                         error_y = {'type': 'data', 'array': error_std[1]},
                         name = 'SDNE+',
                         mode = 'markers'))
fig.update_layout(title_text = 'Model Selection - SDNE',
                  xaxis_title_text = '# Dimensions (log scale)',
                  yaxis_title_text = 'Description Length',
                  font = {'size': 18})
fig.update_xaxes(type = 'log')
fig.show()

# Figure 2: nuclear norm, SDNE+ trace first and SDNE second.
fig = go.Figure()
for label, row_idx in (('SDNE+', 1), ('SDNE', 0)):
    fig.add_trace(go.Scatter(x = dims,
                             y = nuc_mean[row_idx],
                             error_y = {'type': 'data', 'array': nuc_std[row_idx]},
                             name = label,
                             mode = 'markers'))
fig.update_layout(title_text = 'Nuclear Norm',
                  xaxis_title_text = '# Dimensions (log scale)',
                  yaxis_title_text = 'Normalized Nuclear Norm',
                  font = {'size': 18})
fig.update_xaxes(type = 'log')
fig.show()

### LINE vs LINE+

In [None]:
# Experiment: on the e-mail graph, train LINE and then LINE+ (a second LINE
# model warm-started from the LINE weights and trained with the 'line+ init'
# hyperparameters) for every embedding size in `hyp`, 40 times. Per run we
# keep the nuclear norm of the min-max normalised explanation matrix and the
# summed reconstruction error of the sense features.

def _generalized_kl(target, recon, eps = 1e-10):
    """Element-wise target * log(target / recon) - target + recon.

    `eps` is added to numerator and denominator inside the log so that zero
    entries do not produce log(0) or a division by zero.
    """
    return target * np.log((target + eps) / (recon + eps)) - target + recon


# NOTE(review): unpickling executes arbitrary code -- only load a trusted file.
with open('./email.pkl', 'rb') as file:
    graph_dict = pkl.load(file)

# The pickle may hold the graph directly or wrapped in a dict under 'graph'.
# Trying both forms unconditionally would always raise for the form that does
# not match, so select the one that fits the loaded object.
if isinstance(graph_dict, dict) and 'graph' in graph_dict:
    raw_graph = graph_dict['graph']
else:
    raw_graph = graph_dict
graph = nx.Graph(nx.to_numpy_array(raw_graph))

sense_feat_dict, sense_features = get_sense_features(graph, ppr_flag = 'std')

hyp = line_email_hyp

nuc_list = []
error_list = []

for run_idx in tqdm(range(40)):

    # Per-dimension metrics for this run: plain LINE and LINE+ respectively.
    models = {x : {} for x in hyp}
    models_p_init = {x : {} for x in hyp}

    for d in tqdm(hyp):

        # ---- Baseline LINE -------------------------------------------------
        line = LINE(graph,
                    embedding_size = d,
                    sense_features = sense_features,
                    alpha = hyp[d]['line']['alpha'],
                    ortho = hyp[d]['line']['ortho'],
                    sparse = hyp[d]['line']['sparse'],
                    learning_rate = hyp[d]['line']['lr'],
                    order = 'second',
                    batch_size = len(graph))

        history = line.train(epochs = 50)

        e = line.get_embeddings()
        # Order rows like graph.nodes(), then min-max scale over the whole matrix.
        embed_og = np.array([e[node_name] for node_name in graph.nodes()])
        embed_og = (embed_og - np.min(embed_og)) / np.ptp(embed_og)

        feature_dict_og = find_feature_membership(input_embed = embed_og,
                                                  embed_name = 'LINE',
                                                  sense_features = sense_features,
                                                  sense_feat_dict = sense_feat_dict,
                                                  top_k = 8,
                                                  solver = 'nmf')

        explain_og = feature_dict_og['explain_norm']
        # The error uses the un-rescaled explanation matrix; build the
        # reconstruction once rather than once per occurrence in the formula.
        recon_og = embed_og @ explain_og
        error_og = _generalized_kl(sense_features, recon_og)

        explain_og = (explain_og - np.min(explain_og)) / np.ptp(explain_og)

        models[d]['e_norm_nuc'] = np.linalg.norm(explain_og, ord = 'nuc')
        models[d]['e_norm_l1'] = np.linalg.norm(explain_og, ord = 1)
        models[d]['e_norm_l2'] = np.linalg.norm(explain_og, ord = 2)
        # Stored under 'entropy', but the value is the per-row standard deviation.
        models[d]['entropy'] = np.std(explain_og, axis = 1)
        models[d]['error'] = np.sum(error_og)

        # ---- LINE+ warm-started from the LINE weights -----------------------
        line_plus_init = LINE(graph,
                              embedding_size = d,
                              sense_features = sense_features,
                              alpha = hyp[d]['line+ init']['alpha'],
                              ortho = hyp[d]['line+ init']['ortho'],
                              sparse = hyp[d]['line+ init']['sparse'],
                              learning_rate = hyp[d]['line+ init']['lr'],
                              order = 'second',
                              batch_size = len(graph))

        line_plus_init.model.set_weights(line.model.get_weights())
        history = line_plus_init.train(epochs = hyp[d]['line+ init']['epochs'])

        e = line_plus_init.get_embeddings()
        embed_plus_init = np.array([e[node_name] for node_name in graph.nodes()])
        embed_plus_init = (embed_plus_init - np.min(embed_plus_init)) / np.ptp(embed_plus_init)

        feature_dict_plus_init = find_feature_membership(input_embed = embed_plus_init,
                                                         embed_name = 'LINE+ Init',
                                                         sense_features = sense_features,
                                                         sense_feat_dict = sense_feat_dict,
                                                         top_k = 8,
                                                         solver = 'nmf')

        explain_plus_init = feature_dict_plus_init['explain_norm']
        recon_plus_init = embed_plus_init @ explain_plus_init
        error_plus_init = _generalized_kl(sense_features, recon_plus_init)

        explain_plus_init = (explain_plus_init - np.min(explain_plus_init)) / np.ptp(explain_plus_init)

        models_p_init[d]['e_norm_nuc'] = np.linalg.norm(explain_plus_init, ord = 'nuc')
        models_p_init[d]['e_norm_l1'] = np.linalg.norm(explain_plus_init, ord = 1)
        models_p_init[d]['e_norm_l2'] = np.linalg.norm(explain_plus_init, ord = 2)
        models_p_init[d]['entropy'] = np.std(explain_plus_init, axis = 1)
        models_p_init[d]['error'] = np.sum(error_plus_init)

        # Drop the references to both models before the next dimension is trained.
        del line
        del line_plus_init

    # Row 0 = LINE, row 1 = LINE+; columns follow the iteration order of `hyp`.
    nuc_norm = np.array([[models[d]['e_norm_nuc'] for d in hyp],
                         [models_p_init[d]['e_norm_nuc'] for d in hyp]])
    error = np.array([[models[d]['error'] for d in hyp],
                      [models_p_init[d]['error'] for d in hyp]])

    nuc_list.append(nuc_norm)
    error_list.append(error)

    # Rewrite the results file after every run so partial results are kept
    # if a later run is interrupted.
    with open('./email_runs_40_line.pkl', 'wb') as file:
        pkl.dump([nuc_list, error_list], file)

In [None]:
# Aggregate the LINE / LINE+ runs and plot them.
# Arrays are indexed [run, model, dimension] with model 0 = LINE, 1 = LINE+.

# Nuclear norm: mean over runs and std / sqrt(number of runs).
nuc_runs = np.array(nuc_list)
nuc_mean = nuc_runs.mean(axis = 0)
nuc_std = nuc_runs.std(axis = 0) / np.sqrt(nuc_runs.shape[0])

# Model cost per embedding size, min-max scaled across the sizes in `hyp`.
n_nodes = len(graph)
model_cost = np.array([32 * d * (n_nodes + 15) for d in hyp])
model_cost = (model_cost - model_cost.min()) / np.ptp(model_cost)

# Description length: min-max scale each (run, model) error row across the
# dimensions, then add the scaled model cost. Rows are views, so the
# assignment updates `error_array` in place.
error_array = np.array(error_list)
for per_run in error_array:
    for row in per_run:
        row[:] = (row - row.min()) / np.ptp(row) + model_cost

error_mean = error_array.mean(axis = 0)
error_std = error_array.std(axis = 0) / np.sqrt(error_array.shape[0])

dims = list(hyp.keys())

# Figure 1: description length, LINE+ only.
fig = go.Figure()
fig.add_trace(go.Scatter(x = dims,
                         y = error_mean[1],
                         error_y = {'type': 'data', 'array': error_std[1]},
                         name = 'LINE+',
                         mode = 'markers'))
fig.update_layout(title_text = 'Model Selection - LINE',
                  xaxis_title_text = '# Dimensions (log scale)',
                  yaxis_title_text = 'Description Length',
                  font = {'size': 18})
fig.update_xaxes(type = 'log')
fig.show()

# Figure 2: nuclear norm, LINE+ trace first and LINE second.
fig = go.Figure()
for label, row_idx in (('LINE+', 1), ('LINE', 0)):
    fig.add_trace(go.Scatter(x = dims,
                             y = nuc_mean[row_idx],
                             error_y = {'type': 'data', 'array': nuc_std[row_idx]},
                             name = label,
                             mode = 'markers'))
fig.update_layout(title_text = 'Nuclear Norm',
                  xaxis_title_text = '# Dimensions (log scale)',
                  yaxis_title_text = 'Normalized Nuclear Norm',
                  font = {'size': 18})
fig.update_xaxes(type = 'log')
fig.show()