### Imports

In [1]:
import numpy as np
import networkx as nx
import igraph
import matplotlib.pyplot as plt
import tensorflow as tf
import plotly.graph_objects as go
import pandas as pd
import pickle as pkl

from tqdm import tqdm
from sklearn.decomposition import NMF, non_negative_factorization
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import average_precision_score, roc_auc_score

from IPython.display import Image
from plotly.subplots import make_subplots
from scipy.stats import wasserstein_distance

from tensorflow.keras import layers, models, Model, Sequential
from tensorflow.keras.layers import Dense, Flatten, Input, Dropout, Concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

%matplotlib inline

### Graph Generators

In [2]:
def add_weights(G, weights):

    '''
        Attach a 'weight' attribute to every edge of a graph, in place
        Input : 
            G       : nx Graph object - Graph whose edges receive weights
            weights : String - 'Poisson' draws 1 + Poisson(20) per edge,
                               'Uniform' draws an integer from 1 to 41 per edge,
                               any other value gives every edge a weight of 1
    '''

    edge_count = G.number_of_edges()

    # One draw per edge; the +1 shift keeps every sampled weight strictly positive
    if weights == 'Poisson':
        edge_weights = np.random.poisson(20, (edge_count)) + 1
    elif weights == 'Uniform':
        edge_weights = np.random.randint(41, size = (edge_count)) + 1
    else:
        edge_weights = np.ones((edge_count))

    # Edges and weights are paired up by iteration order
    for edge, value in zip(G.edges, edge_weights):
        G.edges[edge]['weight'] = value

In [None]:
def generate_graph(graph_type, num_nodes, param, weights):

    '''
        Build a synthetic graph of the requested family and attach edge weights
        Input : 
            graph_type : String - er | ba | ws | lattice | complete
            num_nodes  : int - Number of nodes; for 'lattice' this is the side length
                         of a num_nodes x num_nodes grid
            param      : Sequence - Generator parameters
                             er : param[0] is the edge probability p
                             ba : param[0] is the number of edges m per new node
                             ws : param[0] is the neighbor count k, param[1] the rewiring probability p
                             lattice, complete : not read
            weights    : String - Poisson | Uniform | Equal, forwarded to add_weights
        Output : 
            nx Graph object with a 'weight' attribute on every edge
        Raises : 
            AssertionError if weights is not one of the three supported schemes
            ValueError if graph_type is not recognised
    '''

    assert weights in ['Poisson', 'Uniform', 'Equal'], 'weights must be one of : Poisson, Uniform, Equal'

    # Erdos-Renyi Graph
    if graph_type == 'er':
        graph = nx.erdos_renyi_graph(n = num_nodes, p = param[0])

    # Barabasi-Albert Graph
    elif graph_type == 'ba':
        graph = nx.barabasi_albert_graph(n = num_nodes, m = param[0])

    # Watts-Strogatz Graph
    elif graph_type == 'ws':
        graph = nx.watts_strogatz_graph(n = num_nodes, k = param[0], p = param[1])

    # Lattice Graph - rebuilt from the adjacency matrix, which replaces the
    # (row, col) tuple labels of the grid with integer node labels
    elif graph_type == 'lattice':
        graph = nx.Graph(nx.adjacency_matrix(nx.grid_2d_graph(num_nodes, num_nodes)))

    # Complete Graph
    elif graph_type == 'complete':
        graph = nx.complete_graph(num_nodes)

    else:
        # A bare `raise` here has no active exception to re-raise and would surface
        # as an unrelated RuntimeError, so raise an explicit error with the message
        raise ValueError('Invalid graph name ' + repr(graph_type) + '. Please try one of : er, ba, ws, lattice, complete')

    # Every family gets its weights the same way
    add_weights(graph, weights)

    return graph

### Sense Features

In [4]:
def get_sense_features(graph, ppr_flag = 'std', weighted = False):

    '''
        Compute a matrix of interpretable ("sense") structural features, one row per node
        Input : 
            graph    : nx Graph object - Input graph. Several features come from an igraph
                       copy built from the raw edge endpoints, so this presumably expects
                       node labels to be the integers 0 .. len(graph) - 1 (TODO confirm with callers)
            ppr_flag : String - 'mean' stores the mean personalized PageRank vector,
                       any other value stores the median and standard deviation instead
            weighted : Bool - Also include the weighted degree (edge attribute 'weight')
        Output : 
            sense_feat_dict : dict - Feature name -> column index in sense_features
            sense_features  : np array of shape (len(graph), len(sense_feat_dict)),
                              every column min-max scaled; constant columns become 0
    '''

    num_nodes = len(graph)
    use_mean_ppr = (ppr_flag == 'mean')

    is_disconnected = len(list(nx.algorithms.components.connected_components(graph))) > 1
    if is_disconnected:
        print ("Disconnected Network")
    if use_mean_ppr:
        print ("Using Means For PPR")

    # Build the column layout from the flags. 'Weighted Degree' is only present when it
    # is actually computed; an unfilled column would turn into NaN during scaling.
    feature_names = ['Degree']
    if weighted:
        feature_names.append('Weighted Degree')
    feature_names.append('Clustering Coefficient')
    if use_mean_ppr:
        feature_names.append('Personalized Page Rank - Mean')
    else:
        feature_names.append('Personalized Page Rank - Median')
        feature_names.append('Personalized Page Rank - Standard Deviation')
    feature_names += ['Structural Holes Constraint',
                      'Average Neighbor Degree',
                      'EgoNet Edges',
                      'Average Neighbor Clustering',
                      'Node Betweenness',
                      'Page Rank']
    # Layout kept from the earlier hard-coded tables: eccentricity is left out for
    # disconnected graphs, except in the 'mean' variant which always carried it
    if use_mean_ppr or not is_disconnected:
        feature_names.append('Eccentricity')
    feature_names += ['Degree Centrality', 'Eigen Centrality', 'Katz Centrality']

    sense_feat_dict = {name : idx for idx, name in enumerate(feature_names)}

    def _mean_or_zero(values):
        # Nodes without neighbors get 0 rather than the NaN np.mean gives for an empty list
        return np.mean(values) if len(values) > 0 else 0.0

    # Passing n keeps trailing isolated nodes, which an edge list alone cannot express
    ig = igraph.Graph(n = num_nodes, edges = [[e[0], e[1]] for e in nx.to_edgelist(graph)])
    sense_features = np.zeros((num_nodes, len(sense_feat_dict)))

    print ("Calculating Degrees...                                   ", end = '\r')
    # Degree
    sense_features[:, sense_feat_dict['Degree']] = list(dict(graph.degree).values())

    if weighted: 
        print ("Calculating Weighted Degrees...                           ", end = '\r')
        # Weighted Degree
        sense_features[:, sense_feat_dict['Weighted Degree']] = list(dict(graph.degree(weight = 'weight')).values())
    
    print ("Calculating Average Neighbor Degree...                    ", end = '\r')
    # Neighbor Degree Average
    sense_features[:, sense_feat_dict['Average Neighbor Degree']] = [_mean_or_zero([graph.degree[neighbor] for neighbor in graph[node]]) for node in graph.nodes]

    print ("Calculating Clustering Coefficient...                     ", end = '\r')
    # Clustering Coefficient
    cluster_dict = nx.clustering(graph)
    sense_features[:, sense_feat_dict['Clustering Coefficient']] = list(cluster_dict.values())

    print ("Calculating Average Neighbor Clustering Coefficients...   ", end = '\r')
    # Neighbor Average Clustering 
    sense_features[:, sense_feat_dict['Average Neighbor Clustering']] = [_mean_or_zero([cluster_dict[neighbor] for neighbor in graph[node]]) for node in graph.nodes]
    
    # Eccentricity - only when the layout has a column for it; stays best effort
    if 'Eccentricity' in sense_feat_dict:
        print ("Calculating Eccentricity...                               ", end = '\r')
        try:
            sense_features[:, sense_feat_dict['Eccentricity']] = ig.eccentricity()
        except Exception as e: 
            print ("Could not compute Eccentricity : ", e)
    
    print ("Calculating Page Rank...                                  ", end = '\r')
    # Page Rank
    sense_features[:, sense_feat_dict['Page Rank']] = ig.pagerank(directed = False)
    
    print ("Calculating Personalized Page Rank...                     ", end = '\r')
    
    if use_mean_ppr:
        # Average the personalized PageRank vectors obtained by restarting at each node in turn
        ppr_sum = np.zeros(num_nodes)
        for node in tqdm(range(num_nodes)):
            r = np.zeros(num_nodes)
            r[node] = 1
            ppr_sum = ppr_sum + ig.personalized_pagerank(reset = r, directed = False)
        sense_features[:, sense_feat_dict['Personalized Page Rank - Mean']] = ppr_sum / num_nodes
        
    else: 
        # Row i holds the personalized PageRank vector for a restart at node i;
        # the statistics are taken down each column
        ppr = np.zeros((num_nodes, num_nodes))
        for node in tqdm(range(num_nodes)):
            r = np.zeros(num_nodes)
            r[node] = 1
            ppr[node, :] = ig.personalized_pagerank(reset = r, directed = False)

        sense_features[:, sense_feat_dict['Personalized Page Rank - Standard Deviation']] = np.std(ppr, axis = 0)
        sense_features[:, sense_feat_dict['Personalized Page Rank - Median']] = np.median(ppr, axis = 0)
    
    print ("Calculating Node Betweenness...                           ", end = '\r')
    # Node Betweenness 
    sense_features[:, sense_feat_dict['Node Betweenness']] = ig.betweenness(directed = False)

    print ("Calculating Number Of Edges In Ego Nets...                ", end = '\r')
    # EgoNet Edges
    sense_features[:, sense_feat_dict['EgoNet Edges']] = [len(nx.ego_graph(graph, n = node).edges) for node in graph.nodes]

    print ("Calculating Structural Hole Constraint Scores...         ", end = '\r')
    # Structual Holes
    sense_features[:, sense_feat_dict['Structural Holes Constraint']] = ig.constraint()

    print ("Calculating Degree Centrality...                         ", end = '\r')
    sense_features[:, sense_feat_dict['Degree Centrality']] = list(dict(nx.degree_centrality(graph)).values())
    
    print ("Calculating Eigen Centrality...                          ", end = '\r')
    sense_features[:, sense_feat_dict['Eigen Centrality']] = ig.eigenvector_centrality(directed = False)
    
    print ("Calculating Katz Centrality...                           ", end = '\r')
    sense_features[:, sense_feat_dict['Katz Centrality']] = list(dict(nx.katz_centrality_numpy(graph)).values())
    
    print ("Normalizing Features Between 0 And 1...                   ", end = '\r')
    # Normalise to between 0 and 1. A constant column has zero range; dividing by 1
    # instead maps it to all zeros rather than NaN.
    col_min = np.min(sense_features, axis = 0)
    col_range = np.ptp(sense_features, axis = 0)
    col_range[col_range == 0] = 1
    sense_features = (sense_features - col_min) / col_range
    
    print ("Done                                                      ", end = '\r')
    
    return sense_feat_dict, sense_features
    

In [5]:
def get_positional_sense_features(graph, num_anchors, anchor_list = None):

    '''
        Compute positional features for every node relative to a set of anchor nodes
        Input : 
            graph       : nx Graph object - Input graph. NOTE: self loops are removed from
                          it in place. Hop counts are looked up by the integers
                          0 .. len(graph) - 1, so node labels are presumably expected
                          to be exactly those integers (TODO confirm with callers)
            num_anchors : int - Number of anchors to sample when anchor_list is None
            anchor_list : Sequence - Optional explicit anchors; when given, its length
                          decides the number of anchor columns and num_anchors is not used
        Output : 
            sense_feat_dict : dict - Feature name -> column index in sense_features
            sense_features  : np array of shape (len(graph), 1 + 2 * number of anchors) :
                              core number, then PPR to each anchor, then hops to each anchor,
                              every column min-max scaled; constant columns become 0
    '''

    # Self loops are dropped before the core decomposition (presumably because
    # nx.core_number rejects them). Materialise the list first so the graph is not
    # edited while it is being iterated.
    graph.remove_edges_from(list(nx.selfloop_edges(graph)))

    num_nodes = len(graph)
    core_numbers = np.array(list(dict(nx.core_number(graph)).values()))

    if anchor_list is None:
        # Sample anchors with probability proportional to core number (with replacement)
        core_p = core_numbers / np.sum(core_numbers)
        core_anchors = np.random.choice(num_nodes, p = core_p, size = num_anchors)
    else: 
        core_anchors = list(anchor_list)

    # Size the matrix from the anchors actually used, so an anchor_list whose length
    # differs from num_anchors neither overflows the matrix nor leaves empty columns
    anchor_count = len(core_anchors)

    feature_names = []
    sense_features = np.zeros((num_nodes, 1 + (2 * anchor_count)))

    # Passing n keeps trailing isolated nodes, which an edge list alone cannot express
    ig = igraph.Graph(n = num_nodes, edges = [[e[0], e[1]] for e in nx.to_edgelist(graph)])

    print ("Computing Core Number...", end = '\r')
    sense_features[:, len(feature_names)] = core_numbers
    feature_names.append("Core Number")

    print ("Computing PPR to Core Random Nodes...", end = '\r')
    for idx, node in tqdm(enumerate(core_anchors)):
        r = np.zeros(num_nodes)
        r[node] = 1
        sense_features[:, len(feature_names)] = ig.personalized_pagerank(reset = r, directed = False)
        feature_names.append("PPR To Random Node " + str(idx)) 

    print ("Computing Hops to Core Random Nodes...", end = '\r')
    for idx, node in tqdm(enumerate(core_anchors)):
        sp_ = nx.single_source_shortest_path_length(graph, source = node)
        # Nodes in another component are missing from sp_; give them num_nodes hops,
        # which is larger than any real shortest path (at most num_nodes - 1)
        sense_features[:, len(feature_names)] = [sp_.get(n, num_nodes) for n in range(num_nodes)]
        feature_names.append("Hops To Random Node " + str(idx))

    print ("Normalizing Features Between 0 And 1...                   ", end = '\r')
    # Normalise to between 0 and 1. A constant column (e.g. the core number of a
    # regular graph) has zero range; dividing by 1 maps it to zeros rather than NaN.
    col_min = np.min(sense_features, axis = 0)
    col_range = np.ptp(sense_features, axis = 0)
    col_range[col_range == 0] = 1
    sense_features = (sense_features - col_min) / col_range

    sense_feat_dict = {feature_names[idx] : idx for idx in range(len(feature_names))}
    
    return sense_feat_dict, sense_features

### Sense Making

In [6]:
def _fit_nonnegative_gd(left, target, init_value, gd_steps, constraints = False):

    '''
        Projected gradient descent for  min || target - left @ factor ||  with factor >= 0
        Input : 
            left        : tf tensor / variable - Fixed left factor
            target      : tf tensor / variable - Matrix to reconstruct
            init_value  : np array - Starting value of the trainable factor
            gd_steps    : int - Number of optimisation steps
            constraints : Bool - Add the two extra penalty terms on the factor to the loss
        Output : 
            factor : tf Variable - Learned non-negative factor
            loss   : Loss evaluated at the start of the last step (None when gd_steps is 0)
    '''

    factor = tf.Variable(initial_value = init_value,
                         shape = init_value.shape,
                         dtype = tf.float32, trainable = True)

    optimizer = tf.keras.optimizers.Nadam(learning_rate = 0.001)

    loss = None
    for _ in tqdm(range(gd_steps)):

        with tf.GradientTape() as tape:

            prod = tf.matmul(left, factor)
            loss = tf.norm(target - prod, ord = 2)

            if constraints == True:
                loss = loss + (0.5 * tf.linalg.norm(tf.matmul(factor, factor, transpose_b = True))) + (0.5 * tf.math.reduce_sum(tf.linalg.norm(factor, axis = 0)))

        gradients = tape.gradient(loss, [factor])
        optimizer.apply_gradients(zip(gradients, [factor]))

        # Projection step : clip negative entries back to zero after every update
        factor.assign(tf.clip_by_value(factor, clip_value_min = 0, clip_value_max = tf.math.reduce_max(factor)))

    return factor, loss


def find_feature_membership(input_embed, embed_name, sense_features, sense_feat_dict, top_k = 8, gd_steps = 1000, solver = 'nmf', plot = False, constraints = False):

    '''
        Explain embedding dimensions in terms of sense features by solving
        sense_features ~ input_embed @ explain with explain >= 0, then dividing by the
        solution obtained for a constant all-ones one-dimensional embedding
        Input : 
            input_embed     : np array (nodes x embedding dimensions)
            embed_name      : String - Only used in plot titles
            sense_features  : np array (nodes x sense features)
            sense_feat_dict : dict - Feature name -> column index; its keys label the plots
            top_k           : int - How many highest-variance dimensions to report
            gd_steps        : int - Optimisation steps for the 'gd' solver
            solver          : String - gd | nmf
            plot            : Bool - Show the two plotly figures
            constraints     : Bool - Extra penalty terms, 'gd' solver only
        Output : 
            dict with keys explain, explain_norm, explain_default,
            dimensions_idx_to_keep, top_k_dims, reconstruction_loss
        Raises : 
            ValueError for an unknown solver, or for gd_steps < 1 with the 'gd' solver
    '''

    # Fail early with a clear message; an unknown solver used to fall through both
    # branches and crash later on an undefined variable
    if solver not in ('gd', 'nmf'):
        raise ValueError('Unknown solver ' + repr(solver) + '. Please use one of : gd, nmf')

    if solver == 'gd':

        if gd_steps < 1:
            raise ValueError('gd_steps must be at least 1 for the gd solver')

        # Input embedding - fixed
        embeddings = tf.Variable(initial_value = input_embed,
                                 shape = input_embed.shape,
                                 dtype = tf.float32, trainable = False)

        # Explainable features - fixed
        sense = tf.Variable(initial_value = sense_features,
                            shape = sense_features.shape,
                            dtype = tf.float32, trainable = False)

        # Matrix explaining membership - trainable
        explain, reconstruction_loss = _fit_nonnegative_gd(embeddings,
                                                           sense,
                                                           np.random.randn(input_embed.shape[1], sense_features.shape[1]),
                                                           gd_steps,
                                                           constraints)
        print ("Reconstruction Loss : ", float(reconstruction_loss))

        # Default embed vector - assume one dimension only - non trainable
        embeddings_default = tf.Variable(initial_value = np.ones((input_embed.shape[0], 1)),
                                         shape = (input_embed.shape[0], 1),
                                         dtype = tf.float32, trainable = False)

        # Default explanation vector - trainable, fitted without the extra penalties
        explain_default, default_loss = _fit_nonnegative_gd(embeddings_default,
                                                            sense,
                                                            np.random.randn(1, sense_features.shape[1]),
                                                            gd_steps)
        print ("Default Loss : ", float(default_loss))

    else:

        # Ensure proper dtypes
        sense_features = sense_features.astype(np.float32)
        input_embed = input_embed.astype(np.float32)

        # The embedding is held fixed as H (update_H = False), so everything is
        # transposed : sense.T ~ explain.T @ embed.T, and the returned W is explain.T
        explain, _, _ = non_negative_factorization(n_components = input_embed.shape[1],
                                                   init = 'custom',
                                                   max_iter = 4000,
                                                   X = sense_features.T,
                                                   H = input_embed.T,
                                                   update_H = False)
        explain = explain.T

        reconstruction_loss = np.linalg.norm(sense_features - (input_embed @ explain))

        # Same problem for the constant one-dimensional baseline embedding
        default_embed = np.ones((input_embed.shape[0], 1)).astype(np.float32)
        explain_default, _, _ = non_negative_factorization(n_components = default_embed.shape[1],
                                                           init = 'custom',
                                                           max_iter = 2000,
                                                           X = sense_features.T,
                                                           H = default_embed.T,
                                                           update_H = False)
        explain_default = explain_default.T

    # Normalize matrix by the default matrix learned (row-wise broadcast)
    explain_norm = np.array(explain / explain_default)
    explain_variance = np.square(np.std(explain_norm, axis = 1))

    # Plot variance in explanability of each dimension
    embed_dimensions = input_embed.shape[1]

    if plot: 
        fig = go.Figure()
        fig.add_trace(go.Bar(x = list(range(embed_dimensions)), 
                             y = explain_variance,
                             name = 'Variance of Embedding Dimensions'))
        fig.add_trace(go.Scatter(x = list(range(embed_dimensions)), 
                                 y = [np.mean(explain_variance)] * embed_dimensions, 
                                 mode = 'lines', 
                                 name = 'Mean of Variance'))
        fig.add_trace(go.Scatter(x = list(range(embed_dimensions)), 
                                 y = [np.median(explain_variance)] * embed_dimensions, 
                                 mode = 'lines', 
                                 name = 'Median of Variance'))
        fig.update_layout(title_text = 'Variance of Explanability Across Dimensions - ' + embed_name,
                          xaxis_title_text = 'Dimensions', 
                          yaxis_title_text = 'Variance')
        fig.show()

    # Figure out which dimensions to keep - ones with above-average variance
    dimensions_idx_to_keep = np.where(explain_variance > np.mean(explain_variance))[0]
    dimensions_to_keep = np.array(explain_norm)[dimensions_idx_to_keep]

    # Highest-variance dimensions, in ascending order of variance. A plain [-top_k:]
    # slice would return every dimension for top_k = 0, so that case is handled apart.
    variance_order = np.argsort(explain_variance)
    top_k_dims = variance_order[-top_k:] if top_k > 0 else variance_order[:0]

    # Plot membership of sense features vs remaining dimensions
    features = list(sense_feat_dict.keys())

    if plot: 
        fig = go.Figure()

        for idx in range(len(dimensions_to_keep)):
            fig.add_trace(go.Bar(x = features, 
                                 y = dimensions_to_keep[idx],
                                 name = 'Dimension ' + str(dimensions_idx_to_keep[idx])))

        fig.update_layout(title_text = 'Embedding Dimension Feature Membership - ' + embed_name,
                          xaxis_title_text = 'Sense Features',
                          yaxis_title_text = 'Membership',
                          barmode = 'group')
        fig.show()

    return_dict = {
        'explain' : explain,
        'explain_norm' : explain_norm,
        'explain_default' : explain_default,
        'dimensions_idx_to_keep' : dimensions_idx_to_keep,
        'top_k_dims' : top_k_dims,
        'reconstruction_loss' : reconstruction_loss
    }

    return return_dict



### Link Prediction

In [7]:
def decoder_model(input_shape):

    '''
        Build the link decoder : two node embeddings in, a 2-way softmax out
        Input : 
            input_shape : tuple - Shape of a single node embedding
        Output : 
            Keras Model taking [node_a, node_b] and returning a 2-unit softmax
    '''

    # One input per endpoint of the candidate edge
    endpoint_inputs = [Input(shape = input_shape), Input(shape = input_shape)]

    merged = Concatenate()(endpoint_inputs)
    hidden = Dense(64, activation = 'relu')(merged)
    link_probs = Dense(2, activation = 'softmax')(hidden)

    return Model(inputs = endpoint_inputs, outputs = link_probs)

In [8]:
def get_embed_perf(input_embed, input_dict, data = None, labels = None, graph = None, epochs = 200, hidden_edges = None, train_set = None, train_set_neg = None, test_set = None, test_set_neg = None):

    '''
        Compare link prediction performance of the full embedding against the
        embedding restricted to input_dict['dimensions_idx_to_keep']
        Input : 
            input_embed : np array (nodes x dimensions)
            input_dict  : dict - Must contain 'dimensions_idx_to_keep'
            remaining arguments are forwarded unchanged to get_link_perf
        Output : 
            pd DataFrame with a single 'Values' column and ten labelled rows
            (five metrics for all dimensions, five for the thresholded ones)
    '''

    def _evaluate(embed):
        # get_link_perf returns (..., auc, aup); the table lists AUP before AUC
        train_acc, eval_acc, embed_dim, auc, aup = get_link_perf(input_embed = embed,
                                                                 data = data,
                                                                 labels = labels,
                                                                 graph = graph,
                                                                 hidden_edges = hidden_edges,
                                                                 train_set = train_set,
                                                                 train_set_neg = train_set_neg,
                                                                 test_set = test_set,
                                                                 test_set_neg = test_set_neg,
                                                                 epochs = epochs)
        return [train_acc, eval_acc, embed_dim, aup, auc]

    # All Dimensions 
    all_metrics = _evaluate(input_embed)

    # Important Dimensions 
    imp_metrics = _evaluate(input_embed[:, input_dict['dimensions_idx_to_keep']])

    row_labels = [prefix + ' - ' + suffix
                  for suffix in ('All', 'Thresholded')
                  for prefix in ('Training Accuracy', 'Test Accuracy', 'Embedding Dimensions', 'AUP', 'AUC')]

    values = np.zeros((len(row_labels), 1))
    values[:, 0] = all_metrics + imp_metrics

    return pd.DataFrame(values, index = row_labels, columns = ['Values'])

In [9]:
def get_link_perf(input_embed, graph = None, hidden_edges = None, data = None, labels = None, train_set = None, train_set_neg = None, test_set = None, test_set_neg = None, epochs = 200, learning_rate = 0.001, train_size = 0.7, display_results = False, return_model = False, random_state = 2021):

    '''
        Train the link decoder on pairs of node embeddings and report its performance
        Input : 
            input_embed     : np array (nodes x dimensions), indexed by node id
            graph           : Not used by this function
            hidden_edges    : When None, (data, labels) are split randomly; otherwise the
                              four explicit train / test edge sets are used
            data            : np array of node pairs - Used when hidden_edges is None
            labels          : Labels for data, one-hot encoded via to_categorical
            train_set       : Positive training pairs
            train_set_neg   : Negative training pairs
            test_set        : Positive test pairs
            test_set_neg    : Negative test pairs
            epochs          : int - Training epochs
            learning_rate   : float - Adam learning rate
            train_size      : float - Train fraction for the random split
            display_results : Not used by this function
            return_model    : Bool - Also return the trained model
            random_state    : int - Seed for the random split
        Output : 
            train_acc, eval_acc, embed_dim, auc, aup (and the model when return_model is set)
    '''

    embed_dim = input_embed.shape[1]

    if hidden_edges is None:

        # Look up the embedding of both endpoints of every pair
        X_0 = np.zeros((data.shape[0], embed_dim))
        X_1 = np.zeros((data.shape[0], embed_dim))

        for idx in tqdm(range(len(data))): 

            node_0 = data[idx][0]
            node_1 = data[idx][1]

            X_0[idx, :] = input_embed[node_0]
            X_1[idx, :] = input_embed[node_1]

        Y = to_categorical(labels)

        X_0_train, X_0_test, X_1_train, X_1_test, y_train, y_test = train_test_split(X_0,
                                                                                     X_1,
                                                                                     Y,
                                                                                     train_size = train_size,
                                                                                     shuffle = True, 
                                                                                     random_state = random_state)
    else: 

        # The training negatives must be train_set_neg; passing the test negatives
        # here would leak test pairs into training
        X_0_train, X_0_test, X_1_train, X_1_test, y_train, y_test = generate_link_data(input_embed = input_embed,
                                                                                       train_set = train_set,
                                                                                       train_set_neg = train_set_neg,
                                                                                       test_set = test_set,
                                                                                       test_set_neg = test_set_neg)

    model = decoder_model(input_shape = (embed_dim,))
    model.compile(loss = tf.keras.losses.binary_crossentropy,
                  optimizer = tf.keras.optimizers.Adam(learning_rate),
                  metrics = ["accuracy"])

    history = model.fit([X_0_train, X_1_train], y_train, epochs = epochs)
    eval_loss, eval_acc = model.evaluate([X_0_test, X_1_test], y_test)

    # Training accuracy of the final epoch
    train_acc = history.history['accuracy'][-1]

    # Both scores are computed on the two-column one-hot labels and softmax outputs
    y_pred = model.predict([X_0_test, X_1_test])
    auc = roc_auc_score(y_test, y_pred)
    aup = average_precision_score(y_test, y_pred)

    if return_model:
        return train_acc, eval_acc, embed_dim, auc, aup, model

    return train_acc, eval_acc, embed_dim, auc, aup
    
    

In [10]:
def generate_link_data(input_embed, train_set, train_set_neg, test_set, test_set_neg):

    '''
        Turn positive / negative node pairs into decoder inputs and one-hot labels
        Input : 
            input_embed   : np array (nodes x dimensions), indexed by node id
            train_set     : Positive training pairs
            train_set_neg : Negative training pairs
            test_set      : Positive test pairs
            test_set_neg  : Negative test pairs
        Output : 
            X_0_train, X_0_test, X_1_train, X_1_test, Y_train, Y_test
            X_0 / X_1 hold the embeddings of the first / second node of each pair,
            positives first and negatives after them
    '''

    pos_train, neg_train = np.array(train_set), np.array(train_set_neg)
    pos_test, neg_test = np.array(test_set), np.array(test_set_neg)

    # Positives are stacked on top of negatives; labels follow the same order
    train_data = np.vstack((pos_train, neg_train))
    train_labels = np.vstack((np.ones((pos_train.shape[0], 1)), np.zeros((neg_train.shape[0], 1))))

    test_data = np.vstack((pos_test, neg_test))
    test_labels = np.vstack((np.ones((len(pos_test), 1)), np.zeros((neg_test.shape[0], 1))))

    embed_dim = input_embed.shape[1]

    def _endpoint_embeddings(pairs):
        # Put into right format : one embedding matrix per endpoint of the pairs
        first = np.zeros((pairs.shape[0], embed_dim))
        second = np.zeros((pairs.shape[0], embed_dim))
        for row in tqdm(range(len(pairs))):
            pair = pairs[row]
            first[row, :] = input_embed[pair[0]]
            second[row, :] = input_embed[pair[1]]
        return first, second

    X_0_train, X_1_train = _endpoint_embeddings(train_data)
    X_0_test, X_1_test = _endpoint_embeddings(test_data)

    Y_train = to_categorical(train_labels)
    Y_test = to_categorical(test_labels)

    print ("Train Data : ", train_data.shape)
    print ("Test Data : ", test_data.shape) 

    for caption, array in (("X0 Train: ", X_0_train),
                           ("X1 Train: ", X_1_train),
                           ("X0 Test: ", X_0_test),
                           ("X1 Test: ", X_1_test),
                           ("Y Train: ", Y_train),
                           ("Y Test: ", Y_test)):
        print (caption, array.shape)

    return X_0_train, X_0_test, X_1_train, X_1_test, Y_train, Y_test

### Miscellaneous

In [11]:
def distance_matrix(mat_1, mat_2):

    '''
        Pairwise Euclidean distances between the rows of two matrices
        Input : 
            mat_1 : np array - One data point per row
            mat_2 : np array - One data point per row
        Output : 
            np array of shape (mat_1.shape[0], mat_2.shape[0]) where entry (i, j)
            is the norm of mat_1[i] - mat_2[j]
    '''

    row_count, col_count = mat_1.shape[0], mat_2.shape[0]
    distances = np.zeros((row_count, col_count))

    # Fill one row at a time : distances from a single point of mat_1 to all of mat_2
    for i in tqdm(range(len(mat_1))):
        distances[i] = [np.linalg.norm(mat_1[i] - mat_2[j]) for j in range(len(mat_2))]

    return distances

In [12]:
def get_confusion(y_true, y_pred, normalize = 'true'):

    '''
        Confusion matrix of a binary prediction as a labelled table
        Input : 
            y_true    : True labels
            y_pred    : Predicted labels
            normalize : Forwarded to sklearn's confusion_matrix
        Output : 
            pd DataFrame - Rows are the true label, columns the predicted label
    '''

    counts = confusion_matrix(y_true, y_pred, normalize = normalize)

    return pd.DataFrame(counts,
                        index = ['Label False', 'Label True'],
                        columns = ['Predicted False', 'Predicted True'])

In [13]:
def sort_dict(in_dict, reverse = False):

    '''
        Returns the (key, value) pairs of a dictionary as a list sorted by value
        Inputs : 
            in_dict : dict - Dictionary whose items are sorted
            reverse : Bool - False for ascending values, True for descending
    '''

    def _value_of(pair):
        return pair[1]

    pairs = list(in_dict.items())
    pairs.sort(key = _value_of, reverse = reverse)

    return pairs
