In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Machine Learning in Network Science
Lab 3: Network representation learning
March 19, 2021
"""
%matplotlib inline
from helper import *
import os
import networkx as nx
from gensim.models import Word2Vec
from node2vec import Node2Vec
from scipy.sparse import *
from scipy.stats.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.preprocessing import LabelBinarizer 

In [2]:
# Load the network from its GML description
G = nx.read_gml("karate.gml")

# Report the size of the loaded graph
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print("The number of nodes: {}".format(node_count))
print("The number of edges: {}".format(edge_count))

# Ground-truth community of every node, and how many communities there are
node2comm, num_of_communities = get_node2community(g=G)

The number of nodes: 34
The number of edges: 78


### Part I: Network Representation Learning

#### Exercise 1.1: Implementation of a random walking strategy

In [None]:
def perform_random_walks(graph, N, L):
    '''
    Generate uniform random walks starting from every node of the graph.

    At each step the walk moves to a neighbor of the current node chosen
    uniformly at random.

    :param graph: networkx graph (only ``graph.nodes()`` and
                  ``graph.neighbors(node)`` are used)
    :param N: the number of walks for each node
    :param L: the walk length, i.e. the number of nodes in each walk
    :return walks: the list of walks; each walk is a list of nodes beginning
                   with its start node. A walk holds fewer than L nodes only
                   if it reaches a node without neighbors. The list is empty
                   when N or L is not positive.
    '''
    import random  # local import keeps this cell runnable on its own

    walks = []
    if N <= 0 or L <= 0:
        return walks

    # Materialise the node list once so every pass visits the same nodes
    nodes = list(graph.nodes())

    for _ in range(N):
        for start in nodes:
            walk = [start]
            while len(walk) < L:
                neighbors = list(graph.neighbors(walk[-1]))
                # Dead end (e.g. isolated node): the walk cannot be extended
                if not neighbors:
                    break
                walk.append(random.choice(neighbors))
            walks.append(walk)

    return walks

#### Exercise 1.2: Learning representations of nodes

In [None]:
# Exercise template: the right-hand sides below are intentionally left blank
# and must be filled in before this cell can run.

# Random-walk settings (intended for the N and L arguments of perform_random_walks)
num_of_walks=
walk_length=
# Embedding settings (presumably the vector dimension and the context window
# size given to Word2Vec -- confirm against the installed gensim version)
embedding_size = 
window_size = 
# Path of the file the learned embedding vectors are written to
output_filename="./graph.embedding"

# Perform random walks - call function
walks = #
# Learn representations of nodes - use Word2Vec
model = #
# Save the embedding vectors
model.wv.save_word2vec_format(output_filename)

In [None]:
# Visualise embeddings
# `visualize` is not defined in this notebook (presumably it comes from the
# wildcard `helper` import); it receives the graph and the model's word vectors.
visualize(graph=G, node2embedding=model.wv)

### Part II: Link Prediction

In [7]:
# Functions seen in previous lab - Nothing to do

def generate_samples(graph, train_set_ratio):
    """
    Graph pre-processing step required to perform supervised link prediction.
    Create training and test sets of node pairs, each made of positive samples
    (edges of ``graph``) and negative samples (non-edges of ``graph``).

    :param graph: connected networkx graph; it is left untouched, only a copy
        of it is pruned
    :param train_set_ratio: fraction of the edges, in [0, 1], kept for training
    :return: ``(residual_g, train_samples, train_labels, test_samples, test_labels)``
        where ``residual_g`` is the copy of ``graph`` without the positive test
        edges, the sample lists contain node pairs and the label lists contain
        1 for an edge and 0 for a non-edge
    :raises ValueError: if ``train_set_ratio`` is outside [0, 1], if ``graph``
        is not connected, or if not enough edges can be removed while keeping
        the graph connected
    """

    if not 0.0 <= train_set_ratio <= 1.0:
        raise ValueError("train_set_ratio must lie in the interval [0, 1]!")

    # --- Step 0: The graph must be connected ---
    # Check the graph given as argument, not a notebook-level variable
    if not nx.is_connected(graph):
        raise ValueError("The graph contains more than one connected component!")

    # --- Step 1: Generate positive edge samples for testing set ---
    residual_g = graph.copy()
    test_pos_samples = []

    # Store the shuffled list of current edges of the graph
    edges = list(residual_g.edges())
    np.random.shuffle(edges)

    # Define number of positive test samples desired
    test_set_size = int((1.0 - train_set_ratio) * graph.number_of_edges())
    train_set_size = graph.number_of_edges() - test_set_size

    # Remove random edges from the graph, leaving it connected
    for edge in edges:

        # Stop once enough test edges were collected. Testing this *before*
        # removing anything also handles test_set_size == 0
        # (train_set_ratio == 1.0), where no edge may be removed at all.
        if len(test_pos_samples) >= test_set_size:
            break

        # Tentatively remove the edge
        residual_g.remove_edge(edge[0], edge[1])

        if nx.is_connected(residual_g):
            # Still connected: the removed edge becomes a positive test sample
            test_pos_samples.append(edge)
        else:
            # Otherwise, re-add the edge to the network
            # NOTE: only the endpoints are restored, not any edge attributes
            residual_g.add_edge(edge[0], edge[1])

    # Check if we have the desired number of positive samples for testing set
    if len(test_pos_samples) != test_set_size:
        raise ValueError("Enough positive edge samples could not be found!")

    # --- Step 2: Generate positive edge samples for training set ---
    # The remaining edges are simply considered for positive samples of the training set
    train_pos_samples = list(residual_g.edges())

    # --- Step 3: Generate the negative samples for testing and training sets ---
    # Non-edges are taken from the ORIGINAL graph so that removed test edges
    # are never used as negatives. The two slices do not overlap; if the graph
    # has too few non-edges the slices are silently shorter than requested.
    non_edges = list(nx.non_edges(graph))
    np.random.shuffle(non_edges)

    train_neg_samples = non_edges[:train_set_size]
    test_neg_samples = non_edges[train_set_size:train_set_size + test_set_size]

    # --- Step 4: Combine sample lists and create corresponding labels ---
    # For training set
    train_samples = train_pos_samples + train_neg_samples
    train_labels = [1] * len(train_pos_samples) + [0] * len(train_neg_samples)
    # For testing set
    test_samples = test_pos_samples + test_neg_samples
    test_labels = [1] * len(test_pos_samples) + [0] * len(test_neg_samples)

    return residual_g, train_samples, train_labels, test_samples, test_labels


def edge_prediction(node2embedding, train_samples, test_samples, train_labels, test_labels, feature_func=None):
    """
    Train a logistic-regression link predictor on edge features built from
    node embeddings, plot its ROC curve on the test samples and return the AUC.

    :param node2embedding: mapping indexed by node that returns its embedding
    :param train_samples: node pairs used for training
    :param test_samples: node pairs used for evaluation
    :param train_labels: one label per training pair
    :param test_labels: one label per test pair
    :param feature_func: callable combining the two endpoint embeddings into
        one edge feature vector; defaults to ``abs(x - y)``
    :return: area under the ROC curve computed on the test samples
    """

    # --- Construct feature vectors for edges ---
    if feature_func is None:
        # Default edge feature: absolute difference of the endpoint embeddings
        def feature_func(x, y):
            return abs(x - y)

    train_features = [feature_func(node2embedding[edge[0]], node2embedding[edge[1]]) for edge in train_samples]
    test_features = [feature_func(node2embedding[edge[0]], node2embedding[edge[1]]) for edge in test_samples]

    # --- Build the model and train it ---
    clf = LogisticRegression()
    clf.fit(train_features, train_labels)

    # Probability of the positive class (column 1) for every test pair.
    # Training-set probabilities are not needed, so they are not computed.
    test_preds = clf.predict_proba(test_features)[:, 1]

    # --- Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from predictions ---
    fpr, tpr, _ = roc_curve(test_labels, test_preds)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6,6))
    plt.plot(fpr, tpr, color='darkred', label='ROC curve (area = %0.3f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], color='lightgray', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Curve')
    plt.legend(loc="lower right")
    plt.show()

    return roc_auc

##### Exercise 2

In [None]:
# Data - construct training and testing sets 

# Perform random walks over the residual network

# Learn representations of nodes

# Perform the edge prediction and plot the ROC curve


In [None]:
# Same as above but using Node2Vec instead of Word2Vec

### Part III: Node Classification

#### Exercise 3.1

In [None]:
def node_classif(graph, node2embedding, train_set_ratio): 
    """Perform node classification on graph 
    
    args node2embedding: dictionnary of node embedding
    args return: accuracy score 
    """

    # Get the ground-truth labels 
    node2community, K = get_node2community(G)

    # Create feature matrix (np.array)
    x = # 
    # Create label matrix 
    labels = # 

    # Get the training size
    train_set_size = #

    # Shuffle the data
    shuffled_features, shuffled_labels = shuffle(x, labels)

    # Divide the data into the training and test sets
    train_features = shuffled_features[0:train_set_size, :]
    train_labels = shuffled_labels[0:train_set_size]

    test_features = #
    test_labels = #

    # Build the model and train it
    clf = OneVsRestClassifier(LogisticRegression())
    clf.fit(train_features, train_labels)

    # Find the predictions, each node can have multiple labels
    test_prob = #
    y_pred = # careful to format for accuracy score


    return accuracy_score(test_labels, y_pred)

node_classif(graph, model.wv, 0.6)

##### Exercise 3.2

Use the Facebook page-page network, which is also included in the downloaded folder. 

In [None]:
# Create graph

# Import labels 

# Pre-process labels (Categorical variables are not desirable - Want multiple dummies instead)


In [None]:
# Learn embeddings 

In [None]:
# Evaluate the model 