In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as plt
import os
import networkx as nx
from gensim.models import Word2Vec
from node2vec import Node2Vec


from scipy.stats.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score 
from sklearn.preprocessing import LabelBinarizer 

  from scipy.stats.stats import pearsonr


## Useful stuff

### Paths

In [2]:
# All raw input files live in the same directory, relative to the notebook's
# working directory; only the file name differs.
raw_data_dir = "Data/raw_data"
information_path = raw_data_dir + "/node_information.csv"
test_set_path = raw_data_dir + "/testing_set.txt"
train_set_path = raw_data_dir + "/training_set.txt"
random_preds_path = raw_data_dir + "/random_predictions.csv"

### Useful functions

In [36]:
def _sample_non_edges(graph, num_samples, rng):
    """
    Draw up to `num_samples` distinct node pairs that are not edges of `graph`.

    For sparse graphs the pairs are drawn by rejection sampling, which avoids
    materialising the full (quadratic-size) list of non-edges. Otherwise every
    non-edge is enumerated and shuffled; in that case fewer than `num_samples`
    pairs are returned when the graph does not contain enough non-edges.
    """
    if num_samples <= 0:
        return []

    nodes = list(graph.nodes())
    num_nodes = len(nodes)
    num_pairs = num_nodes * (num_nodes - 1) // 2

    # number_of_edges() is an upper bound on the node pairs blocked by an
    # existing edge. Unless edges plus requested samples stay below a quarter
    # of all pairs, fall back to exhaustive enumeration; below that threshold
    # most random draws are accepted and rejection sampling finishes quickly.
    if 4 * (graph.number_of_edges() + num_samples) > num_pairs:
        candidates = list(nx.non_edges(graph))
        rng.shuffle(candidates)
        return candidates[:num_samples]

    picked = set()
    samples = []
    while len(samples) < num_samples:
        missing = num_samples - len(samples)
        # Draw index pairs in batches to avoid one NumPy call per candidate.
        draws = rng.randint(0, num_nodes, size=(2 * missing + 16, 2)).tolist()
        for i, j in draws:
            if i == j:
                continue
            # Normalise the orientation so (i, j) and (j, i) count as one pair.
            key = (i, j) if i < j else (j, i)
            if key in picked or graph.has_edge(nodes[i], nodes[j]):
                continue
            picked.add(key)
            samples.append((nodes[i], nodes[j]))
            if len(samples) == num_samples:
                break
    return samples


def generate_samples(graph, train_set_ratio, seed=None):
    """
    Graph pre-processing step required to perform supervised link prediction.
    Create training and test sets of positive (edge) and negative (non-edge)
    node pairs.

    Parameters
    ----------
    graph : undirected networkx graph
        The input graph; it is not modified (a copy is edited instead).
    train_set_ratio : float in [0, 1]
        Share of the edges kept as positive training samples.
    seed : int or None, optional
        If given, a dedicated ``np.random.RandomState(seed)`` drives every
        random choice. If None, the global ``np.random`` state is used, so a
        prior ``np.random.seed(...)`` call still controls the result.

    Returns
    -------
    train_samples, train_labels, test_samples, test_labels : lists
        Node pairs and their 1 (edge) / 0 (non-edge) labels, positives first.

    Raises
    ------
    ValueError
        If `train_set_ratio` is outside [0, 1], or if not enough edges can be
        removed without disconnecting their endpoints.
    """
    if not 0.0 <= train_set_ratio <= 1.0:
        raise ValueError("train_set_ratio must lie in [0, 1], got {}".format(train_set_ratio))

    # Both the np.random module and RandomState expose shuffle() and randint().
    rng = np.random if seed is None else np.random.RandomState(seed)

    # --- Step 1: Generate positive edge samples for testing set ---
    residual_g = graph.copy()
    test_pos_samples = []

    # Store the shuffled list of current edges of the graph
    edges = list(residual_g.edges())
    rng.shuffle(edges)

    # Define number of positive test samples desired
    test_set_size = int((1.0 - train_set_ratio) * graph.number_of_edges())
    train_set_size = graph.number_of_edges() - test_set_size

    # Remove random edges from the graph without splitting any component
    for edge in edges:
        # Checked before removing anything so that test_set_size == 0 leaves
        # the graph untouched instead of over-collecting and failing below.
        if len(test_pos_samples) == test_set_size:
            break

        u, v = edge
        residual_g.remove_edge(u, v)

        # Removing (u, v) keeps the connectivity structure intact exactly when
        # u and v are still joined by another path. This local test replaces a
        # full-graph connectivity check after every removal and also works
        # when the input graph has several connected components.
        if nx.has_path(residual_g, u, v):
            test_pos_samples.append(edge)
        else:
            # Otherwise, re-add the edge to the network
            residual_g.add_edge(u, v)

    # Check if we have the desired number of positive samples for testing set
    if len(test_pos_samples) != test_set_size:
        raise ValueError("Enough positive edge samples could not be found!")

    # --- Step 2: Generate positive edge samples for training set ---
    # The remaining edges are simply considered for positive samples of the training set
    train_pos_samples = list(residual_g.edges())

    # --- Step 3: Generate the negative samples for testing and training sets ---
    # Negatives are non-edges of the ORIGINAL graph, so they never coincide
    # with a held-out positive; the two slices below are disjoint.
    neg_samples = _sample_non_edges(graph, train_set_size + test_set_size, rng)
    train_neg_samples = neg_samples[:train_set_size]
    test_neg_samples = neg_samples[train_set_size:train_set_size + test_set_size]

    # --- Step 4: Combine sample lists and create corresponding labels ---
    # For training set
    train_samples = train_pos_samples + train_neg_samples
    train_labels = [1] * len(train_pos_samples) + [0] * len(train_neg_samples)
    # For testing set
    test_samples = test_pos_samples + test_neg_samples
    test_labels = [1] * len(test_pos_samples) + [0] * len(test_neg_samples)

    return train_samples, train_labels, test_samples, test_labels

## Import data

In [3]:
# The node-information CSV is read without a header row, so the column
# names are attached explicitly after loading.
information_columns = ["ID", "pub_year", "title", "authors", "journal_name", "abstract"]
information_df = pd.read_csv(information_path, header=None)
information_df.columns = information_columns
# Peek at a few random rows as a sanity check.
information_df.sample(3)

Unnamed: 0,ID,pub_year,title,authors,journal_name,abstract
15172,9412200,1994,on electromagnetic duality in locally supersym...,"A. Ceresole, R. D'Auria, S. Ferrara, A. Van Pr...",,theory added we consider duality transformatio...
1463,6237,2000,conformally symmetric massive discrete fields,Manoelito M de Souza,,the meaning of conformal and chiral symmetry o...
5541,110268,2001,deformation quantization and quantum field the...,Chengang Zhou,,case of two-sphere we study the scalar quantum...


In [4]:
# Training pairs: space-separated file read without a header row; each row is
# two node IDs followed by a label.
train_columns = ["node1", "node2", "label"]
train_set = pd.read_csv(train_set_path, sep=" ", header=None)
train_set.columns = train_columns
train_set.sample(5)

Unnamed: 0,node1,node2,label
551229,11167,9909204,0
105240,9908057,1115,0
184270,9705220,9609070,1
480173,201026,9506031,0
395461,112015,107143,1


In [5]:
# Pairs to predict: same space-separated layout as the training file, but
# only the two node IDs are named (no label column).
test_columns = ["node1", "node2"]
test_set = pd.read_csv(test_set_path, sep=" ", header=None)
test_set.columns = test_columns
test_set.sample(5)

Unnamed: 0,node1,node2
29275,9306006,9203046
2258,212032,210148
13336,9709187,207006
29165,209264,3237
24416,9702180,9812050


## Create Graph

In [30]:
# Every ID that appears at either end of a training pair becomes a graph node.
nodes = set(np.union1d(train_set["node1"], train_set["node2"]))

In [28]:
# Only pairs labelled 1 are edges. itertuples() walks the two ID columns
# directly, which is far faster than a row-wise apply() with a Python lambda
# and yields an equal set of (node1, node2) tuples.
edges = set(
    train_set.query("label == 1")[["node1", "node2"]].itertuples(index=False, name=None)
)


In [31]:
# Undirected graph built from the training data.
G = nx.Graph()
# Nodes are added before edges, so IDs that occur only in pairs not labelled 1
# are still present in the graph (as nodes without edges).
G.add_nodes_from(nodes)
G.add_edges_from(edges)


In [33]:
# Quick sanity check on the graph size: len(G) counts nodes, G.size() counts edges.
print("The number of nodes: {}".format(len(G)))
print("The number of edges: {}".format(G.size()))

The number of nodes: 27770
The number of edges: 334690


In [37]:
# generate_samples shuffles with NumPy's global random state, so fix the seed
# first; otherwise the train/validation split changes on every kernel restart.
np.random.seed(42)
# Keep 80% of the edges as positive training samples, hold out 20% for validation.
train_samples, train_labels, validation_samples, validation_labels = generate_samples(G, 0.8)