# Link Prediction with Node2Vec

In [1]:
'''
source : http://education.abcom.com/link-prediction-using-node2vec/
'''

import random
from tqdm import tqdm
import networkx as nx
from zipfile import ZipFile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

import warnings
warnings.filterwarnings("ignore")

## Import Data

In [2]:
# !wget https://github.com/abcom-mltutorials/Facebook-Social-Network-Analysis/archive/master.zip -P "/content"
# ZipFile("/content/master.zip").extractall("/content/")
 
# graph
G = nx.read_gpickle('data/fb_repo/Graph.pickle')
# fb dataframe
fb = pd.read_csv('data/fb_repo/fb.csv', index_col=[0])

In [3]:
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 1295
Number of edges: 18321
Average degree:  28.2950


In [4]:
fb.head()

Unnamed: 0,Node 1,Node 2
0,2,116
1,2,226
2,2,326
3,3,25
4,3,67


In [5]:
# create adj matrix
# get a list of nodes in our graph
l = list(G.nodes())
 
# create adjacency matrix
adj_G = nx.to_numpy_matrix(G, nodelist = l)
 
print(str(adj_G.shape)+'\n')
adj_G

(1295, 1295)



matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
# find disconnected nodes
# get all node pairs which don't have an edge
non_existing_edges = []
 
# traverse adjacency matrix
offset = 0
for i in tqdm(range(adj_G.shape[0])):
    for j in range(offset,adj_G.shape[1]):
        if i != j:
            if adj_G[i,j] == 0:
                non_existing_edges.extend([(l[i],l[j])])
 
    offset = offset + 1

100%|██████████| 1295/1295 [00:00<00:00, 1768.19it/s]


In [7]:
len(non_existing_edges)

819544

In [8]:
# get subset of isloated nodes 
nodes_4000 = sorted(random.sample(non_existing_edges, k=40000))

In [None]:
# Finding Connected Nodes
non_existing_edges = [(i[0],i[1]) for i in tqdm(nodes_4000) if nx.has_path(G, i[0], i[1])]

 53%|█████▎    | 21121/40000 [00:09<00:05, 3348.83it/s]

In [None]:
non_existing_edges[:5]

In [None]:
# create df of non existing edges
df1 = pd.DataFrame(data = non_existing_edges, columns =['Node 1', 'Node 2'])
 
# create a column 'Connection' with default 0 (no-connection)
df1['Connection'] = 0
 
df1.head()

In [None]:
# get removable edges
# Create a list of all indices of the node pairs in the fb dataframe,
# which when removed won’t change the structure of our graph
 
# create a copy
fb_temp = fb.copy()
 
# for storing removable edges
removable_edges_indices = []
 
# number of connected components and nodes of G
ncc = nx.number_connected_components(G)
number_of_nodes = len(G.nodes)
 
# for each node pair we will be removing a node pair and creating a new graph,
# and check if the number of connected components and the number of nodes
# are the same as the original graph
for i in tqdm(fb.index.values):
  
    # remove a node pair and build a new graph
    G1 = nx.from_pandas_edgelist(
        fb_temp.drop(index= i), "Node 1", "Node 2", create_using=nx.Graph()
    )
  
    # If the number of connected components remain same as the original
    # graph we won't remove the edge
    if (nx.number_connected_components(G1) == ncc) and (len(G1.nodes) == number_of_nodes):
        removable_edges_indices.append(i)
 
    # drop the edge, so that for the next iteration the next G1
    # is created without this edge
    fb_temp = fb_temp.drop(index = i)

In [None]:
removable_edges_indices[:5]

In [None]:
# Creating Dataframe of Removable Edges
# get node pairs in fb dataframe with indices in removable_edges_indices
df2 = fb.loc[removable_edges_indices]
 
# create a column 'Connection' and assign default value of 1 (connected nodes)
df2['Connection'] = 1
 
df2.head()

In [None]:
# Creating Subgraph
df1 = df1.append(df2[['Node 1', 'Node 2', 'Connection']], ignore_index=True)

In [None]:
df1.head()

In [None]:
df3 = fb.drop(index=df2.index.values)


In [None]:
df3.head()


In [None]:
G_new = nx.from_pandas_edgelist(df3, "Node 1", "Node 2", create_using=nx.Graph())

print(nx.info(G_new))

In [None]:
from node2vec import Node2Vec
 
# Generating walks
node2vec = Node2Vec(G_new, dimensions=100, walk_length=16, num_walks=50)
 
# training the node2vec model
n2v_model = node2vec.fit(window=7, min_count=1)

In [None]:
edge_features = [(n2v_model.wv[str(i)]+n2v_model.wv[str(j)]) for i,j in zip(df1['Node 1'], df1['Node 2'])]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, auc, roc_curve, roc_auc_score,confusion_matrix


In [None]:
# creating dataset
X = np.array(edge_features)  
y = df1['Connection']

In [None]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

# Grid Search CV

## Random Forest

In [None]:
#classifier
clf1 = RandomForestClassifier()
 
# parameters
param = {'n_estimators' : [10,50,100], 'max_depth' : [5,10,15]}
 
# model
grid_clf_acc1 = GridSearchCV(clf1, param_grid = param)
 
# train the model
grid_clf_acc1.fit(X_train, y_train)
 
print('Grid best parameter (max. accuracy): ', grid_clf_acc1.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc1.best_score_)
 
# alternative metric to optimize over grid parameters: AUC
grid_clf_auc1 = GridSearchCV(clf1, param_grid = param, scoring = 'roc_auc')
grid_clf_auc1.fit(X_train, y_train)
predict_proba = grid_clf_auc1.predict_proba(X_test)[:,1]
 
print('Test set AUC: ', roc_auc_score(y_test, predict_proba))
print('Grid best parameter (max. AUC): ', grid_clf_auc1.best_params_)
print('Grid best score (AUC): ', grid_clf_auc1.best_score_)

## Gradient Boost

In [None]:
# classifier
clf2 = GradientBoostingClassifier()
 
# parameters
param = {'learning_rate' : [.05,.1]}
 
# model
grid_clf_acc2 = GridSearchCV(clf2, param_grid = param)
 
# train the model
grid_clf_acc2.fit(X_train, y_train)
 
print('Grid best parameter (max. accuracy): ', grid_clf_acc2.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc2.best_score_)
 
# alternative metric to optimize over grid parameters: AUC
grid_clf_auc2 = GridSearchCV(clf2, param_grid = param, scoring = 'roc_auc')
grid_clf_auc2.fit(X_train, y_train)
predict_proba = grid_clf_auc2.predict_proba(X_test)[:,1]
 
print('Test set AUC: ', roc_auc_score(y_test, predict_proba))
print('Grid best parameter (max. AUC): ', grid_clf_auc2.best_params_)
print('Grid best score (AUC): ', grid_clf_auc2.best_score_)

## MLP Classifier (A Neural Network Classifier)


In [None]:
# classifier
clf3 = MLPClassifier(max_iter=1000)
 
# scaling training and test sets
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
 
# parameters
param = {'hidden_layer_sizes' : [10,100,[10,10]], 'activation' : ['tanh','relu'], 'solver' : ['adam','lbfgs']}
 
# model
grid_clf_acc3 = GridSearchCV(clf3, param_grid = param)
 
# train the model
grid_clf_acc3.fit(X_train_scaled, y_train)
 
print('Grid best parameter (max. accuracy): ', grid_clf_acc3.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc3.best_score_)
 
# alternative metric to optimize over grid parameters: AUC
grid_clf_auc3 = GridSearchCV(clf3, param_grid = param, scoring = 'roc_auc')
grid_clf_auc3.fit(X_train_scaled, y_train)
predict_proba = grid_clf_auc3.predict_proba(X_test_scaled)[:,1]
 
print('Test set AUC: ', roc_auc_score(y_test, predict_proba))
print('Grid best parameter (max. AUC): ', grid_clf_auc3.best_params_)
print('Grid best score (AUC): ', grid_clf_auc3.best_score_)

In [None]:
# Inference
pred = grid_clf_auc3.predict(X_test_scaled)
pred[:5]


In [None]:
# Accuracy Score
accuracy_score(pred,y_test)

In [None]:
# Confusion Matrix
confusion_matrix(pred,y_test)


In [None]:
# The ROC_AUC Score and ROC Curve
predict_proba = grid_clf_auc3.predict_proba(X_test_scaled)[:,1]
 
false_positive_rate,true_positive_rate,_ = roc_curve(y_test, predict_proba)
roc_auc_score = auc(false_positive_rate,true_positive_rate)

In [None]:
plt.plot(false_positive_rate,true_positive_rate)
plt.title(f'ROC Curve \n ROC AUC Score : {roc_auc_score}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

## Demonstration

In [None]:
df1.head()


In [None]:
print(f' ({df1.iloc[4,0]},{df1.iloc[4,1]}) node pair features : {X[4]}')

# its position in X_train
print(f'Index of ({df1.iloc[4,0]},{df1.iloc[4,1]}) node pair in X_train : {np.where(X_train == X[4])[0][1]}')

In [None]:
predict_proba = grid_clf_auc3.predict_proba(X_train_scaled[np.where(X_train == X[4])[0][1]].reshape(1,-1))[:,1]
 
print(f'Probability of nodes {df1.iloc[4,0]} and {df1.iloc[4,1]} to form a link is : {float(predict_proba)*100 : .2f}%')