<a href="https://colab.research.google.com/github/yuchenxuuu/Algorithm/blob/master/node2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
if 'google.colab' in sys.modules:
  %pip install -q stellargraph[demos]==1.2.1

[K     |████████████████████████████████| 440kB 2.8MB/s 
[K     |████████████████████████████████| 235kB 12.0MB/s 
[K     |████████████████████████████████| 51kB 6.8MB/s 
[?25h  Building wheel for mplleaflet (setup.py) ... [?25l[?25hdone


In [None]:
import os
import networkx as nx
import numpy as np
from stellargraph import StellarGraph, datasets
from stellargraph.data import EdgeSplitter
import multiprocessing
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
dataset = datasets.Cora()
graph, _ = dataset.load(largest_connected_component_only=True, str_node_ids=True)

In [None]:
edge_splitter_test = EdgeSplitter(graph)
graph_test, examples_test, labels_test = edge_splitter_test.train_test_split(p=0.05)
edge_splitter_train = EdgeSplitter(graph_test,graph)
graph_train, examples, labels = edge_splitter_train.train_test_split(p=0.15)
# print(graph_test.info())
# print(graph_train.info())
(
    examples_train,
    examples_model_selection,
    labels_train,
    labels_model_selection,
) = train_test_split(examples, labels, train_size=0.75, test_size=0.25)


** Sampled 260 positive and 260 negative edges. **
** Sampled 742 positive and 742 negative edges. **


## Split the dataset to training data and testing data

In [None]:
from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec

In [63]:
def node2vec(inputgraph,p,q,dimensions,num_walks,walk_length,window_size):
  workers = multiprocessing.cpu_count()
  randw = BiasedRandomWalk(inputgraph)
  walks = randw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
  print(len(walks))
  word2vec_model = Word2Vec(walks,size=dimensions,window=window_size, min_count=0,sg=1,workers=workers,iter=1)
  def get_embedding(u):
        return word2vec_model.wv[u]
  return get_embedding


## Determin the parameters and run the node2vec model

In [79]:
p=1.0
q=1.0
dimensions = 120
num_walks = 10
walk_length = 80
window_size = 10
result = node2vec(graph,p,q,dimensions,num_walks,walk_length,window_size)

24850


## Result evaluation

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [77]:
def link_examples_to_features(link_examples, transform_node):
    features = []
    for src, dst in link_examples:
      features.append(transform_node(src)*transform_node(dst))
    return features

def train_linkprediction(link_examples, link_labels, get_embedding):
    lr_clf = LogisticRegressionCV(Cs=10, cv=10, scoring="roc_auc", max_iter=100)
    pipe = Pipeline(steps=[("sc", StandardScaler()), ("clf", lr_clf)])
    link_features = link_examples_to_features(link_examples, get_embedding)
    pipe.fit(link_features, link_labels)
    print(type(pipe))
    return pipe

def evaluate_model(pipe, link_examples_test, link_labels_test, get_embedding):
    link_features_test = link_examples_to_features(
        link_examples_test, get_embedding
    )
    predicted = pipe.predict_proba(link_features_test)
    positive_column = list(pipe.classes_).index(1)
    score = roc_auc_score(link_labels_test, predicted[:, positive_column])
    return score
  

In [81]:
def run_link_prediction():
    clf = train_linkprediction(examples_train, labels_train, result)
    score = evaluate_model(
        clf,
        examples_model_selection,
        labels_model_selection,
        result
    )
    print(score)


In [82]:
results = run_link_prediction()
print(results)

<class 'sklearn.pipeline.Pipeline'>
0.9884335948852078
None
