# Solution 1: Node classification with NetworkX

In [1]:
import os
import networkx as nx
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from networkx.algorithms import node_classification

In [2]:
# params
data_dir = 'data/cora/'
train_size = 0.2

## Load data

Dataset: https://graphsandnetworks.com/the-cora-dataset/

### Load edge info

In [3]:
edgelist = pd.read_csv(os.path.join(data_dir, "cora.cites"), sep='\t', header=None, names=["target", "source"])
edgelist

Unnamed: 0,target,source
0,35,1033
1,35,103482
2,35,103515
3,35,1050679
4,35,1103960
...,...,...
5424,853116,19621
5425,853116,853155
5426,853118,1140289
5427,853155,853118


In [4]:
# We have to use an undirected graph here because the node classification algorithms in nx do not support directed graphs
G = nx.from_pandas_edgelist(edgelist)

### Load node info

In [5]:
df_nodes = pd.read_csv(os.path.join(data_dir, "cora.content"), sep='\t', header=None)
df_nodes

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1425,1426,1427,1428,1429,1430,1431,1432,1433,1434
0,31336,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Neural_Networks
1,1061127,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Rule_Learning
2,1106406,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
3,13195,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
4,37879,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,1128975,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2704,1128977,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2705,1128978,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2706,117328,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Case_Based


In [6]:
df_train, df_test = train_test_split(df_nodes, train_size = train_size)

In [7]:
node_ids_train, node_ids_test = df_train.iloc[:, 0].values, df_test.iloc[:, 0].values
labels_train, labels_test = df_train.iloc[:, -1].values.astype(str), df_test.iloc[:, -1].values.astype(str)
data_train = dict(zip(node_ids_train, labels_train))

In [8]:
nx.set_node_attributes(G, data_train, name="label")

## Node classification and evaluation

Now we perform node classification using the built-in Harmonic Function method in NetworkX.

* Zhu, X., Ghahramani, Z., & Lafferty, J. (2003, August). Semi-supervised learning using gaussian fields and harmonic functions. In ICML (Vol. 3, pp. 912-919).

In [9]:
result = node_classification.harmonic_function(G)

The result contains the labels (predicted or ground truth) of all nodes. For example, the labels of the first five nodes:

In [10]:
result[:5]

['Genetic_Algorithms',
 'Genetic_Algorithms',
 'Neural_Networks',
 'Genetic_Algorithms',
 'Neural_Networks']

To check the performance of the classification, we fetch the labels of the nodes in the test set.

In [11]:
dict_result = dict(zip(list(G), result))
labels_pred = [ dict_result.get(id) for id in node_ids_test ]

print('Performance (Harmonic Function): \n', classification_report(labels_test, labels_pred))

Performance (Harmonic Function): 
                         precision    recall  f1-score   support

            Case_Based       0.86      0.78      0.82       232
    Genetic_Algorithms       0.69      0.94      0.79       337
       Neural_Networks       0.87      0.76      0.81       658
 Probabilistic_Methods       0.90      0.76      0.82       343
Reinforcement_Learning       0.81      0.79      0.80       176
         Rule_Learning       0.81      0.72      0.76       144
                Theory       0.66      0.78      0.71       277

              accuracy                           0.79      2167
             macro avg       0.80      0.79      0.79      2167
          weighted avg       0.81      0.79      0.79      2167



Similarly, we can perform node classification using the built-in Local and Global Consistency method.

* Zhou, D., Bousquet, O., Lal, T. N., Weston, J., & Schölkopf, B. (2004). Learning with local and global consistency. Advances in neural information processing systems, 16(16), 321-328.

In [12]:
result = node_classification.local_and_global_consistency(G)

In [13]:
dict_result = dict(zip(list(G), result))
labels_pred = [ dict_result.get(id) for id in node_ids_test ]

In [15]:
print('Performance (Local and Global Consistency): \n', classification_report(labels_test, labels_pred))

Performance (Local and Global Consistency): 
                         precision    recall  f1-score   support

            Case_Based       0.84      0.82      0.83       232
    Genetic_Algorithms       0.68      0.96      0.80       337
       Neural_Networks       0.85      0.79      0.82       658
 Probabilistic_Methods       0.94      0.77      0.84       343
Reinforcement_Learning       0.88      0.74      0.80       176
         Rule_Learning       0.88      0.69      0.78       144
                Theory       0.69      0.79      0.74       277

              accuracy                           0.81      2167
             macro avg       0.82      0.79      0.80      2167
          weighted avg       0.82      0.81      0.81      2167

