In [1]:
#import libraries and set path of data and embeddings
import networkx as nx
import numpy as np
import pickle
from util_func import *

from prettytable import PrettyTable
# Directory prefix for generated embeddings; run_embed appends
# '<dataset>.edges_deepwalk.embeddings' to it (trailing '/' is required
# because paths are built by plain string concatenation).
emb_location = './../generated_embedding/'
# Directory prefix for the pickled dataset bundles; dump_label reads
# '<dataset>.data' from it and writes '<dataset>.labels' next to it.
graph_location = './../data/'



In [2]:
def run_embed(dataset_name,save_stats=True):
    '''Load the DeepWalk embeddings of a dataset and analyse predictions made from them.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier. The embeddings are read from
        ``emb_location + dataset_name + '.edges_deepwalk.embeddings'``.
    save_stats : bool, optional
        Forwarded unchanged to ``simple_classify_f1`` as its ``save_stats``
        keyword (default True).

    Returns
    -------
    None
        Results are reported by the helpers that are called.
    '''
    # load the embedding file for the dataset
    emb_loc = emb_location+dataset_name+'.edges_deepwalk.embeddings'
    emb = load_embeddings(emb_loc)
    # run the simple classifier on the DeepWalk embeddings; pass the caller's
    # save_stats through instead of a hard-coded True so the flag has an effect
    # NOTE(review): recorded outputs print 'Train ratio: 0.9' for this 0.1, so the
    # value is presumably the held-out fraction -- confirm in util_func
    simple_classify_f1(dataset_name,emb,splits_ratio=[0.1],save_stats=save_stats)
    graph_classify_analysis(dataset_name,graph_stat=True)


In [6]:
#one-time cell: create the pickled label files for the datasets
from sklearn.preprocessing import MultiLabelBinarizer
def dump_label(data):
    '''Create the pickled ``<data>.labels`` file from the pickled ``<data>.data`` bundle.

    The bundle is a dict loaded from ``graph_location + data + '.data'``.
    For ``'blogcatalog'`` the value stored under ``'LILLabels'`` is written
    out unchanged; for every other dataset the array stored under
    ``'Labels'`` is reshaped to a single column and binarized with
    ``MultiLabelBinarizer`` before being written.

    Parameters
    ----------
    data : str
        Dataset name, used to build both the input and the output path.

    Returns
    -------
    None
        The labels are written to ``graph_location + data + '.labels'``.
    '''
    print(f"Dataset is {data}")
    # NOTE: pickle.load can execute arbitrary code -- only use on trusted files.
    # The context manager guarantees the input handle is closed (the bare
    # open() used previously leaked it).
    with open(graph_location+data+'.data','rb') as f:
        a = pickle.load(f)
    print(a)
    if data=='blogcatalog':
        labels = a['LILLabels']
    else:
        labels = a['Labels']
        # one label per node -> column vector so each row is a one-element label set
        labels = MultiLabelBinarizer().fit_transform(labels.reshape(labels.shape[0], 1))
    with open(graph_location+data+'.labels',"wb") as f1:
        pickle.dump(labels,f1)
    # report success only after the output file has been flushed and closed
    print(f"{data} labels created")
        
# Datasets that get a '.labels' file generated from their '.data' bundle.
d = [
    'citeseer',
    'cora',
    'blogcatalog',
]

# Write the label file for each listed dataset, in order.
for i in d: dump_label(i)

Dataset is citeseer
{'Labels': array([0, 0, 0, ..., 4, 3, 5]), 'NXGraph': <networkx.classes.digraph.DiGraph object at 0x7f1d9838bac8>, 'CSRFeatures': <3327x3703 sparse matrix of type '<class 'numpy.int64'>'
	with 105165 stored elements in Compressed Sparse Row format>}
citeseer labels created
Dataset is cora
{'Labels': array([2, 5, 0, ..., 3, 3, 2]), 'CSRFeatures': <2708x1433 sparse matrix of type '<class 'numpy.int64'>'
	with 49216 stored elements in Compressed Sparse Row format>, 'NXGraph': <networkx.classes.digraph.DiGraph object at 0x7f1d98718b00>}
cora labels created
Dataset is blogcatalog
{'LILLabels': <10312x39 sparse matrix of type '<class 'numpy.int64'>'
	with 14476 stored elements in LInked List format>, 'NXGraph': <networkx.classes.graph.Graph object at 0x7f1d98313198>}
blogcatalog labels created


In [3]:
#Analysis for Citeseer Dataset
# run_embed loads the citeseer DeepWalk embeddings, runs the classifier on them
# and then the graph-level analysis of the predictions.
dataset = 'citeseer'
run_embed(dataset)

file saved
Embedding dim: 128, graph: citeseer

Run number 1:
Train ratio: 0.9
micro: 0.5885885885885885
macro: 0.4798335584661983
samples: 0.5885885885885885
weighted: 0.5958909456077889
Accuracy: 0.5885885885885885

Graph is citeseer:
Graph has 3327 and edges are 4676
+------------+----------+---------------+-----------------+--------+----------------+----------------+
| Serial No. | Node no. |   True Label  | Predicted Label | Degree | Neighbor label | Neighbor Match |
+------------+----------+---------------+-----------------+--------+----------------+----------------+
|     1      |   2168   | (array([6]),) |  (array([4]),)  |   1    | (array([1]),)  |     False      |
|     2      |   1739   | (array([5]),) |  (array([1]),)  |   1    | (array([5]),)  |      True      |
|     3      |   1771   | (array([3]),) |  (array([5]),)  |   1    | (array([3]),)  |      True      |
|     4      |   2209   | (array([2]),) |  (array([5]),)  |   1    | (array([2]),)  |      True      |
|     5 

In [3]:
#Analysis for BlogCatalog Dataset
# NOTE(review): the recorded run of this cell prints the classification metrics
# and the graph size, then stops with "ValueError: The truth value of an array
# with more than one element is ambiguous". The failing code is not in this
# notebook -- presumably graph_classify_analysis compares whole label arrays
# for this multi-label dataset; confirm in util_func.
dataset = 'blogcatalog'
run_embed(dataset)

  'recall', 'true', average, warn_for)


file saved
Embedding dim: 128, graph: blogcatalog

Run number 1:
Train ratio: 0.9
micro: 0.40735694822888285
macro: 0.2638632029510274
samples: 0.3959302325581396
weighted: 0.43920250757323276
Accuracy: 0.3168604651162791

Graph is blogcatalog:
Graph has 10312 and edges are 333983


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [4]:
#Analysis for Pubmed Dataset
dataset = 'pubmed'
run_embed(dataset)

file saved
Embedding dim: 128, graph: pubmed

Run number 1:
Train ratio: 0.9
micro: 0.8057809330628803
macro: 0.7963475542078481
samples: 0.8057809330628803
weighted: 0.8066786564389099
Accuracy: 0.8057809330628803

Graph is pubmed:
Graph has 19717 and edges are 44327
+------------+----------+---------------+-----------------+--------+----------------+----------------+
| Serial No. | Node no. |   True Label  | Predicted Label | Degree | Neighbor label | Neighbor Match |
+------------+----------+---------------+-----------------+--------+----------------+----------------+
|     1      |  10833   | (array([2]),) |  (array([0]),)  |   1    | (array([0]),)  |     False      |
|     2      |  12289   | (array([2]),) |  (array([1]),)  |   1    | (array([1]),)  |     False      |
|     3      |  19682   | (array([2]),) |  (array([1]),)  |   1    | (array([1]),)  |     False      |
|     4      |   9233   | (array([1]),) |  (array([0]),)  |   1    | (array([1]),)  |      True      |
|     5   

In [5]:
#Analysis for Cora Dataset
dataset = 'cora'
run_embed(dataset)

file saved
Embedding dim: 128, graph: cora

Run number 1:
Train ratio: 0.9
micro: 0.8376383763837638
macro: 0.8230022432436838
samples: 0.8376383763837638
weighted: 0.8347950399963107
Accuracy: 0.8376383763837638

Graph is cora:
Graph has 2708 and edges are 5278
+------------+----------+---------------+-----------------+--------+----------------+----------------+
| Serial No. | Node no. |   True Label  | Predicted Label | Degree | Neighbor label | Neighbor Match |
+------------+----------+---------------+-----------------+--------+----------------+----------------+
|     1      |   923    | (array([0]),) |  (array([3]),)  |   1    | (array([3]),)  |     False      |
|     2      |   2041   | (array([2]),) |  (array([5]),)  |   1    | (array([2]),)  |      True      |
|     3      |   2551   | (array([3]),) |  (array([2]),)  |   1    | (array([2]),)  |     False      |
|     4      |   1648   | (array([2]),) |  (array([6]),)  |   1    | (array([6]),)  |     False      |
|     5      |  

In [6]:
#Analysis for Reddit Dataset
# NOTE(review): the recorded run of this cell ends, after the
# "Statistics for dataset: reddit" header, with "TypeError: '<' not supported
# between instances of 'DegreeView' and 'int'". The failing comparison is not
# in this notebook -- presumably in graph_classify_analysis, where a degree
# view is compared without selecting a node first; confirm in util_func.
dataset = 'reddit'
run_embed(dataset)

file saved
Embedding dim: 128, graph: reddit

Run number 1:
Train ratio: 0.9
micro: 0.8975404558526849
macro: 0.8701530715543039
samples: 0.8975404558526849
weighted: 0.8904164275589822
Accuracy: 0.8975404558526849

Graph is reddit:
Graph has 231443 and edges are 11606919
+------------+----------+-----------------------------------------------------+-----------------+--------+-----------------------------------------------------+----------------+
| Serial No. | Node no. |                      True Label                     | Predicted Label | Degree |                    Neighbor label                   | Neighbor Match |
+------------+----------+-----------------------------------------------------+-----------------+--------+-----------------------------------------------------+----------------+
|     1      |  171174  |  (array([0], dtype=int32), array([8], dtype=int32)) |  (array([8]),)  |   1    | (array([0], dtype=int32), array([30], dtype=int32)) |     False      |
|     2      | 

Statistics for dataset: reddit
---------------------------------


TypeError: '<' not supported between instances of 'DegreeView' and 'int'