<a href="https://colab.research.google.com/github/zeinabkamkar98/graph-classification/blob/main/GraphClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Graph Classification

### Import requirements 

In [44]:
import networkx as nx
import numpy as np
import pandas as pd

from scipy import sparse

from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import cross_val_score

%matplotlib inline


### Loading datasets

In [45]:
class GraphDataset:
    """Set of labelled graphs read from three comma-separated text files.

    The files are located by appending a suffix to ``folder_path``:

    * ``<folder_path>_A.txt`` -- one edge per row as a pair of node ids.
    * ``<folder_path>_graph_indicator.txt`` -- row ``j`` (0-based) holds the
      1-based graph id of node ``j + 1``.
    * ``<folder_path>_graph_labels.txt`` -- one class label per graph.

    Attributes
    ----------
    labels_ : numpy.ndarray
        Class labels re-encoded by ``LabelEncoder`` (one entry per graph).
    n_classes_ : int
        Number of distinct classes.
    n_graphs_ : int
        Number of entries in ``labels_``.
    graphs_ : numpy.ndarray of dtype object
        One ``networkx.Graph`` per graph id: the largest connected component
        of that graph, with nodes relabelled to consecutive integers and the
        encoded class stored in ``graph['label']``.

    Raises
    ------
    ValueError
        If a graph id has no node that takes part in an edge, so no connected
        component can be extracted for it.
    """

    def __init__(self, folder_path=''):
        # `ndmin` keeps the expected rank even when a file contains a single row.
        data_adj = np.loadtxt(folder_path + '_A.txt', delimiter=',', ndmin=2).astype(int)
        data_graph_indicator = np.loadtxt(
            folder_path + '_graph_indicator.txt', delimiter=',', ndmin=1).astype(int)
        labels = np.loadtxt(folder_path + '_graph_labels.txt', delimiter=',', ndmin=1).astype(int)

        # One big graph holding every edge of every dataset graph.
        G = nx.Graph()
        G.add_edges_from(map(tuple, data_adj))
        G.remove_nodes_from(list(nx.isolates(G)))

        le = LabelEncoder()
        self.labels_ = le.fit_transform(labels)
        self.n_classes_ = len(le.classes_)
        self.n_graphs_ = len(self.labels_)

        # split into graphs
        graph_num = data_graph_indicator.max()
        node_list = np.arange(data_graph_indicator.shape[0]) + 1  # node ids are 1-based
        graphs = []

        for i in range(graph_num):
            nodes = node_list[data_graph_indicator == i + 1]
            G_sub = G.subgraph(nodes).copy()

            if G_sub.number_of_nodes() == 0:
                raise ValueError(
                    "graph %d in '%s' has no edges; cannot extract a connected component"
                    % (i + 1, folder_path))

            # Keep only the largest connected component.
            max_cc = max(nx.connected_components(G_sub), key=len)
            G_sub = G_sub.subgraph(max_cc).copy()

            # convert_node_labels_to_integers returns a NEW graph; keep the result.
            G_sub = nx.convert_node_labels_to_integers(G_sub)
            G_sub.graph['label'] = self.labels_[i]
            graphs.append(G_sub)

        # Fill an object array element by element: np.array(graphs) would try to
        # interpret each (iterable) graph as a sequence of nodes.
        self.graphs_ = np.empty(len(graphs), dtype=object)
        for idx, graph in enumerate(graphs):
            self.graphs_[idx] = graph



#### Load github datasets
To use the GitHub datasets, run the cell below. Only a limited set (10 datasets) is available there; it is intended for testing.

In [46]:
!git clone https://github.com/zeinabkamkar98/graph-classification.git

data_pwd = 'graph-classification/DATASETS/'

fatal: destination path 'graph-classification' already exists and is not an empty directory.


#### Loading google drive datasets
To use the Google Drive datasets, run the cell below. All of the converted datasets are stored in Google Drive.

**Note**: To use the Google Drive datasets, you first need to log in to the related Google account.

In [47]:
# Alternative data source: uncomment the lines below to mount Google Drive
# and point `data_pwd` at the datasets stored there.
# from google.colab import drive
# drive.mount('/content/drive')

# data_pwd = 'drive/MyDrive/DATASETS/'

#### Select dataset
Put the names of the datasets that you want to use in the `graphs_name` variable, and the mean number of nodes of each dataset in the `embedding_dims` variable.

In [48]:
# Names of the datasets to process.
graphs_name = [
    'DHFR',
    'BZR',
    'COX2',
    'AIDS',
    'ENZYMES',
    'DD',
    'MUTAG',
    'NCI1',
    'PROTEINS_full',
    'PTC_MR',
]

# Mean number of nodes per dataset
embedding_dims = {
    'DHFR': 42,
    'BZR': 36,
    'COX2': 42,
    'AIDS': 16,
    'ENZYMES': 33,
    'DD': 284,
    'MUTAG': 18,
    'NCI1': 30,
    'PROTEINS_full': 39,
    'PTC_MR': 14,
}

In [49]:
# Load every selected dataset; files live at <data_pwd>/<name>/<name>_*.txt.
datasets = {}
for dataset_name in graphs_name:
    file_prefix = f"{data_pwd}{dataset_name}/{dataset_name}"
    datasets[dataset_name] = GraphDataset(folder_path=file_prefix)
    print(dataset_name, " => loaded")



DHFR  => loaded
BZR  => loaded
COX2  => loaded
AIDS  => loaded
ENZYMES  => loaded
DD  => loaded
MUTAG  => loaded
NCI1  => loaded
PROTEINS_full  => loaded
PTC_MR  => loaded


### Spectral embedding, reshaping and training

Embedding of the nodes: creation of the spectral features

In [50]:
def padded_spectral(graph, embedding_dimension=16, normalized=True, random_state=None):
    """Return a fixed-length vector of the smallest Laplacian eigenvalues of a graph.

    The ``k = min(embedding_dimension + 1, n_nodes - 1)`` smallest eigenvalues
    of the Laplacian are computed, the smallest one is dropped, and the rest
    are written in ascending order to the front of a zero vector.

    Parameters
    ----------
    graph : networkx.Graph
        Graph to embed.
    embedding_dimension : int, default 16
        Length of the returned vector.
    normalized : bool, default True
        Use the normalized Laplacian when True, the combinatorial one otherwise.
    random_state : int or None, default None
        Seed for the ARPACK start vector. ``None`` draws it from NumPy's
        global random state, as before.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``embedding_dimension``; positions beyond the
        number of available eigenvalues are zero. Graphs with fewer than two
        nodes yield an all-zero vector.
    """
    # Import the submodule explicitly: `from scipy import sparse` alone may not
    # have loaded `scipy.sparse.linalg`.
    from scipy.sparse.linalg import eigsh

    # Padding with zeros
    embedding = np.zeros(embedding_dimension)

    n_nodes = graph.number_of_nodes()
    k = min(embedding_dimension + 1, n_nodes - 1)
    if k < 1:
        # No eigenvalue can be requested for an empty or single-node graph.
        return embedding

    if normalized:
        laplacian = nx.normalized_laplacian_matrix(graph)
    else:
        laplacian = nx.laplacian_matrix(graph)
    # The combinatorial Laplacian may carry an integer dtype; hand ARPACK floats.
    laplacian = laplacian.astype(float)

    if random_state is None:
        v0 = np.random.uniform(-1, 1, laplacian.shape[0])
    else:
        v0 = np.random.default_rng(random_state).uniform(-1, 1, laplacian.shape[0])

    # Minus the eigen decomposition of minus the Laplacian is more stable than directly
    # computing the eigen decomposition of the Laplacian
    eigenvalues = eigsh(-laplacian, k=k, sigma=1.0, which='LM', tol=1e-6, v0=v0,
                        return_eigenvectors=False)

    # Drop the smallest eigenvalue and store the remaining ones in ascending order.
    embedding[:len(eigenvalues) - 1] = sorted(-eigenvalues)[1:]

    return embedding

In [51]:
def dataset_embedding(dataset, embedding_dim, normalized=True):
    """Embed every graph of ``dataset`` and collect the matching labels.

    Each graph in ``dataset.graphs_`` is passed to ``padded_spectral`` and its
    class is read from ``graph.graph['label']``.

    Returns a pair ``(X, labels)`` of NumPy arrays built from the per-graph
    embeddings and the per-graph labels, in the order of ``dataset.graphs_``.
    """
    features, targets = [], []

    for graph in dataset.graphs_:
        features.append(padded_spectral(graph, embedding_dim, normalized))
        targets.append(graph.graph['label'])

    return np.array(features), np.array(targets)

In [52]:
# Spectral features and labels for each loaded dataset, keyed by dataset name.
embedded_datasets = {}

for name, dataset in datasets.items():
    features, targets = dataset_embedding(dataset, embedding_dims[name])
    embedded_datasets[name] = dict(X=features, y=targets)

Use `adjacency_matrix` instead

  import sys


Training for all datasets and embedding dimensions

In [53]:
# Random-forest hyper-parameters shared by all datasets.
rf_parameters = {'n_estimators': 500, 'max_depth': 100}
data_frame = {'Dataset': [], 'Accuracy': []}

for name in datasets:
    embedded = embedded_datasets[name]
    X, y = embedded['X'], embedded['y']

    # Fresh classifier and stratified 10-fold splitter for every dataset.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    rf = RandomForestClassifier(random_state=1, class_weight='balanced', **rf_parameters)

    cv_results = cross_val_score(rf, X, y, scoring='accuracy', cv=skf, n_jobs=1)

    # Report the mean accuracy over the folds.
    data_frame['Dataset'].append(name)
    data_frame['Accuracy'].append(np.mean(cv_results))

print(pd.DataFrame(data_frame))

         Dataset  Accuracy
0           DHFR  0.780316
1            BZR  0.834451
2           COX2  0.775254
3           AIDS  0.998500
4        ENZYMES  0.438333
5             DD  0.753057
6          MUTAG  0.877778
7           NCI1  0.750122
8  PROTEINS_full  0.724147
9         PTC_MR  0.595378
