In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
if 'google.colab' in sys.modules:
  %pip install -q stellargraph[demos]==1.1.0

In [3]:
import stellargraph as sg

try:
    sg.utils.validate_notebook_version("1.1.0")
except AttributeError:
    raise ValueError(
        f"This notebook requires StellarGraph version 1.1.0, but a different version {sg.__version__} is installed.  Please see <https://github.com/stellargraph/stellargraph/issues/1172>."
    ) from None

In [4]:
import numpy as np
import pandas as pd
import networkx as nx
import random
import stellargraph as sg

In [5]:
edges = pd.read_csv("/content/drive/My Drive/Enzyme/edgelist.csv")
features_df = pd.read_csv("/content/drive/My Drive/Enzyme/features.csv")

In [6]:
features_df.columns

Index(['Unnamed: 0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', 'node', 'graph', 'labels'],
      dtype='object')

In [7]:
graph_labels = features_df['labels']

In [8]:
features = features_df[['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11','12', '13', '14', '15', '16', '17', '18', 'node', 'graph']]
features = features.set_index("node")
features

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,graph
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,11.0,15.887014,37.78,-0.51,1.701,93.9,4.0,5.0,2.0,4.0,4.0,3.0,3.0,4.0,4.0,3.0,6.0,2.0,1
2,11.0,16.362935,40.38,-2.03,1.777,102.6,2.0,7.0,2.0,6.0,2.0,3.0,3.0,2.0,6.0,1.0,8.0,2.0,1
3,16.0,21.395072,63.35,2.04,2.981,136.0,2.0,7.0,7.0,6.0,4.0,6.0,6.0,2.0,8.0,2.0,7.0,7.0,1
4,6.0,8.881706,16.92,1.79,0.805,42.7,3.0,1.0,2.0,0.0,3.0,3.0,3.0,3.0,0.0,3.0,1.0,2.0,1
5,17.0,25.530009,53.41,-5.79,2.363,156.9,7.0,7.0,3.0,7.0,8.0,2.0,3.0,7.0,7.0,7.0,7.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19576,6.0,16.547985,23.03,3.16,1.069,46.3,1.0,3.0,2.0,1.0,2.0,3.0,3.0,1.0,2.0,1.0,3.0,2.0,600
19577,4.0,10.268664,20.70,1.15,0.984,34.5,0.0,2.0,2.0,2.0,0.0,2.0,2.0,0.0,2.0,0.0,2.0,2.0,600
19578,4.0,10.237717,20.70,1.15,0.984,34.5,0.0,2.0,2.0,2.0,0.0,2.0,2.0,0.0,2.0,0.0,2.0,2.0,600
19579,6.0,16.454257,23.03,3.16,1.069,46.3,1.0,3.0,2.0,1.0,2.0,3.0,3.0,1.0,2.0,1.0,3.0,2.0,600


In [9]:
edges = edges[['target', 'source']]
edges

Unnamed: 0,target,source
0,1,2
1,3,2
2,4,2
3,25,2
4,28,2
...,...,...
37277,19554,19555
37278,19578,19555
37279,19580,19555
37280,19580,19554


Building StellarGraph using the Edges and Features

In [10]:
graph = sg.StellarGraph(features, edges)
print(graph.info())

StellarGraph: Undirected multigraph
 Nodes: 19580, Edges: 37282

 Node types:
  default: [19580]
    Features: float32 vector, length 19
    Edge types: default-default->default

 Edge types:
    default-default->default: [37282]
        Weights: all 1 (default)
        Features: none


Building GCN Model and obtaining the Embeddings

In [11]:
from stellargraph.mapper import (
    CorruptedGenerator,
    FullBatchNodeGenerator,
    GraphSAGENodeGenerator,
    HinSAGENodeGenerator,
)
from stellargraph import StellarGraph
from stellargraph.layer import GCN, DeepGraphInfomax, GraphSAGE, GAT, APPNP, HinSAGE

from stellargraph import datasets
from stellargraph.utils import plot_history

import pandas as pd
from matplotlib import pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from IPython.display import display, HTML

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras import Model

In [12]:
fullbatch_generator = FullBatchNodeGenerator(graph, method='gcn')
gcn_model = GCN(layer_sizes=[8,8,8], activations=["relu", "relu", "relu"], generator=fullbatch_generator)
gen = fullbatch_generator.flow(graph.nodes())

Using GCN (local pooling) filters...


In [13]:
x_emb_in, x_emb_out = gcn_model.in_out_tensors()
x_out = tf.squeeze(x_emb_out, axis=0)
emb_model = Model(inputs=x_emb_in, outputs=x_out)

In [14]:
emb = emb_model.predict(gen)

Splitting Data Into Train and Test

In [18]:
from statistics import mean, stdev
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(emb, graph_labels, test_size=0.33, random_state=4)

In [21]:
from sklearn.metrics import roc_auc_score, f1_score,classification_report, confusion_matrix, accuracy_score

def getReport(clf, x_train, x_test, y_train, y_test):
  clf.fit(x_train, y_train)
  y_pred=clf.predict(x_test)
  accuracy=accuracy_score(y_pred, y_test)
  print("accuracy:",accuracy)
  cm = confusion_matrix(y_test, y_pred)
  tp = cm[0,0]
  tn = cm[1,1]
  fp = cm[0,1]
  fn = cm[1,0]
  precision = tp/(tp+fp)
  recall = tp/(tp+fn)
  f1 = 2*(recall * precision) / (recall + precision)
  print("Recall:",recall)
  print("Precision:",precision)
  print("F1:",f1)
  print(classification_report(y_test, y_pred))

LightBGM Classifier

In [22]:
import lightgbm as lgb

clf = lgb.LGBMClassifier(learning_rate=0.005)
getReport(clf, x_train, x_test, y_train, y_test)

accuracy: 0.8478799133395234
Recall: 0.8524727577535625
Precision: 0.9203619909502262
F1: 0.8851174934725847
              precision    recall  f1-score   support

           1       0.80      0.87      0.83      1171
           2       0.81      0.71      0.75      1001
           3       0.76      0.67      0.71       939
           4       0.82      0.93      0.87      1230
           5       0.90      0.92      0.91      1060
           6       0.99      0.95      0.97      1061

    accuracy                           0.85      6462
   macro avg       0.85      0.84      0.84      6462
weighted avg       0.85      0.85      0.85      6462



Random Forest Classifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

clf1 = RandomForestClassifier()
getReport(clf1, x_train, x_test, y_train, y_test)

accuracy: 0.8698545341999381
Recall: 0.8829059829059829
Precision: 0.9297929792979298
F1: 0.9057430951337132
              precision    recall  f1-score   support

           1       0.84      0.88      0.86      1171
           2       0.83      0.78      0.80      1001
           3       0.78      0.74      0.76       939
           4       0.87      0.90      0.88      1230
           5       0.91      0.92      0.92      1060
           6       0.98      0.97      0.97      1061

    accuracy                           0.87      6462
   macro avg       0.87      0.87      0.87      6462
weighted avg       0.87      0.87      0.87      6462

