# Bis438 Final Project Problem 2

## Import Python Libraries

In [1]:
import tensorflow as tf
import numpy as np
import deepchem as dc

from model import GCN
from utils import process_prediction, make_feature, split_data

  from numpy.core.umath_tests import inner1d


## Build GraphConv Model

In [2]:
batch_size = 50
model = GCN(batch_size=batch_size) # build model

## Training GraphConv Model and Calculate ROC-AUC

In [None]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification") # define metric as roc_auc_score

num_models = 10 # the number of iteration
rocauc_train = [] # save rocauc value for training dataset
rocauc_valid = [] # save rocauc value for validation dataset
rocauc_test = [] # save rocauc value for test dataset

# Do featurization
feature = make_feature('HIV','GraphConv')

for i in range(num_models):
    # Load ith dataset with GraphConv Featurizer and random split
    train_dataset, valid_dataset, test_dataset = split_data(feature)
    
    # Fitting ith model with training dataset
    model.fit(train_dataset, 3) # fitting with training epoch 3
    
    # Evaluating model   
    # save rocauc for training dataset
    pred_train = model.predict(train_dataset)
    pred_train = process_prediction(train_dataset.y, pred_train)
    train_scores = metric.compute_metric(train_dataset.y, pred_train, train_dataset.w)
    rocauc_train.append(train_scores)

    # save rocauc for valid dataset
    pred_valid = model.predict(valid_dataset)
    pred_valid = process_prediction(valid_dataset.y, pred_valid)
    valid_scores = metric.compute_metric(valid_dataset.y, pred_valid, valid_dataset.w)
    rocauc_valid.append(valid_scores)
   
    # save rocauc for test dataset
    pred_test = model.predict(test_dataset)
    pred_test = process_prediction(test_dataset.y, pred_test)
    test_scores = metric.compute_metric(test_dataset.y, pred_test, test_dataset.w)
    rocauc_test.append(test_scores)   
    
    # print rocauc result
    print("Evaluating model number ", i)
    print("Train ROC-AUC Score: " , train_scores,  
          "Valid ROC-AUC Score: " , valid_scores,
          "Test ROC-AUC Score: " , test_scores)    

Loading raw samples now.
shard_size: 8192
About to start loading CSV from ../data/HIV.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 14.708 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 1 took 15.685 s
Loading shard 3 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 2 took 15.883 s
Loading shard 4 of size 8192.
Featurizing sample 0
Feat

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


computed_metrics: [0.8013534854443946]
computed_metrics: [0.7511128237154361]
computed_metrics: [0.7960931606738748]
Evaluating model number  0
Train ROC-AUC Score:  0.8013534854443946 Valid ROC-AUC Score:  0.7511128237154361 Test ROC-AUC Score:  0.7960931606738748
TIMING: dataset construction took 15.501 s
Loading dataset from disk.
TIMING: dataset construction took 6.545 s
Loading dataset from disk.
TIMING: dataset construction took 6.766 s
Loading dataset from disk.
computed_metrics: [0.8383357800318403]
computed_metrics: [0.7899448736506748]
computed_metrics: [0.7986599799398195]
Evaluating model number  1
Train ROC-AUC Score:  0.8383357800318403 Valid ROC-AUC Score:  0.7899448736506748 Test ROC-AUC Score:  0.7986599799398195
TIMING: dataset construction took 15.514 s
Loading dataset from disk.
TIMING: dataset construction took 6.853 s
Loading dataset from disk.
TIMING: dataset construction took 6.778 s
Loading dataset from disk.
computed_metrics: [0.8689424004206228]
computed_metr

## Calculate mean value of ROC-AUC and use std1 for error bar

In [None]:
values = []
values.append(np.mean(rocauc_train))
values.append(np.mean(rocauc_valid))
values.append(np.mean(rocauc_test))
stds = []
stds.append(np.std(rocauc_train))
stds.append(np.std(rocauc_valid))
stds.append(np.std(rocauc_test))

## Plot ROC-AUC Score

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
topics = ['train', 'valid', 'test']
plt.bar(topics, values, yerr=stds, capsize=2)

plt.title('GCN Model')
plt.xlabel('Dataset')
plt.ylabel('ROC-AUC Score')
plt.show()