In [1]:
import json
import os
import urllib
import urllib.request
from dataclasses import dataclass

import keras
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflowjs as tfjs
from sklearn.metrics import accuracy_score, roc_curve

2022-12-16 00:19:24.553928: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-16 00:19:24.745320: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-16 00:19:25.489540: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /software/gmp/6.2.1/b1/lib:/software/glpk/4.65/lib:/software/zmq/4.2.3/b1/lib:/software/git/2.30.1/lib64:/software/gcc/7.3.0/lib64:/software/gcc/7.3.0/lib:/software/openmpi/4.0.4/b1/lib:/software/cuda/11.4/usr/lo

In [2]:
# Report how many GPU devices TensorFlow can see on this machine.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  4


## Loading the solubility model (previously downloaded from GitHub) from local files

In [3]:
# Rebuild the model architecture from its JSON description. The context
# manager closes the file handle even if reading raises.
with open('model_weights_seeded_SOL.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = keras.models.model_from_json(loaded_model_json)
# Load the trained weights into the freshly built model.
model.load_weights("model_weights_seeded_SOL.h5")
print("Loaded model from disk.")

2022-12-16 00:19:33.512034: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-16 00:19:35.640366: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10786 MB memory:  -> device: 0, name: Tesla K80, pci bus id: 0000:08:00.0, compute capability: 3.7
2022-12-16 00:19:35.641647: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 10786 MB memory:  -> device: 1, name: Tesla K80, pci bus id: 0000:09:00.0, compute capability: 3.7
2022-12-16 00:19:35.643294: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 10786 MB m

Loaded model from disk.


# Getting test data

In [4]:
# Fetch the soluble / insoluble archives from the peptide-dashboard repository.
# NOTE: `urllib.request` has to be imported explicitly; a bare `import urllib`
# does not load the submodule.
DATA_BASE_URL = "https://github.com/ur-whitelab/peptide-dashboard/raw/master/ml/data/"
for npz_name in ("insoluble.npz", "soluble.npz"):
    # Skip the download when a local copy already exists, so that
    # Restart & Run All does not hit the network again.
    if not os.path.exists(npz_name):
        urllib.request.urlretrieve(DATA_BASE_URL + npz_name, npz_name)

# The arrays are read from the 'arr_0' key, which is the name np.savez gives
# to its first positional (unnamed) array.
with np.load("soluble.npz") as r:
    pos_data = r['arr_0']
with np.load("insoluble.npz") as r:
    neg_data = r['arr_0']

def counts_aa(vec):
    """Return the normalised histogram of the tokens in ``vec``.

    Builds a 21-bin histogram over the value range [0, 20], drops the first
    bin and divides the remaining 20 counts by their sum.

    NOTE(review): the first bin presumably corresponds to the padding/gap
    token 0 -- confirm against the encoding. If ``vec`` has no entries
    outside that first bin the sum is zero and the division is 0/0.
    """
    histogram = tf.histogram_fixed_width(vec, [0, 20], nbins=21)
    residue_counts = histogram[1:]
    total = tf.reduce_sum(residue_counts)
    return residue_counts / total
# One label per sequence, as a column vector: 1 for rows of pos_data,
# 0 for rows of neg_data. Both label arrays reuse pos_data's dtype.
positive_labels = np.ones((pos_data.shape[0], 1), dtype=pos_data.dtype)
negative_labels = np.zeros((neg_data.shape[0], 1), dtype=pos_data.dtype)
labels = np.concatenate((positive_labels, negative_labels), axis=0)
# Features are stacked in the same order (positives first) so rows stay aligned.
features = np.concatenate((pos_data, neg_data), axis=0)


In [5]:

@dataclass
class Config:
    """Container for the dataset and training settings used in this notebook."""
    # Number of distinct tokens; the instantiation below notes it includes the gap.
    vocab_size: int
    # Total number of examples; set from len(labels) when instantiated.
    example_number: int
    # Batch size applied to the test, validation and training datasets.
    batch_size: int
    # Shuffle buffer size used for the training dataset.
    buffer_size: int
    # NOTE(review): not read in the visible cells; presumably describes the
    # architecture of the loaded model -- confirm.
    rnn_units: int
    # NOTE(review): not read in the visible cells -- confirm usage.
    hidden_dim: int
    # NOTE(review): not read in the visible cells -- confirm usage.
    embedding_dim: int
    # NOTE(review): not read in the visible cells -- confirm usage.
    reg_strength: float
    # Initial learning rate handed to the cosine-decay schedule.
    lr: float
    # NOTE(review): not read in the visible cells -- confirm usage.
    drop_rate: float
        
# Single shared settings object; values are passed by keyword so the
# field order of Config does not matter here.
config = Config(
    vocab_size=21,  # include gap
    example_number=len(labels),
    batch_size=16,
    buffer_size=10000,
    rnn_units=64,
    hidden_dim=64,
    embedding_dim=32,
    reg_strength=0.01,
    lr=1e-4,
    drop_rate=0.2,
)

In [6]:
# Shuffle the raw arrays *before* building the TF dataset so that the
# test / validation / train splits taken below are random.
np.random.seed(0)  # Note: seed 0 is used for training. DO NOT CHANGE!

i = np.arange(len(labels))
np.random.shuffle(i)
shuffled_labels = labels[i]
shuffled_features = features[i]

# Each element becomes ((tokens, counts_aa(tokens)), label).
data = tf.data.Dataset.from_tensor_slices(
    (shuffled_features, shuffled_labels)
).map(lambda x, y: ((x, counts_aa(x)), y))

# Split: first 10% -> test, next 10% -> validation, remainder -> train.
N = len(data)
L = None  # was: features[0].shape[-1]
split = int(0.1 * N)

test_data = data.take(split).batch(config.batch_size)
nontest = data.skip(split)
val_data = nontest.take(split).batch(config.batch_size)
# Only the training split is shuffled and prefetched.
train_data = (
    nontest.skip(split)
    .shuffle(config.buffer_size)
    .batch(config.batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [9]:
# NumPy view of the test split: the same leading `split` rows that
# `data.take(split)` selects from the shuffled arrays.
X_test, y_test = shuffled_features[:split], shuffled_labels[:split]

In [7]:
use_tpu = False
decay_epochs = 50
# Cosine-decay schedule for the learning rate, starting at config.lr.
# NOTE(review): decay_steps is derived from N, the size of the whole dataset
# rather than of the training split -- confirm this is intended.
decay_steps = N // config.batch_size * decay_epochs
lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecay(
    config.lr, decay_steps, alpha=1e-3)
opt = tf.optimizers.Adam(lr_decayed_fn)
model.compile(
    opt,
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    steps_per_execution=60 if use_tpu else None,
    # The loss treats predictions as probabilities (from_logits=False), so a
    # threshold of 0 would count every prediction as positive; 0.5 is the
    # neutral decision boundary until a tuned threshold is available.
    metrics=[
        tf.keras.metrics.AUC(from_logits=False),
        tf.keras.metrics.BinaryAccuracy(threshold=0.5),
    ],
)

In [10]:
# Score the held-out test split. test_data is batched without shuffling, so
# the predictions line up row-for-row with y_test.
# (roc_curve and accuracy_score are imported in the import cell.)
y_hat_test = model.predict(test_data)
fpr, tpr, thresholds = roc_curve(y_test, y_hat_test, drop_intermediate=False)
# calculate the g-mean for each threshold
gmeans = np.sqrt(tpr * (1 - fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
best_accuracy_threshold = thresholds[ix]
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))
# Vectorised thresholding: strictly greater than the threshold -> class 1.
# ravel() flattens the predictions into one label per test example.
adjusted_y_hat_test = (y_hat_test > best_accuracy_threshold).astype(int).ravel()
acc = accuracy_score(y_test, adjusted_y_hat_test, normalize=True)
print(f'Accuracy: {acc:.3f}')

Best Threshold=0.516320, G-Mean=0.697
Accuracy: 0.710


In [11]:
# Re-compile so that the accuracy metric uses the decision threshold tuned
# on the ROC curve; optimizer and loss are the same as before.
tuned_metrics = [
    tf.keras.metrics.AUC(from_logits=False),
    tf.keras.metrics.BinaryAccuracy(threshold=best_accuracy_threshold),
]
model.compile(
    opt,
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    steps_per_execution=60 if use_tpu else None,
    metrics=tuned_metrics,
)

In [12]:
def filter_lengths(X, y, min_length=10, max_length=80):
    """Keep only the rows of ``X`` (and matching rows of ``y``) within a length range.

    The length of a row is the number of non-zero entries along axis 1.
    Both bounds are exclusive: a row is kept when
    ``min_length < length < max_length``.

    Args:
        X: array with at least two dimensions; rows are the examples.
        y: array indexed by the same rows as ``X``.
        min_length: exclusive lower bound on the row length.
        max_length: exclusive upper bound on the row length.

    Returns:
        Tuple ``(X_kept, y_kept)`` containing the selected rows.
    """
    row_lengths = np.count_nonzero(X, axis=1)
    keep = (row_lengths > min_length) & (row_lengths < max_length)
    return X[keep], y[keep]

## No length filter (widest range used: 1 < length < 200)

In [13]:
# Bounds are exclusive in filter_lengths: keeps rows with 1 < length < 200.
X_test_f, y_test_f = filter_lengths(X_test, y_test, min_length=1, max_length=200)

# Rebuild the ((tokens, counts_aa(tokens)), label) dataset and batch it.
filtered_test_data = (
    tf.data.Dataset.from_tensor_slices((X_test_f, y_test_f))
    .map(lambda x, y: ((x, counts_aa(x)), y))
    .batch(config.batch_size)
)
# Output is the loss followed by the compiled metrics (AUC, binary accuracy).
model.evaluate(filtered_test_data)



[0.5769950747489929, 0.7561261057853699, 0.7100270986557007]

## Length filter 1 - 50

In [14]:
# Bounds are exclusive in filter_lengths: keeps rows with 1 < length < 50.
X_test_f, y_test_f = filter_lengths(X_test, y_test, min_length=1, max_length=50)

# Rebuild the ((tokens, counts_aa(tokens)), label) dataset and batch it.
filtered_test_data = (
    tf.data.Dataset.from_tensor_slices((X_test_f, y_test_f))
    .map(lambda x, y: ((x, counts_aa(x)), y))
    .batch(config.batch_size)
)
# Output is the loss followed by the compiled metrics (AUC, binary accuracy).
model.evaluate(filtered_test_data)



[0.22285817563533783, 0.9523809552192688, 0.9130434989929199]

## Length filter 50 - 100

In [15]:
# Bounds are exclusive in filter_lengths: keeps rows with 50 < length < 100.
X_test_f, y_test_f = filter_lengths(X_test, y_test, min_length=50, max_length=100)

# Rebuild the ((tokens, counts_aa(tokens)), label) dataset and batch it.
filtered_test_data = (
    tf.data.Dataset.from_tensor_slices((X_test_f, y_test_f))
    .map(lambda x, y: ((x, counts_aa(x)), y))
    .batch(config.batch_size)
)
# Output is the loss followed by the compiled metrics (AUC, binary accuracy).
model.evaluate(filtered_test_data)



[0.5421819090843201, 0.7949349880218506, 0.7242646813392639]

## Length filter 100 - 150

In [16]:
# Bounds are exclusive in filter_lengths: keeps rows with 100 < length < 150.
X_test_f, y_test_f = filter_lengths(X_test, y_test, min_length=100, max_length=150)

# Rebuild the ((tokens, counts_aa(tokens)), label) dataset and batch it.
filtered_test_data = (
    tf.data.Dataset.from_tensor_slices((X_test_f, y_test_f))
    .map(lambda x, y: ((x, counts_aa(x)), y))
    .batch(config.batch_size)
)
# Output is the loss followed by the compiled metrics (AUC, binary accuracy).
model.evaluate(filtered_test_data)



[0.5798361897468567, 0.7549368143081665, 0.703496515750885]

## Length filter 150 - 200

In [17]:
# Bounds are exclusive in filter_lengths: keeps rows with 150 < length < 200.
X_test_f, y_test_f = filter_lengths(X_test, y_test, min_length=150, max_length=200)

# Rebuild the ((tokens, counts_aa(tokens)), label) dataset and batch it.
filtered_test_data = (
    tf.data.Dataset.from_tensor_slices((X_test_f, y_test_f))
    .map(lambda x, y: ((x, counts_aa(x)), y))
    .batch(config.batch_size)
)
# Output is the loss followed by the compiled metrics (AUC, binary accuracy).
model.evaluate(filtered_test_data)



[0.6005921959877014, 0.7351933717727661, 0.7022332549095154]

## Length filter 1 - 100

In [18]:
# Bounds are exclusive in filter_lengths: keeps rows with 1 < length < 100.
X_test_f, y_test_f = filter_lengths(X_test, y_test, min_length=1, max_length=100)

# Rebuild the ((tokens, counts_aa(tokens)), label) dataset and batch it.
filtered_test_data = (
    tf.data.Dataset.from_tensor_slices((X_test_f, y_test_f))
    .map(lambda x, y: ((x, counts_aa(x)), y))
    .batch(config.batch_size)
)
# Output is the loss followed by the compiled metrics (AUC, binary accuracy).
model.evaluate(filtered_test_data)



[0.5172854661941528, 0.8138336539268494, 0.7389830350875854]