In [38]:
"""Multilayer Perceptron for drug response problem"""

from __future__ import division, print_function

import argparse
import csv
import logging
import sys
import json

import numpy as np

from keras import backend as K
from keras import metrics
from keras.models import Sequential, load_model
from sklearn.metrics import r2_score
from keras.layers import Activation, BatchNormalization, Dense, Dropout, LocallyConnected1D, Conv1D, MaxPooling1D, Flatten, Conv2D, LocallyConnected2D
from keras.callbacks import Callback, ModelCheckpoint, ProgbarLogger

# For non-interactive plotting
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt


import p1b3_combat_ccle as benchmark
import candle

sys.argv = [''] # for Jupyter nbs

#cfg = K.tf.ConfigProto(gpu_options={'allow_growth': True})
#K.set_session(K.tf.Session(config=cfg))

In [2]:
import keras

keras.__version__

'2.3.1'

In [3]:
def initialize_parameters(default_model = 'p1b3_default_model.txt'):
    
    # Build benchmark object
    p1b3Bmk = benchmark.BenchmarkP1B3(benchmark.file_path, default_model, 'keras',
    prog='p1b3_baseline', desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1')
    
    
    # Initialize parameters
    gParameters = candle.finalize_parameters(p1b3Bmk)
    #benchmark.logger.info('Params: {}'.format(gParameters))

    return gParameters

def str2lst(string_val):
    result = [int(x) for x in string_val.split(' ')]
    return result


def evaluate_keras_metric(y_true, y_pred, metric):
    #objective_function = metrics.get(metric)
    objective_function = metrics.RootMeanSquaredError()
    objective = objective_function(y_true, y_pred)
    return K.eval(objective)


def evaluate_model(model, generator, steps, metric, category_cutoffs=[0.]):
    y_true, y_pred = None, None
    count = 0
    while count < steps:
        x_batch, y_batch = next(generator)
        y_batch_pred = model.predict_on_batch(x_batch)
        y_batch_pred = y_batch_pred.ravel()
        y_true = np.concatenate((y_true, y_batch)) if y_true is not None else y_batch
        y_pred = np.concatenate((y_pred, y_batch_pred)) if y_pred is not None else y_batch_pred
        count += 1

    loss = evaluate_keras_metric(y_true.astype(np.float32), y_pred.astype(np.float32), metric)

    y_true_class = np.digitize(y_true, category_cutoffs)
    y_pred_class = np.digitize(y_pred, category_cutoffs)

    # theano does not like integer input
    acc = evaluate_keras_metric(y_true_class.astype(np.float32), y_pred_class.astype(np.float32), 'binary_accuracy')  # works for multiclass labels as well

    return loss, acc, y_true, y_pred, y_true_class, y_pred_class


def plot_error(y_true, y_pred, batch, file_ext, file_pre='output_dir', subsample=1000):
    if batch % 10:
        return

    total = len(y_true)
    if subsample and subsample < total:
        usecols = np.random.choice(total, size=subsample, replace=False)
        y_true = y_true[usecols]
        y_pred = y_pred[usecols]

    y_true = y_true * 100
    y_pred = y_pred * 100
    diffs = y_pred - y_true

    bins = np.linspace(-200, 200, 100)
    if batch == 0:
        y_shuf = np.random.permutation(y_true)
        plt.hist(y_shuf - y_true, bins, alpha=0.5, label='Random')

    #plt.hist(diffs, bins, alpha=0.35-batch/100., label='Epoch {}'.format(batch+1))
    plt.hist(diffs, bins, alpha=0.3, label='Epoch {}'.format(batch+1))
    plt.title("Histogram of errors in percentage growth")
    plt.legend(loc='upper right')
    plt.savefig(file_pre+'.histogram'+file_ext+'.b'+str(batch)+'.png')
    plt.close()

    # Plot measured vs. predicted values
    fig, ax = plt.subplots()
    plt.grid('on')
    ax.scatter(y_true, y_pred, color='red', s=10)
    ax.plot([y_true.min(), y_true.max()],
            [y_true.min(), y_true.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.savefig(file_pre+'.diff'+file_ext+'.b'+str(batch)+'.png')
    plt.close()

In [4]:
class MyLossHistory(Callback):
    def __init__(self, progbar, val_gen, test_gen, val_steps, test_steps, metric, category_cutoffs=[0.], ext='', pre='save'):
        super(MyLossHistory, self).__init__()
        self.progbar = progbar
        self.val_gen = val_gen
        self.test_gen = test_gen
        self.val_steps = val_steps
        self.test_steps = test_steps
        self.metric = metric
        self.category_cutoffs = category_cutoffs
        self.pre = pre
        self.ext = ext

    def on_train_begin(self, logs={}):
        self.best_val_loss = np.Inf
        self.best_val_acc = -np.Inf

    def on_epoch_end(self, batch, logs={}):
        val_loss, val_acc, y_true, y_pred, y_true_class, y_pred_class = evaluate_model(self.model, self.val_gen, self.val_steps, self.metric, self.category_cutoffs)
        test_loss, test_acc, _, _, _, _ = evaluate_model(self.model, self.test_gen, self.test_steps, self.metric, self.category_cutoffs)
        self.progbar.append_extra_log_values([('val_acc', val_acc), ('test_loss', test_loss), ('test_acc', test_acc)])
        if float(logs.get('val_loss', 0)) < self.best_val_loss:
            plot_error(y_true, y_pred, batch, self.ext, self.pre)
        self.best_val_loss = min(float(logs.get('val_loss', 0)), self.best_val_loss)
        self.best_val_acc = max(float(logs.get('val_acc', 0)), self.best_val_acc)

In [5]:
class MyProgbarLogger(ProgbarLogger):
    def __init__(self, samples):
        super(MyProgbarLogger, self).__init__(count_mode='samples')
        self.samples = samples

    def on_train_begin(self, logs=None):
        super(MyProgbarLogger, self).on_train_begin(logs)
        self.verbose = 1
        self.extra_log_values = []
        self.params['samples'] = self.samples

    def on_batch_begin(self, batch, logs=None):
        if self.seen < self.target:
            self.log_values = []
            self.extra_log_values = []

    def append_extra_log_values(self, tuples):
        for k, v in tuples:
            self.extra_log_values.append((k, v))

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        epoch_log = 'Epoch {}/{}'.format(epoch + 1, self.epochs)
        for k in self.params['metrics']:
            if k in logs:
                self.log_values.append((k, logs[k]))
                epoch_log += ' - {}: {:.4f}'.format(k, logs[k])
        for k, v in self.extra_log_values:
            self.log_values.append((k, v))
            epoch_log += ' - {}: {:.4f}'.format(k, float(v))
        if self.verbose:
            self.progbar.update(self.seen, self.log_values)
        benchmark.logger.debug(epoch_log)

In [6]:
def add_conv_layer(model, layer_params, input_dim=None, locally_connected=False):
    if len(layer_params) == 3: # 1D convolution
        filters = layer_params[0]
        filter_len = layer_params[1]
        stride = layer_params[2]
        if locally_connected:
            if input_dim:
                model.add(LocallyConnected1D(filters, filter_len, strides=stride, input_shape=(input_dim, 1)))
            else:
                model.add(LocallyConnected1D(filters, filter_len, strides=stride))
        else:
            if input_dim:
                model.add(Conv1D(filters, filter_len, strides=stride, input_shape=(input_dim, 1)))
            else:
                model.add(Conv1D(filters, filter_len, strides=stride))
    elif len(layer_params) == 5: # 2D convolution
        filters = layer_params[0]
        filter_len = (layer_params[1], layer_params[2])
        stride = (layer_params[3], layer_params[4])
        if locally_connected:
            if input_dim:
                model.add(LocallyConnected2D(filters, filter_len, strides=stride, input_shape=(input_dim, 1)))
            else:
                model.add(LocallyConnected2D(filters, filter_len, strides=stride))
        else:
            if input_dim:
                model.add(Conv2D(filters, filter_len, strides=stride, input_shape=(input_dim, 1)))
            else:
                model.add(Conv2D(filters, filter_len, strides=stride))
    return model

In [7]:
gParameters = initialize_parameters()
#gParameters['cell_features'] = 'all'
#gParameters['drug_features'] = 'all'
benchmark.check_params(gParameters)

Params:
{'activation': 'relu',
 'batch_normalization': False,
 'batch_size': 100,
 'category_cutoffs': [0.0],
 'cell_features': ['expression'],
 'cell_noise_sigma': 0.0,
 'data_type': <class 'numpy.float32'>,
 'dense': [1000, 500, 100, 50],
 'dropout': 0.1,
 'drug_features': ['descriptors'],
 'epochs': 50,
 'experiment_id': 'EXP000',
 'feature_subsample': 0,
 'initialization': 'normal',
 'learning_rate': 0.001,
 'logfile': None,
 'loss': 'mse',
 'max_logconc': -4.0,
 'min_logconc': -5.0,
 'optimizer': 'sgd',
 'output_dir': '/lustre/schandra_crpl/users/2216/NCI-DOE-Collab-Pilot1-Single-Drug-Response-Predictor/Pilot1/P1B3/save/EXP000/RUN000',
 'profiling': False,
 'rng_seed': 2017,
 'run_id': 'RUN000',
 'scaling': 'std',
 'scramble': False,
 'shuffle': False,
 'subsample': 'naive_balancing',
 'test_cell_split': 0.15,
 'timeout': -1,
 'train_bool': True,
 'val_split': 0.1,
 'verbose': None,
 'workers': 1}


In [8]:
"""
Runs the model using the specified set of parameters

Args:
   gParameters: a python dictionary containing the parameters (e.g. epoch)
   to run the model with.
"""
#
if 'dense' in gParameters:
    dval = gParameters['dense']
    if type(dval) != list:
        res = list(dval)
    #try:
        #is_str = isinstance(dval, basestring)
    #except NameError:
        #is_str = isinstance(dval, str)
    #if is_str:
        #res = str2lst(dval)
        gParameters['dense'] = res
    print(gParameters['dense'])

if 'conv' in gParameters:
    flat = gParameters['conv']
    gParameters['conv'] = [flat[i:i+3] for i in range(0, len(flat), 3)]
    #conv_list = p1_common.parse_conv_list(gParameters['conv'])
    #cval = gParameters['conv']
    #try:
    #    is_str = isinstance(cval, basestring)
    #except NameError:
    #    is_str = isinstance(cval, str)
    #if is_str:
    #    res = str2lst(cval)
    #    gParameters['conv'] = res
    print('Conv input', gParameters['conv'])
# print('Params:', gParameters)
# Construct extension to save model
ext = benchmark.extension_from_parameters(gParameters, '.keras')
logfile = gParameters['logfile'] if gParameters['logfile'] else gParameters['output_dir']+ext+'.log'

fh = logging.FileHandler(logfile)
fh.setFormatter(logging.Formatter("[%(asctime)s %(process)d] %(message)s", datefmt="%Y-%m-%d %H:%M:%S"))
fh.setLevel(logging.DEBUG)

sh = logging.StreamHandler()
sh.setFormatter(logging.Formatter(''))
sh.setLevel(logging.DEBUG if gParameters['verbose'] else logging.INFO)

benchmark.logger.setLevel(logging.DEBUG)
benchmark.logger.addHandler(fh)
benchmark.logger.addHandler(sh)
benchmark.logger.info('Params: {}'.format(gParameters))

# Get default parameters for initialization and optimizer functions
kerasDefaults = candle.keras_default_config()
seed = gParameters['rng_seed']

# Build dataset loader object
loader = benchmark.DataLoader(seed=seed, dtype=gParameters['data_type'],
                         val_split=gParameters['val_split'],
                         test_cell_split=gParameters['test_cell_split'],
                         cell_features=gParameters['cell_features'],
                         drug_features=gParameters['drug_features'],
                         feature_subsample=gParameters['feature_subsample'],
                         scaling=gParameters['scaling'],
                         scramble=gParameters['scramble'],
                         min_logconc=gParameters['min_logconc'],
                         max_logconc=gParameters['max_logconc'],
                         subsample=gParameters['subsample'],
                         category_cutoffs=gParameters['category_cutoffs'])

# Initialize weights and learning rule
initializer_weights = candle.build_initializer(gParameters['initialization'], kerasDefaults, seed)
initializer_bias = candle.build_initializer('constant', kerasDefaults, 0.)

activation = gParameters['activation']

# Define model architecture
gen_shape = None
out_dim = 1



model = Sequential()
if 'dense' in gParameters: # Build dense layers
    for layer in gParameters['dense']:
        if layer:
            model.add(Dense(layer, input_dim=loader.input_dim,
                        kernel_initializer=initializer_weights,
                        bias_initializer=initializer_bias))
            if gParameters['batch_normalization']:
                model.add(BatchNormalization())
            model.add(Activation(gParameters['activation']))
            if gParameters['dropout']:
                model.add(Dropout(gParameters['dropout']))
else: # Build convolutional layers
    gen_shape = 'add_1d'
    layer_list = list(range(0, len(gParameters['conv'])))
    lc_flag=False
    if 'locally_connected' in gParameters:
        lc_flag = True

    for l, i in enumerate(layer_list):
        if i == 0:
            add_conv_layer(model, gParameters['conv'][i], input_dim=loader.input_dim,locally_connected=lc_flag)
        else:
            add_conv_layer(model, gParameters['conv'][i],locally_connected=lc_flag)
        if gParameters['batch_normalization']:
                model.add(BatchNormalization())
        model.add(Activation(gParameters['activation']))
        if gParameters['pool']:
            model.add(MaxPooling1D(pool_size=gParameters['pool']))
    model.add(Flatten())

model.add(Dense(out_dim))

# Define optimizer
optimizer = candle.build_optimizer(gParameters['optimizer'],
                                        gParameters['learning_rate'],
                                        kerasDefaults)

# Compile and display model
model.compile(loss=gParameters['loss'], optimizer=optimizer, metrics=[metrics.RootMeanSquaredError()])
model.summary()
benchmark.logger.debug('Model: {}'.format(model.to_json()))



Params: {'dense': [1000, 500, 100, 50], 'batch_size': 100, 'epochs': 50, 'activation': 'relu', 'loss': 'mse', 'optimizer': 'sgd', 'learning_rate': 0.001, 'scaling': 'std', 'dropout': 0.1, 'feature_subsample': 0, 'val_split': 0.1, 'rng_seed': 2017, 'initialization': 'normal', 'min_logconc': -5.0, 'max_logconc': -4.0, 'category_cutoffs': [0.0], 'test_cell_split': 0.15, 'cell_features': ['expression'], 'drug_features': ['descriptors'], 'subsample': 'naive_balancing', 'batch_normalization': False, 'cell_noise_sigma': 0.0, 'output_dir': '/lustre/schandra_crpl/users/2216/NCI-DOE-Collab-Pilot1-Single-Drug-Response-Predictor/Pilot1/P1B3/save/EXP000/RUN000', 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'profiling': False, 'scramble': False, 'workers': 1, 'data_type': <class 'numpy.float32'>, 'timeout': -1}


[1000, 500, 100, 50]


Loaded 11670 unique (D, CL) response sets.
  converters ={'NAME' : str})
Distribution of dose response:
               AUC
count  8851.000000
mean      0.777905
std       0.163426
min       0.122400
25%       0.704300
50%       0.819800
75%       0.896000
max       1.000000
Category cutoffs: [0.0]
Dose response bin counts:
  Class 0:       0 (0.0000) - between +0.00 and +0.00
  Class 1:    8851 (1.0000) - between +0.00 and +0.01
  Total:      8851
Rows in train: 7806, val: 867, test: 178
Input features shapes:
  drug_concentration: (1,)
  drug_descriptors: (3837,)
  cell_expression: (942,)
Total input dimensions: 4779


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1000)              4780000   
_________________________________________________________________
activation_1 (Activation)    (None, 1000)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 500)               500500    
_________________________________________________________________
activation_2 (Activation)    (None, 500)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 500)               0         
_______________________________________

In [30]:
#!pip install keras-tuner -q
gParameters['epochs'] = 

In [31]:


train_gen = benchmark.DataGenerator(loader, batch_size=gParameters['batch_size'], shape=gen_shape, name='train_gen', cell_noise_sigma=gParameters['cell_noise_sigma']).flow()
val_gen = benchmark.DataGenerator(loader, partition='val', batch_size=gParameters['batch_size'], shape=gen_shape, name='val_gen').flow()
val_gen2 = benchmark.DataGenerator(loader, partition='val', batch_size=gParameters['batch_size'], shape=gen_shape, name='val_gen2').flow()
test_gen = benchmark.DataGenerator(loader, partition='test', batch_size=gParameters['batch_size'], shape=gen_shape, name='test_gen').flow()

train_steps = int(loader.n_train/gParameters['batch_size'])
val_steps = int(loader.n_val/gParameters['batch_size'])
test_steps = int(loader.n_test/gParameters['batch_size'])

if 'train_steps' in gParameters:
    train_steps = gParameters['train_steps']
if 'val_steps' in gParameters:
    val_steps = gParameters['val_steps']
if 'test_steps' in gParameters:
    test_steps = gParameters['test_steps']

checkpointer = ModelCheckpoint(filepath=gParameters['output_dir']+'.model'+ext+'.h5', save_best_only=True)
progbar = MyProgbarLogger(train_steps * gParameters['batch_size'])
loss_history = MyLossHistory(progbar=progbar, val_gen=val_gen2, test_gen=test_gen,
                        val_steps=val_steps, test_steps=test_steps,
                        metric=gParameters['loss'], category_cutoffs=gParameters['category_cutoffs'],
                        ext=ext, pre=gParameters['output_dir'])

In [32]:
# Seed random generator for training
np.random.seed(seed)

candleRemoteMonitor = candle.CandleRemoteMonitor(params=gParameters)

history = model.fit_generator(train_gen, train_steps,
                    epochs=gParameters['epochs'],
                    validation_data=val_gen,
                    validation_steps=val_steps,
                    verbose=0,
                    callbacks=[checkpointer, loss_history, progbar, candleRemoteMonitor],
                    )

benchmark.logger.removeHandler(fh)
benchmark.logger.removeHandler(sh)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250


Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250


Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78/250
Epoch 79/250
Epoch 80/250
Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 100/250
Epoch 101/250
Epoch 102/250


Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250
Epoch 114/250
Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250
Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250
Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch 133/250
Epoch 134/250
Epoch 135/250
Epoch 136/250


Epoch 137/250
Epoch 138/250
Epoch 139/250
Epoch 140/250
Epoch 141/250
Epoch 142/250
Epoch 143/250
Epoch 144/250
Epoch 145/250
Epoch 146/250
Epoch 147/250
Epoch 148/250
Epoch 149/250
Epoch 150/250
Epoch 151/250
Epoch 152/250
Epoch 153/250
Epoch 154/250
Epoch 155/250
Epoch 156/250
Epoch 157/250
Epoch 158/250
Epoch 159/250
Epoch 160/250
Epoch 161/250
Epoch 162/250
Epoch 163/250
Epoch 164/250
Epoch 165/250
Epoch 166/250
Epoch 167/250
Epoch 168/250
Epoch 169/250
Epoch 170/250


Epoch 171/250
Epoch 172/250
Epoch 173/250
Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250
Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 184/250
Epoch 185/250
Epoch 186/250
Epoch 187/250
Epoch 188/250
Epoch 189/250
Epoch 190/250
Epoch 191/250
Epoch 192/250
Epoch 193/250
Epoch 194/250
Epoch 195/250
Epoch 196/250
Epoch 197/250
Epoch 198/250
Epoch 199/250
Epoch 200/250
Epoch 201/250
Epoch 202/250
Epoch 203/250
Epoch 204/250


Epoch 205/250
Epoch 206/250
Epoch 207/250
Epoch 208/250
Epoch 209/250
Epoch 210/250
Epoch 211/250
Epoch 212/250
Epoch 213/250
Epoch 214/250
Epoch 215/250
Epoch 216/250
Epoch 217/250
Epoch 218/250
Epoch 219/250
Epoch 220/250
Epoch 221/250
Epoch 222/250
Epoch 223/250
Epoch 224/250
Epoch 225/250
Epoch 226/250
Epoch 227/250
Epoch 228/250
Epoch 229/250
Epoch 230/250
Epoch 231/250
Epoch 232/250
Epoch 233/250
Epoch 234/250
Epoch 235/250
Epoch 236/250
Epoch 237/250
Epoch 238/250


Epoch 239/250
Epoch 240/250
Epoch 241/250
Epoch 242/250
Epoch 243/250
Epoch 244/250
Epoch 245/250
Epoch 246/250
Epoch 247/250
Epoch 248/250
Epoch 249/250
Epoch 250/250


In [46]:
#model.save('./save/250epochs.h5')

def rmse(y_true, y_pred):
    return K.sqrt(mean_squared_error(y_true, y_pred))


# Load the saved model from file
#loaded_model = load_model('./save/EXP000/RUN000.model.keras.A=relu.B=100.D=0.1.E=50.N=0.0.D1=1000.D2=500.D3=100.D4=50.S=std.h5')
loaded_model = load_model('./save/250epochs.h5', compile={'metrics':  [metrics.RootMeanSquaredError()]})
loaded_model.compile(loss=gParameters['loss'], optimizer=optimizer, metrics=[rmse])


# Evaluate the loaded model on test data
test_loss, test_acc = loaded_model.evaluate(val_test, val_auc)

print(test_loss, test_accuracy)

ValueError: Unknown metric function: {'class_name': 'RootMeanSquaredError', 'config': {'name': 'root_mean_squared_error', 'dtype': 'float32'}}

In [13]:
import pandas as pd

df_test_cell = pd.read_csv('./data/val_cell.csv')
df_test_drug = pd.read_csv('./data/val_drug.csv')

val_data = pd.read_csv('./data/val_data.tsv', sep='\t')
val_data.head()

Unnamed: 0.1,Unnamed: 0,celline,DRUG,AUC,MW,AMW,Sv,Se,Sp,Si,...,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10,drug_encoded,celline_encoded
0,92,CCLE.697,CCLE.1,0.7692,439.61,7.09,39.143,61.44,41.62,69.683,...,2.047,-0.9717,2.723,2.46,2.078,1.812,1.619,2.334,0,4
1,93,CCLE.697,CCLE.10,0.7777,475.4,8.804,34.718,54.523,36.597,61.16,...,2.047,-0.9717,2.723,2.46,2.078,1.812,1.619,2.334,1,4
2,94,CCLE.697,CCLE.11,0.3723,349.47,7.132,30.762,48.796,32.54,55.066,...,2.047,-0.9717,2.723,2.46,2.078,1.812,1.619,2.334,2,4
3,95,CCLE.697,CCLE.12,0.7869,464.86,9.685,33.422,50.32,33.148,54.906,...,2.047,-0.9717,2.723,2.46,2.078,1.812,1.619,2.334,3,4
4,96,CCLE.697,CCLE.14,0.4337,421.49,7.805,34.906,54.778,35.904,60.694,...,2.047,-0.9717,2.723,2.46,2.078,1.812,1.619,2.334,5,4


In [14]:
val_data.isna().any().any()

True

In [15]:
val_data.shape

(1088, 4785)

In [16]:
val_data = val_data.dropna()

In [17]:
val_auc = val_data['AUC'].to_numpy()
val_drugs = val_data['DRUG']

val_test = val_data.drop(columns=['celline', 'DRUG', 'AUC', 'celline_encoded', 'drug_encoded', 'Unnamed: 0'])
val_test = val_test.to_numpy()

In [18]:
val_test.shape

(922, 4779)

In [19]:
val_test[0].shape

(4779,)

In [33]:
loss, accuracy = model.evaluate(val_test, val_auc)



In [34]:
print(loss, accuracy)

11.350824033360677 3.369098424911499


In [35]:
y_pred = model.predict(val_test)
r_squared = r2_score(val_auc, y_pred)
print("R-squared: ", r_squared)

R-squared:  -619.9607695541692


In [11]:
from itertools import cycle, islice

In [12]:
range(loader.n_train)

range(0, 7806)

In [13]:
range(loader.total)[-loader.n_val:]

range(7806, 8673)

In [14]:
range(loader.total, loader.total + loader.n_test)

range(8673, 8851)

In [23]:
## train
train_cycle = cycle(range(loader.n_train))
train_indices = list(islice(train_cycle, loader.n_train))
df_train = loader.df_response.iloc[train_indices, :]
df_train = df_train.merge(loader.df_cell_expr, on='CELLNAME')
df_train.shape

(7806, 945)

In [24]:
## val
val_cycle = cycle(range(loader.total)[-loader.n_val:])
val_indices = list(islice(val_cycle, loader.n_val))
df_val = loader.df_response.iloc[val_indices, :]
df_val = df_val.merge(loader.df_cell_expr, on='CELLNAME')
df_val.shape

(867, 945)

In [25]:
## test
test_cycle = cycle(range(loader.total, loader.total + loader.n_test))
test_indices = list(islice(test_cycle, loader.n_test))
df_test = loader.df_response.iloc[test_indices, :]
df_test = df_test.merge(loader.df_cell_expr, on='CELLNAME')
df_test.shape

(178, 945)

In [30]:
df_train.merge(df_val, how='inner')

Unnamed: 0,CCLE,CELLNAME,AUC,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10


In [31]:
df_train.merge(df_test, how='inner')

Unnamed: 0,CCLE,CELLNAME,AUC,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10


In [32]:
df_val.merge(df_test, how='inner')

Unnamed: 0,CCLE,CELLNAME,AUC,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10


In [16]:
all_data = loader.df_response
#all_data = pd.merge(all_data, loader.df_cell_expr, on='CELLNAME')
#all_data = all_data.merge(loader.df_drug_desc_ccle, on='CCLE')
all_data.head()

Unnamed: 0,CCLE,CELLNAME,AUC
0,CCLE.16,CCLE.FU97,0.7314
1,CCLE.5,CCLE.TEN,0.9186
2,CCLE.17,CCLE.MCF7,0.7775
3,CCLE.4,CCLE.FADU,0.8808
4,CCLE.3,CCLE.KCL22,0.5282


In [17]:
all_data.shape

(8851, 3)

In [21]:
val_data = all_data[~all_data['CELLNAME'].isin(df_test_cell)]
val_data.shape

(8851, 3)

In [22]:
val_data

Unnamed: 0,CCLE,CELLNAME,AUC
0,CCLE.16,CCLE.FU97,0.7314
1,CCLE.5,CCLE.TEN,0.9186
2,CCLE.17,CCLE.MCF7,0.7775
3,CCLE.4,CCLE.FADU,0.8808
4,CCLE.3,CCLE.KCL22,0.5282
...,...,...,...
8846,CCLE.2,CCLE.HEC251,0.8361
8847,CCLE.19,CCLE.NCIH1563,0.9414
8848,CCLE.2,CCLE.HUCCT1,0.8620
8849,CCLE.19,CCLE.COLO320,0.9527


In [20]:
(8851-7806)/8851

0.11806575528188905

In [23]:
loader.n_train

7806

In [24]:
loader.n_val

867

In [25]:
loader.n_test

178

In [26]:
7806+967+178

8951