In [1]:
"""Multilayer Perceptron for drug response problem"""

from __future__ import division, print_function

import argparse
import csv
import logging
import sys
import json

import numpy as np

from keras import backend as K
from keras import metrics
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization, Dense, Dropout, LocallyConnected1D, Conv1D, MaxPooling1D, Flatten, Conv2D, LocallyConnected2D
from keras.callbacks import Callback, ModelCheckpoint, ProgbarLogger

# For non-interactive plotting
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt


import p1b3 as benchmark
import candle

# Reset the argument vector for use inside Jupyter: presumably so that any
# command-line parsing done while resolving parameters does not see the
# kernel's own arguments.
sys.argv = ['']

# Optional TensorFlow 1.x GPU-memory configuration, kept for reference only.
# It is written as `#` comments rather than a bare triple-quoted string: a
# string literal that ends a cell is echoed back as that cell's output.
#
# cfg = K.tf.ConfigProto(gpu_options={'allow_growth': True})
# K.set_session(K.tf.Session(config=cfg))
#
# from keras.backend.tensorflow_backend import set_session
# import tensorflow as tf
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
# config.log_device_placement = True  # to log device placement (on which device the operation ran)
# sess = tf.Session(config=config)
# set_session(sess)  # set this TensorFlow session as the default session for Keras

Using TensorFlow backend.


Importing candle utils for keras


'\nfrom keras.backend.tensorflow_backend import set_session\nimport tensorflow as tf\nconfig = tf.ConfigProto()\nconfig.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU\nconfig.log_device_placement = True  # to log device placement (on which device the operation ran)\nsess = tf.Session(config=config)\nset_session(sess)  # set this TensorFlow session as the default session for Keras\n'

In [2]:
def initialize_parameters(default_model='p1b3_default_model.txt'):
    """Build the P1B3 benchmark object and resolve its run parameters.

    Args:
        default_model: Name of the default-model file passed to
            ``benchmark.BenchmarkP1B3``.

    Returns:
        The value returned by ``candle.finalize_parameters`` for the
        benchmark object (used as a parameter dictionary by the rest of
        this notebook).
    """
    bmk = benchmark.BenchmarkP1B3(
        benchmark.file_path,
        default_model,
        'keras',
        prog='p1b3_baseline',
        # NOTE(review): this description mentions clinical reports / Pilot 3,
        # which looks copied from another benchmark -- confirm and correct.
        desc='Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1',
    )
    return candle.finalize_parameters(bmk)

def str2lst(string_val):
    """Parse a whitespace-separated string of integers into a list of ints.

    Splitting on any run of whitespace (instead of on a single space) means
    leading/trailing blanks, repeated blanks and tabs no longer produce empty
    tokens that make ``int()`` fail.

    Args:
        string_val: Text such as ``'1000 500 100'``.

    Returns:
        List of ints in input order; an empty or blank string gives ``[]``.

    Raises:
        ValueError: If a token is not a valid integer literal.
    """
    return [int(token) for token in string_val.split()]


def evaluate_keras_metric(y_true, y_pred, metric):
    """Evaluate a Keras metric on concrete values and return the result.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.
        metric: Metric identifier resolved through ``keras.metrics.get``.

    Returns:
        The metric tensor evaluated with ``K.eval``.
    """
    metric_fn = metrics.get(metric)
    return K.eval(metric_fn(y_true, y_pred))


def evaluate_model(model, generator, steps, metric, category_cutoffs=[0.]):
    """Run the model over ``steps`` batches and score the predictions.

    Batches are collected in lists and concatenated once at the end, rather
    than re-concatenating the growing arrays on every step.

    Args:
        model: Object exposing ``predict_on_batch``.
        generator: Iterator yielding ``(x_batch, y_batch)`` pairs.
        steps: Number of batches to draw from ``generator``; must be positive.
        metric: Keras metric identifier used to compute ``loss``.
        category_cutoffs: Bin edges passed to ``np.digitize`` to turn the
            continuous targets and predictions into class labels.

    Returns:
        Tuple ``(loss, acc, y_true, y_pred, y_true_class, y_pred_class)``.

    Raises:
        ValueError: If ``steps`` is not positive, since there would be
            nothing to evaluate.
    """
    if steps <= 0:
        raise ValueError('evaluate_model needs at least one step, got {}'.format(steps))

    true_batches, pred_batches = [], []
    while len(true_batches) < steps:
        x_batch, y_batch = next(generator)
        pred_batches.append(model.predict_on_batch(x_batch).ravel())
        true_batches.append(y_batch)

    y_true = np.concatenate(true_batches)
    y_pred = np.concatenate(pred_batches)

    loss = evaluate_keras_metric(y_true.astype(np.float32), y_pred.astype(np.float32), metric)

    y_true_class = np.digitize(y_true, category_cutoffs)
    y_pred_class = np.digitize(y_pred, category_cutoffs)

    # theano does not like integer input
    acc = evaluate_keras_metric(y_true_class.astype(np.float32), y_pred_class.astype(np.float32), 'binary_accuracy')  # works for multiclass labels as well

    return loss, acc, y_true, y_pred, y_true_class, y_pred_class


def plot_error(y_true, y_pred, batch, file_ext, file_pre='output_dir', subsample=1000):
    """Save an error histogram and a measured-vs-predicted scatter plot.

    Nothing is plotted unless ``batch`` is a multiple of 10. Each plot is
    drawn on its own explicitly created figure, which is closed even when
    saving fails, so nothing is drawn on (or leaked into) whatever figure
    happens to be current.

    Args:
        y_true: NumPy array of measured values.
        y_pred: NumPy array of predicted values, aligned with ``y_true``.
        batch: Index used for the 10-step gate, the legend label and the
            output file names.
        file_ext: Text inserted into the output file names.
        file_pre: Prefix of the output file names.
        subsample: If truthy and smaller than ``len(y_true)``, plot a random
            sample of this many points (drawn without replacement).
    """
    if batch % 10:
        return

    total = len(y_true)
    if subsample and subsample < total:
        usecols = np.random.choice(total, size=subsample, replace=False)
        y_true = y_true[usecols]
        y_pred = y_pred[usecols]

    # Values are scaled by 100 to be shown as percentages.
    y_true = y_true * 100
    y_pred = y_pred * 100
    diffs = y_pred - y_true
    suffix = file_ext+'.b'+str(batch)+'.png'

    # Histogram of prediction errors
    bins = np.linspace(-200, 200, 100)
    fig, ax = plt.subplots()
    try:
        if batch == 0:
            # Baseline: errors obtained by pairing targets at random.
            y_shuf = np.random.permutation(y_true)
            ax.hist(y_shuf - y_true, bins, alpha=0.5, label='Random')
        ax.hist(diffs, bins, alpha=0.3, label='Epoch {}'.format(batch+1))
        ax.set_title("Histogram of errors in percentage growth")
        ax.legend(loc='upper right')
        fig.savefig(file_pre+'.histogram'+suffix)
    finally:
        plt.close(fig)

    # Plot measured vs. predicted values
    fig, ax = plt.subplots()
    try:
        ax.grid(True)
        ax.scatter(y_true, y_pred, color='red', s=10)
        ax.plot([y_true.min(), y_true.max()],
                [y_true.min(), y_true.max()], 'k--', lw=4)
        ax.set_xlabel('Measured')
        ax.set_ylabel('Predicted')
        fig.savefig(file_pre+'.diff'+suffix)
    finally:
        plt.close(fig)

In [3]:
class MyLossHistory(Callback):
    """Callback that evaluates validation/test generators after every epoch.

    After each epoch the model is scored on ``val_gen`` and ``test_gen`` with
    ``evaluate_model``; the extra values are handed to the progress logger,
    an error plot is written when the validation loss improves, and the best
    validation loss/accuracy seen so far are tracked.
    """

    def __init__(self, progbar, val_gen, test_gen, val_steps, test_steps, metric, category_cutoffs=[0.], ext='', pre='save'):
        """Store the generators, step counts and naming options.

        Args:
            progbar: Logger exposing ``append_extra_log_values``.
            val_gen: Generator of validation batches.
            test_gen: Generator of test batches.
            val_steps: Number of validation batches per evaluation.
            test_steps: Number of test batches per evaluation.
            metric: Keras metric identifier used as the loss.
            category_cutoffs: Bin edges used to derive class labels.
            ext: Text inserted into plot file names.
            pre: Prefix of plot file names.
        """
        super(MyLossHistory, self).__init__()
        self.progbar = progbar
        self.val_gen = val_gen
        self.test_gen = test_gen
        self.val_steps = val_steps
        self.test_steps = test_steps
        self.metric = metric
        self.category_cutoffs = category_cutoffs
        self.pre = pre
        self.ext = ext

    def on_train_begin(self, logs=None):
        """Reset the best-so-far trackers."""
        # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
        self.best_val_loss = np.inf
        self.best_val_acc = -np.inf

    def on_epoch_end(self, batch, logs=None):
        """Evaluate on validation/test data and update the best values.

        Args:
            batch: Index passed by Keras to ``on_epoch_end`` (used for plot
                naming and the plotting gate in ``plot_error``).
            logs: Metrics dictionary supplied by Keras, may be ``None``.
        """
        logs = logs or {}
        val_loss, val_acc, y_true, y_pred, y_true_class, y_pred_class = evaluate_model(self.model, self.val_gen, self.val_steps, self.metric, self.category_cutoffs)
        test_loss, test_acc, _, _, _, _ = evaluate_model(self.model, self.test_gen, self.test_steps, self.metric, self.category_cutoffs)
        self.progbar.append_extra_log_values([('val_acc', val_acc), ('test_loss', test_loss), ('test_acc', test_acc)])

        # Prefer the values Keras reports; when a key is absent, fall back to
        # the values just measured instead of a constant 0, which would pin
        # the best loss/accuracy to 0.
        epoch_val_loss = float(logs.get('val_loss', val_loss))
        epoch_val_acc = float(logs.get('val_acc', val_acc))

        if epoch_val_loss < self.best_val_loss:
            plot_error(y_true, y_pred, batch, self.ext, self.pre)
        self.best_val_loss = min(epoch_val_loss, self.best_val_loss)
        self.best_val_acc = max(epoch_val_acc, self.best_val_acc)

In [4]:
class MyProgbarLogger(ProgbarLogger):
    """Progress-bar logger that also reports values supplied by other callbacks.

    Extra ``(name, value)`` pairs registered through
    ``append_extra_log_values`` are shown on the progress bar at the end of
    the epoch and written to the benchmark logger.
    """

    def __init__(self, samples):
        """Create a sample-counting progress logger.

        Args:
            samples: Value stored into ``self.params['samples']`` when
                training begins.
        """
        super(MyProgbarLogger, self).__init__(count_mode='samples')
        self.samples = samples

    def on_train_begin(self, logs=None):
        """Force verbose output and initialise the extra-value list."""
        super(MyProgbarLogger, self).on_train_begin(logs)
        self.verbose = 1
        self.extra_log_values = []
        self.params['samples'] = self.samples

    def on_batch_begin(self, batch, logs=None):
        """Clear the accumulated values while the epoch is still in progress."""
        if not self.seen < self.target:
            return
        self.log_values = []
        self.extra_log_values = []

    def append_extra_log_values(self, tuples):
        """Register additional ``(name, value)`` pairs for the epoch summary."""
        self.extra_log_values.extend((name, value) for name, value in tuples)

    def on_epoch_end(self, epoch, logs=None):
        """Update the progress bar and log a one-line epoch summary."""
        logs = logs or {}
        parts = ['Epoch {}/{}'.format(epoch + 1, self.epochs)]

        # Metrics reported by Keras, in the order listed in self.params.
        for name in self.params['metrics']:
            if name not in logs:
                continue
            self.log_values.append((name, logs[name]))
            parts.append(' - {}: {:.4f}'.format(name, logs[name]))

        # Values contributed by other callbacks.
        for name, value in self.extra_log_values:
            self.log_values.append((name, value))
            parts.append(' - {}: {:.4f}'.format(name, float(value)))

        if self.verbose:
            self.progbar.update(self.seen, self.log_values)
        benchmark.logger.debug(''.join(parts))

In [5]:
def add_conv_layer(model, layer_params, input_dim=None, locally_connected=False):
    """Append one convolutional (or locally connected) layer to ``model``.

    Args:
        model: Model exposing ``add``.
        layer_params: ``[filters, filter_len, stride]`` for a 1D layer, or
            ``[filters, len_0, len_1, stride_0, stride_1]`` for a 2D layer.
            Any other length leaves the model unchanged.
        input_dim: If truthy, the layer is given ``input_shape=(input_dim, 1)``.
        locally_connected: Use the LocallyConnected variant instead of Conv.

    Returns:
        The same ``model`` object.
    """
    n_params = len(layer_params)
    if n_params == 3:  # 1D convolution
        layer_cls = LocallyConnected1D if locally_connected else Conv1D
        filters = layer_params[0]
        kernel = layer_params[1]
        stride = layer_params[2]
    elif n_params == 5:  # 2D convolution
        layer_cls = LocallyConnected2D if locally_connected else Conv2D
        filters = layer_params[0]
        kernel = (layer_params[1], layer_params[2])
        stride = (layer_params[3], layer_params[4])
    else:
        return model

    # NOTE(review): the 2D case receives the same 2-tuple input_shape as the
    # 1D case -- confirm this is what the 2D layers expect.
    shape_kwargs = {'input_shape': (input_dim, 1)} if input_dim else {}
    model.add(layer_cls(filters, kernel, strides=stride, **shape_kwargs))
    return model

In [6]:
# Resolve the run parameters using the default model file.
gParameters = initialize_parameters()
#gParameters['cell_features'] = 'all'
#gParameters['drug_features'] = 'all'
# Hand the parameters to the benchmark module's check (presumably validation).
benchmark.check_params(gParameters)

Params:
{'activation': 'relu',
 'batch_normalization': False,
 'batch_size': 100,
 'category_cutoffs': [0.0],
 'cell_features': ['expression'],
 'cell_noise_sigma': 0.0,
 'data_type': <class 'numpy.float32'>,
 'dense': [1000, 500, 100, 50],
 'dropout': 0.1,
 'drug_features': ['descriptors'],
 'epochs': 1,
 'experiment_id': 'EXP000',
 'feature_subsample': 0,
 'initialization': 'normal',
 'learning_rate': 0.001,
 'logfile': None,
 'loss': 'mse',
 'max_logconc': -4.0,
 'min_logconc': -5.0,
 'optimizer': 'sgd',
 'output_dir': '/global/u2/v/vineethg/NCI-DOE-Collab-Pilot1-Single-Drug-Response-Predictor/Pilot1/P1B3/save/EXP000/RUN000',
 'profiling': False,
 'rng_seed': 2017,
 'run_id': 'RUN000',
 'scaling': 'std',
 'scramble': False,
 'shuffle': False,
 'subsample': 'naive_balancing',
 'test_cell_split': 0.15,
 'timeout': -1,
 'train_bool': True,
 'val_split': 0.1,
 'verbose': None,
 'workers': 1}


In [8]:
# Normalise the layer specifications in gParameters (a dictionary of run
# parameters such as the number of epochs) before the model is built.

if 'dense' in gParameters:
    dval = gParameters['dense']
    if isinstance(dval, str):
        # A space-separated spec such as '1000 500'; list(dval) would split
        # it into single characters.
        gParameters['dense'] = str2lst(dval)
    elif type(dval) != list:
        gParameters['dense'] = list(dval)
    print(gParameters['dense'])

if 'conv' in gParameters:
    flat = gParameters['conv']
    # Group a flat list into triples. The check keeps the cell idempotent:
    # re-running it must not regroup a list that is already nested.
    if flat and not isinstance(flat[0], (list, tuple)):
        gParameters['conv'] = [flat[i:i+3] for i in range(0, len(flat), 3)]
    print('Conv input', gParameters['conv'])
# Extension derived from the parameters; appended to output file names.
ext = benchmark.extension_from_parameters(gParameters, '.keras')
# An explicit 'logfile' parameter wins; otherwise derive one from output_dir.
logfile = gParameters['logfile'] or gParameters['output_dir']+ext+'.log'

# File handler: timestamped, process-tagged records at DEBUG level.
file_formatter = logging.Formatter("[%(asctime)s %(process)d] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
fh = logging.FileHandler(logfile)
fh.setLevel(logging.DEBUG)
fh.setFormatter(file_formatter)

# Stream handler: bare messages; DEBUG only when 'verbose' is set.
console_level = logging.DEBUG if gParameters['verbose'] else logging.INFO
sh = logging.StreamHandler()
sh.setLevel(console_level)
sh.setFormatter(logging.Formatter(''))

benchmark.logger.setLevel(logging.DEBUG)
benchmark.logger.addHandler(fh)
benchmark.logger.addHandler(sh)
benchmark.logger.info('Params: {}'.format(gParameters))

# Default settings consumed by the initializer and optimizer builders.
kerasDefaults = candle.keras_default_config()
seed = gParameters['rng_seed']

# Parameters forwarded to the data loader under their own names.
loader_param_names = (
    'val_split', 'test_cell_split',
    'cell_features', 'drug_features',
    'feature_subsample', 'scaling', 'scramble',
    'min_logconc', 'max_logconc',
    'subsample', 'category_cutoffs',
)
loader = benchmark.DataLoader(
    seed=seed,
    dtype=gParameters['data_type'],
    **{name: gParameters[name] for name in loader_param_names}
)

# Weight initializer is seeded; biases start from the constant 0.
initializer_weights = candle.build_initializer(gParameters['initialization'], kerasDefaults, seed)
initializer_bias = candle.build_initializer('constant', kerasDefaults, 0.)

activation = gParameters['activation']

# Model architecture settings: generator output shape and output width.
gen_shape = None
out_dim = 1

model = Sequential()
if 'dense' in gParameters: # Build dense layers
    for units in gParameters['dense']:
        if not units:
            # A zero/empty entry means "no layer here".
            continue
        model.add(Dense(units, input_dim=loader.input_dim,
                        kernel_initializer=initializer_weights,
                        bias_initializer=initializer_bias))
        if gParameters['batch_normalization']:
            model.add(BatchNormalization())
        model.add(Activation(gParameters['activation']))
        if gParameters['dropout']:
            model.add(Dropout(gParameters['dropout']))
else: # Build convolutional layers
    gen_shape = 'add_1d'
    # Use the parameter's value: the mere presence of the key (possibly set
    # to False) must not switch locally connected layers on.
    lc_flag = bool(gParameters.get('locally_connected', False))
    # 'pool' is optional; a missing key means no pooling rather than KeyError.
    pool_size = gParameters.get('pool')

    for idx, conv_params in enumerate(gParameters['conv']):
        # Only the first layer is told the input dimension.
        add_conv_layer(model, conv_params,
                       input_dim=loader.input_dim if idx == 0 else None,
                       locally_connected=lc_flag)
        if gParameters['batch_normalization']:
            model.add(BatchNormalization())
        model.add(Activation(gParameters['activation']))
        if pool_size:
            model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Flatten())

# Output layer of width out_dim
model.add(Dense(out_dim))

# Define optimizer
optimizer = candle.build_optimizer(gParameters['optimizer'],
                                   gParameters['learning_rate'],
                                   kerasDefaults)

[1000, 500, 100, 50]


Params: {'dense': [1000, 500, 100, 50], 'batch_size': 100, 'epochs': 1, 'activation': 'relu', 'loss': 'mse', 'optimizer': 'sgd', 'learning_rate': 0.001, 'scaling': 'std', 'dropout': 0.1, 'feature_subsample': 0, 'val_split': 0.1, 'rng_seed': 2017, 'initialization': 'normal', 'min_logconc': -5.0, 'max_logconc': -4.0, 'category_cutoffs': [0.0], 'test_cell_split': 0.15, 'cell_features': ['expression'], 'drug_features': ['descriptors'], 'subsample': 'naive_balancing', 'batch_normalization': False, 'cell_noise_sigma': 0.0, 'output_dir': '/global/u2/v/vineethg/NCI-DOE-Collab-Pilot1-Single-Drug-Response-Predictor/Pilot1/P1B3/save/EXP000/RUN000', 'verbose': None, 'logfile': None, 'train_bool': True, 'experiment_id': 'EXP000', 'run_id': 'RUN000', 'shuffle': False, 'profiling': False, 'scramble': False, 'workers': 1, 'data_type': <class 'numpy.float32'>, 'timeout': -1}
Loaded 2774676 unique (D, CL) response sets.
  converters ={'NAME' : str})
  converters ={'NAME' : str})


AttributeError: 'DataLoader' object has no attribute 'df_drug_desc'

In [None]:
# Compile the model, print its summary and log its JSON description.
model.compile(loss=gParameters['loss'], optimizer=optimizer)
model.summary()
benchmark.logger.debug('Model: {}'.format(model.to_json()))

batch_size = gParameters['batch_size']

# Batch generators. Two validation generators are created: val_gen is given
# to Keras, val_gen2 to the MyLossHistory callback.
train_gen = benchmark.DataGenerator(
    loader, batch_size=batch_size, shape=gen_shape, name='train_gen',
    cell_noise_sigma=gParameters['cell_noise_sigma']).flow()
val_gen = benchmark.DataGenerator(
    loader, partition='val', batch_size=batch_size, shape=gen_shape, name='val_gen').flow()
val_gen2 = benchmark.DataGenerator(
    loader, partition='val', batch_size=batch_size, shape=gen_shape, name='val_gen2').flow()
test_gen = benchmark.DataGenerator(
    loader, partition='test', batch_size=batch_size, shape=gen_shape, name='test_gen').flow()

# Steps per pass: whole batches only, unless overridden in the parameters.
train_steps = gParameters.get('train_steps', int(loader.n_train/batch_size))
val_steps = gParameters.get('val_steps', int(loader.n_val/batch_size))
test_steps = gParameters.get('test_steps', int(loader.n_test/batch_size))

# Callbacks: best-model checkpoint, progress logger, per-epoch evaluation.
checkpointer = ModelCheckpoint(filepath=gParameters['output_dir']+'.model'+ext+'.h5', save_best_only=True)
progbar = MyProgbarLogger(train_steps * batch_size)
loss_history = MyLossHistory(
    progbar=progbar,
    val_gen=val_gen2,
    test_gen=test_gen,
    val_steps=val_steps,
    test_steps=test_steps,
    metric=gParameters['loss'],
    category_cutoffs=gParameters['category_cutoffs'],
    ext=ext,
    pre=gParameters['output_dir'])

In [None]:
# Seed random generator for training
np.random.seed(seed)

try:
    candleRemoteMonitor = candle.CandleRemoteMonitor(params=gParameters)

    history = model.fit_generator(train_gen, train_steps,
                        epochs=gParameters['epochs'],
                        validation_data=val_gen,
                        validation_steps=val_steps,
                        verbose=0,
                        callbacks=[checkpointer, loss_history, progbar, candleRemoteMonitor],
                        )
finally:
    # Always detach the handlers, even if training fails: otherwise a re-run
    # of the setup cell stacks duplicate handlers on the logger. Closing the
    # file handler releases the log file.
    benchmark.logger.removeHandler(fh)
    benchmark.logger.removeHandler(sh)
    fh.close()

In [19]:
# Show the first feature vector of the next training batch.
# Note: this draws (and therefore consumes) one batch from train_gen.
next(train_gen)[0][0]

array([-4.        ,  0.52599496, -0.29818445, ..., -0.03238334,
       -0.8911875 , -0.3270878 ], dtype=float32)

### Dose Response

In [9]:
# benchmark.stage_data() returns nine file paths, unpacked here by position.
(cell_expr_path, cell_mrna_path, cell_prot_path, cell_kino_path,
 drug_desc_path, drug_auen_path, dose_resp_path,
 test_cell_path, test_drug_path) = benchmark.stage_data()


In [10]:
# Load the dose-response tables for both sources (NCI and GDSC) with the same
# seed, dtype, concentration window and subsampling as the data loader.
dose_response_nci, dose_response_gdsc = benchmark.load_dose_response(
    dose_resp_path,
    seed,
    gParameters['data_type'],
    min_logconc=gParameters['min_logconc'],
    max_logconc=gParameters['max_logconc'],
    subsample=gParameters['subsample'],
)

In [11]:
# Inspect the NCI dose-response table (indexed by NSC drug number).
dose_response_nci

Unnamed: 0_level_0,CELLNAME,GROWTH,LOG_CONCENTRATION
NSC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,BR:BT_549,-82.0,-4.0
1,BR:HS578T,-15.0,-4.0
1,BR:MCF7,-50.0,-4.0
1,BR:MDA_MB_231,-77.0,-4.0
1,BR:T47D,-40.0,-4.0
...,...,...,...
622976,ME:MALME_3M,101.0,-5.0
681024,OV:OVCAR_4,102.0,-4.0
688341,RE:786_0,107.0,-4.0
686352,LC:NCI_H23,103.0,-5.0


In [12]:
# Export the NCI dose-response table as TSV, keeping the index.
# NOTE(review): the destination is an absolute, user-specific path; consider a
# configurable output directory so the notebook runs elsewhere.
dose_response_nci.to_csv("/global/homes/v/vineethg/xgboost-single-drug-reponse-prediction/data/nci_dose_response.tsv", sep="\t", index=True)

In [11]:
# Inspect the GDSC dose-response table (indexed by GDSC drug number).
dose_response_gdsc

Unnamed: 0_level_0,CELLNAME,GROWTH,LOG_CONCENTRATION
GDSC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,ACH-000002,0.701901,4.834112
1,ACH-000002,0.705002,3.350517
1,ACH-000004,-1.656763,1.199252
1,ACH-000004,-1.655299,0.831239
1,ACH-000006,-0.318435,3.261706
...,...,...,...
1530,ACH-002314,-1.878165,4.851377
1530,ACH-002314,-1.878975,3.362654
1530,ACH-002315,0.976627,7.614813
1530,ACH-002317,-1.007117,5.694550


In [12]:
#merged_dose_response = pd.concat([dose_response_gdsc, dose_response_nci], join="outer", axis=0,ignore_index=True) 

In [13]:
#merged_dose_response

In [53]:
# Export the GDSC dose-response table as TSV, keeping the index.
# NOTE(review): the destination is a home-relative, user-specific path.
dose_response_gdsc.to_csv('~/NCI-UDel-collab/candle/P1B3/dose_response_gdsc.tsv', sep='\t', index=True)

### Drug descriptors

In [12]:
# Load the drug descriptor tables for both sources.
# NOTE(review): the second table is named *_ccle but is keyed by a GDSC
# column -- confirm the intended name.
drug_desc_nci, drug_desc_ccle = benchmark.load_drug_descriptors(
    drug_desc_path,
    gParameters['data_type'],
    gParameters['feature_subsample'],
    gParameters['scaling'],
)

  converters ={'NAME' : str})
  converters ={'NAME' : str})


In [13]:
# Inspect the NCI drug descriptor table (one row per NSC drug number).
drug_desc_nci

Unnamed: 0,NSC,MW,AMW,Sv,Se,Sp,Si,Mv,Me,Mp,...,DLS_01,DLS_02,DLS_03,DLS_04,DLS_05,DLS_06,DLS_07,DLS_cons,LLS_01,LLS_02
0,1,-1.520317,-0.350065,-1.487739,-1.437979,-1.484601,-1.445729,0.049422,0.244448,-0.335109,...,0.531158,-1.141683,-0.808080,-0.452888,-0.741761,0.461134,0.229948,-0.480889,1.911684,0.612799
1,100044,-0.553066,-0.191502,-0.607726,-0.434899,-0.714828,-0.413067,-1.371013,1.274786,-1.288745,...,0.531158,-0.207431,0.534837,1.151560,0.962950,0.461134,0.229948,0.864627,0.531165,0.612799
2,100046,-0.049945,-0.899056,0.099415,0.366422,0.028789,0.389588,-2.584842,0.115658,-1.370152,...,0.531158,0.785212,0.534837,2.756007,2.667661,0.461134,0.229948,1.941040,-0.180011,0.612799
3,100055,0.021073,-0.599677,0.273528,0.233768,0.231915,0.232346,-0.208838,-0.528301,-0.207183,...,0.531158,0.785212,0.534837,1.151560,-0.741761,0.461134,0.229948,0.416122,0.531165,-0.327088
4,100058,-0.552937,-0.435713,-0.392663,-0.432241,-0.423073,-0.442911,0.333511,-0.367311,-0.044367,...,0.531158,0.785212,0.534837,1.151560,0.962950,0.461134,0.229948,1.044029,1.200507,0.612799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30237,677208,-0.642902,0.080100,-0.703182,-0.655425,-0.703905,-0.636816,-0.260490,0.341043,-0.183923,...,0.531158,-0.207431,0.534837,-0.452888,-0.741761,0.461134,0.229948,-0.032383,0.531165,0.612799
30238,67721,-0.707733,-0.408707,-0.583575,-0.593327,-0.614701,-0.603362,0.178553,-0.174126,-0.160664,...,0.531158,0.785212,0.534837,-0.452888,-0.741761,0.461134,0.229948,0.147019,0.531165,0.612799
30239,677228,-0.179093,-0.322674,-0.069352,-0.043863,-0.160637,-0.094974,-0.079708,0.437638,-0.451406,...,0.531158,0.785212,0.534837,1.151560,0.962950,0.461134,0.229948,1.044029,-1.560530,-0.327088
30240,677229,-0.307853,-0.435713,-0.129394,-0.177726,-0.171560,-0.178890,0.230206,-0.367311,-0.079256,...,0.531158,-0.207431,0.534837,-0.452888,-0.741761,0.461134,0.229948,-0.032383,-0.180011,-0.327088


In [14]:
# Inspect the second drug descriptor table (one row per GDSC drug number).
drug_desc_ccle

Unnamed: 0,GDSC,MW,AMW,Sv,Se,Sp,Si,Mv,Me,Mp,...,Psychotic-80,Psychotic-50,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-80,Infective-50
0,1004,2.309956,-0.665898,2.629346,2.662512e+00,2.660304,2.651558,-1.171431,-5.870051e-01,-0.952357,...,-5.693751e-01,-1.686762e-01,-9.186344e-01,-4.711948e-01,-3.383549e-01,-1.267399e-01,-9.708902e-01,-3.938958e-01,-8.898526e-01,-3.310591e-01
1,1005,-1.051031,12.600883,-2.428018,-3.875274e-08,-2.181030,-2.153318,-0.177527,2.425257e-08,7.761987,...,6.542770e-09,-6.259036e-10,-1.833678e-08,7.906052e-09,1.101001e-08,-4.702912e-10,1.499646e-08,-1.238080e-08,-3.301960e-09,-7.469005e-09
2,1006,-1.409966,-0.070048,-1.455942,-1.222097e+00,-1.506773,-1.200579,-1.066809,1.643772e+00,-1.809105,...,-5.693751e-01,-1.686762e-01,-9.186344e-01,-4.711948e-01,-3.383549e-01,-1.267399e-01,-9.708902e-01,-3.938958e-01,-8.898526e-01,-3.310591e-01
3,1007,2.289582,-0.490082,2.394444,2.456539e+00,2.328405,2.380324,-1.040654,8.741660e-02,-1.099228,...,-5.693751e-01,-1.686762e-01,-9.186344e-01,-4.711948e-01,-3.383549e-01,-1.267399e-01,-9.708902e-01,-3.938958e-01,-8.898526e-01,-3.310591e-01
4,1008,-0.026042,0.008993,-0.119795,-7.838865e-02,-0.214017,-0.081258,-0.151372,8.655941e-01,-0.707572,...,-5.693751e-01,-1.686762e-01,1.092858e+00,-4.711948e-01,-3.383549e-01,-1.267399e-01,1.034038e+00,-3.938958e-01,1.128206e+00,-3.310591e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,88,-0.537357,-0.204317,-0.430376,-4.484333e-01,-0.447510,-0.436831,0.136337,-1.719759e-01,0.002304,...,-5.693751e-01,-1.686762e-01,1.092858e+00,2.130620e+00,-3.383549e-01,-1.267399e-01,1.034038e+00,2.548738e+00,1.128206e+00,-3.310591e-01
251,89,-1.376556,-0.866542,-1.133603,-9.298804e-01,-1.073595,-0.878007,-1.563763,-9.501535e-01,-1.123707,...,-5.693751e-01,-1.686762e-01,1.092858e+00,-4.711948e-01,2.967112e+00,7.921241e+00,1.034038e+00,-3.938958e-01,1.128206e+00,3.032501e+00
252,91,0.072748,0.486787,-0.135834,-2.807413e-01,-0.138298,-0.327752,1.077931,5.024459e-01,1.030400,...,-5.693751e-01,-1.686762e-01,1.092858e+00,-4.711948e-01,-3.383549e-01,-1.267399e-01,1.034038e+00,-3.938958e-01,-8.898526e-01,-3.310591e-01
253,94,-0.615709,-0.556963,-0.403982,-3.380264e-01,-0.368844,-0.293724,-0.700634,-7.945167e-01,-0.462788,...,1.763226e+00,-1.686762e-01,1.092858e+00,2.130620e+00,-3.383549e-01,-1.267399e-01,1.034038e+00,-3.938958e-01,1.128206e+00,-3.310591e-01


In [42]:
#drug_desc_ccle.to_csv('~/NCI-UDel-collab/candle/P1B3/drug_desc_gdsc.tsv', sep='\t', index=False)

In [19]:
# Number of column names the two descriptor tables have in common.
len(set(drug_desc_ccle.columns).intersection(set(drug_desc_nci.columns)))

3809

In [20]:
# Number of distinct column names in drug_desc_ccle.
len(set(drug_desc_ccle.columns))

3840

In [21]:
len(set(drug_desc_nci.columns))
# The two descriptor tables do not have identical columns: 3809 names are
# shared, out of 3840 in drug_desc_ccle and 3810 in drug_desc_nci.
# TODO: restrict both tables to the shared columns so that combining them does
# not introduce NaN.

3810

### Merge dose response with drug descriptors before merging with cell expression data
Each source is joined to its own descriptor table first, because this merge is keyed on the drug identifier (a number here) and the GDSC and NSC identifiers overlap.

In [15]:
# Attach the descriptors to every GDSC dose-response row; only drugs present
# in both tables are kept (inner join).
merged_ccle = dose_response_gdsc.merge(drug_desc_ccle, how='inner', on='GDSC')

In [18]:
# Attach the descriptors to every NCI dose-response row; only drugs present
# in both tables are kept (inner join).
merged_nci = dose_response_nci.merge(drug_desc_nci, how='inner', on='NSC')

In [26]:
# Inspect the GDSC dose-response rows with their drug descriptors attached.
merged_ccle

Unnamed: 0,GDSC,CELLNAME,GROWTH,LOG_CONCENTRATION,MW,AMW,Sv,Se,Sp,Si,...,Psychotic-80,Psychotic-50,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-80,Infective-50
0,1,ACH-000002,0.701901,4.834112,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,1.763226,5.951860,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059
1,1,ACH-000002,0.705002,3.350517,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,1.763226,5.951860,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059
2,1,ACH-000004,-1.656763,1.199252,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,1.763226,5.951860,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059
3,1,ACH-000004,-1.655299,0.831239,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,1.763226,5.951860,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059
4,1,ACH-000006,-0.318435,3.261706,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,1.763226,5.951860,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446109,1530,ACH-002314,-1.878165,4.851377,-0.897931,-0.390774,-0.721260,-0.694089,-0.702437,-0.662418,...,1.763226,-0.168676,1.092858,2.130620,2.967112,-0.12674,1.034038,2.548738,1.128206,-0.331059
446110,1530,ACH-002314,-1.878975,3.362654,-0.897931,-0.390774,-0.721260,-0.694089,-0.702437,-0.662418,...,1.763226,-0.168676,1.092858,2.130620,2.967112,-0.12674,1.034038,2.548738,1.128206,-0.331059
446111,1530,ACH-002315,0.976627,7.614813,-0.897931,-0.390774,-0.721260,-0.694089,-0.702437,-0.662418,...,1.763226,-0.168676,1.092858,2.130620,2.967112,-0.12674,1.034038,2.548738,1.128206,-0.331059
446112,1530,ACH-002317,-1.007117,5.694550,-0.897931,-0.390774,-0.721260,-0.694089,-0.702437,-0.662418,...,1.763226,-0.168676,1.092858,2.130620,2.967112,-0.12674,1.034038,2.548738,1.128206,-0.331059


In [25]:
# Inspect the NCI dose-response rows with their drug descriptors attached.
merged_nci

Unnamed: 0,NSC,CELLNAME,GROWTH,LOG_CONCENTRATION,MW,AMW,Sv,Se,Sp,Si,...,DLS_01,DLS_02,DLS_03,DLS_04,DLS_05,DLS_06,DLS_07,DLS_cons,LLS_01,LLS_02
0,1,BR:BT_549,-82.0,-4.0,-1.520317,-0.350065,-1.487739,-1.437979,-1.484601,-1.445729,...,0.531158,-1.141683,-0.808080,-0.452888,-0.741761,0.461134,0.229948,-0.480889,1.911684,0.612799
1,1,BR:HS578T,-15.0,-4.0,-1.520317,-0.350065,-1.487739,-1.437979,-1.484601,-1.445729,...,0.531158,-1.141683,-0.808080,-0.452888,-0.741761,0.461134,0.229948,-0.480889,1.911684,0.612799
2,1,BR:MCF7,-50.0,-4.0,-1.520317,-0.350065,-1.487739,-1.437979,-1.484601,-1.445729,...,0.531158,-1.141683,-0.808080,-0.452888,-0.741761,0.461134,0.229948,-0.480889,1.911684,0.612799
3,1,BR:MDA_MB_231,-77.0,-4.0,-1.520317,-0.350065,-1.487739,-1.437979,-1.484601,-1.445729,...,0.531158,-1.141683,-0.808080,-0.452888,-0.741761,0.461134,0.229948,-0.480889,1.911684,0.612799
4,1,BR:T47D,-40.0,-4.0,-1.520317,-0.350065,-1.487739,-1.437979,-1.484601,-1.445729,...,0.531158,-1.141683,-0.808080,-0.452888,-0.741761,0.461134,0.229948,-0.480889,1.911684,0.612799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1151552,661212,LE:MOLT_4,95.0,-5.0,0.053360,-0.532934,0.394089,0.199215,0.379374,0.175546,...,-1.008335,-0.207431,0.534837,-0.452888,-0.741761,0.461134,0.229948,-0.391188,0.531165,-0.327088
1151553,661803,BR:MDA_MB_231,92.0,-5.0,-0.823669,0.440049,-1.044057,-0.844298,-1.095882,-0.843958,...,0.531158,0.785212,0.534837,-0.452888,-0.741761,0.461134,0.229948,0.147019,0.531165,0.612799
1151554,169547,RE:SN12C,93.0,-5.0,-1.004372,-0.281393,-1.007402,-0.903175,-1.053436,-0.871074,...,0.531158,-0.207431,-0.808080,-0.452888,-0.741761,0.461134,0.229948,-0.301486,-0.180011,0.612799
1151555,169547,LE:K_562,91.0,-5.0,-1.004372,-0.281393,-1.007402,-0.903175,-1.053436,-0.871074,...,0.531158,-0.207431,-0.808080,-0.452888,-0.741761,0.461134,0.229948,-0.301486,-0.180011,0.612799


### Merge dose + desc. datasets of GDSC and NSC

In [26]:
import pandas as pd

In [27]:
# Stack the two merged tables row-wise with a fresh index. The outer join
# keeps the union of columns, so a column that exists in only one source
# (including the other source's drug-ID column) is NaN for the other's rows.
merged = pd.concat([merged_ccle, merged_nci], join="outer", axis=0,ignore_index=True) 

In [26]:
# Inspect the combined table (GDSC rows first, then NCI rows).
merged

Unnamed: 0,GDSC,CELLNAME,GROWTH,LOG_CONCENTRATION,MW,AMW,Sv,Se,Sp,Si,...,Psychotic-50,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-80,Infective-50,NSC
0,1,ACH-000002,0.701901,4.834112,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,5.95186,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059,
1,1,ACH-000002,0.705002,3.350517,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,5.95186,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059,
2,1,ACH-000004,-1.656763,1.199252,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,5.95186,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059,
3,1,ACH-000004,-1.655299,0.831239,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,5.95186,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059,
4,1,ACH-000006,-0.318435,3.261706,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,5.95186,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1597666,,LE:MOLT_4,95.000000,-5.000000,0.053360,-0.532934,0.394089,0.199215,0.379374,0.175546,...,,,,,,,,,,661212
1597667,,BR:MDA_MB_231,92.000000,-5.000000,-0.823669,0.440049,-1.044057,-0.844298,-1.095882,-0.843958,...,,,,,,,,,,661803
1597668,,RE:SN12C,93.000000,-5.000000,-1.004372,-0.281393,-1.007402,-0.903175,-1.053436,-0.871074,...,,,,,,,,,,169547
1597669,,LE:K_562,91.000000,-5.000000,-1.004372,-0.281393,-1.007402,-0.903175,-1.053436,-0.871074,...,,,,,,,,,,169547


In [None]:
# Export the merged GDSC table as TSV without the index.
# NOTE(review): the destination is a home-relative, user-specific path.
merged_ccle.to_csv('~/NCI-UDel-collab/candle/P1B3/merged_ccle.tsv', sep='\t', index=False)

In [9]:
# Load the cell-line expression tables for both sources.
cell_exp_nci, cell_exp_ccle = benchmark.load_cellline_expressions(
    cell_expr_path,
    gParameters['data_type'],
    gParameters['feature_subsample'],
    gParameters['scaling'],
)

In [11]:
# Write the NCI expression frame to a tab-separated file, omitting the row index.
cell_exp_nci.to_csv(
    '~/NCI-UDel-collab/candle/P1B3/cell_exp_nci.tsv',
    sep='\t',
    index=False,
)

In [17]:
# Display the CCLE expression frame (pandas renders a truncated preview).
cell_exp_ccle

Unnamed: 0,CELLNAME,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,H3C2,H3C3,AC098582.1,DUS4L-BCAP29,C8orf44-SGK3,ELOA3B,NPBWR1,ELOA3D,ELOA3,CDR1
0,ACH-000242,2.100918,-0.198153,0.076835,0.057863,0.078078,0.271306,2.535806,0.226084,2.074861,...,-0.056896,1.989640,-1.032600,-0.289988,-1.036605,-0.405567,0.256359,-0.376551,-0.374661,-0.317766
1,ACH-000327,0.030025,-0.198153,-0.920456,-0.836327,-1.613391,-0.357171,0.564983,0.474014,-1.014511,...,-0.093079,0.199429,-0.469809,-1.144682,-1.036605,1.517164,-0.193403,-0.376551,0.507425,1.301832
2,ACH-000233,-1.973512,-0.198153,-0.648568,2.702364,-0.116275,-0.345788,2.002142,-1.198447,-1.209990,...,-0.377710,0.567999,0.514339,1.536996,-0.278274,-0.405567,-0.468432,0.441488,-0.374661,-0.317766
3,ACH-000528,0.747213,-0.198153,0.995368,0.734125,1.221806,-0.357171,-0.522803,0.865979,-0.712993,...,0.179125,-0.274115,-0.372559,0.708923,-1.036605,0.036254,-0.401915,-0.376551,-0.074885,-0.317766
4,ACH-001655,0.185397,-0.198153,0.419131,-2.615911,-2.804861,-0.345788,-0.885939,0.102865,-2.634536,...,0.309593,0.336539,-1.226033,-1.195850,0.041182,-0.183563,-0.468432,-0.376551,-0.224031,3.714823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,ACH-000114,-0.015574,-0.057931,0.057729,0.630234,-1.681816,0.084308,1.369570,0.344744,0.637771,...,-1.431989,-0.763272,-0.884313,-0.478100,0.155734,-0.405567,0.765017,-0.376551,-0.374661,-0.220399
690,ACH-001578,1.866245,16.492250,1.177347,0.191931,0.797061,-0.334515,-0.906143,0.346952,-1.532657,...,-0.965580,-1.020572,-0.246460,-0.201856,-0.361552,-0.405567,-0.423870,-0.376551,-0.374661,1.049837
691,ACH-000973,0.634967,-0.198153,0.928266,-0.929666,-0.402534,-0.368668,0.942747,0.710818,-0.078142,...,0.279673,2.147732,0.436603,0.432026,-0.532544,0.253926,-0.446040,-0.376551,0.072806,0.252409
692,ACH-000750,0.149617,-0.198153,-0.003684,-1.047861,-0.716619,-0.357171,-0.847223,-0.082451,-0.222419,...,0.256926,0.397189,-1.186591,1.319960,-0.237167,-0.405567,-0.274504,-0.376551,-0.374661,1.049837


In [43]:
#cell_exp_ccle.to_csv('~/NCI-UDel-collab/candle/P1B3/cell_exp_ccle.tsv', sep='\t', index=False)

In [30]:
# Columns to select from the expression frame: the CELLNAME join key followed
# by the nine genes of interest.
search_genes = [
    'CELLNAME',
    'CYP2D6', 'HNF4A', 'NR0B2',
    'NR1I3', 'NR1I2', 'TERT',
    'TP53', 'CTNNB1', 'ARID1A',
]

In [31]:
# Keep only the columns named in search_genes (CELLNAME plus the nine genes)
# from the CCLE expression frame; a missing label raises KeyError.
nine_genes_exp = cell_exp_ccle[search_genes]
# Bare expression: display the selection as the cell output.
nine_genes_exp

Unnamed: 0,CELLNAME,CYP2D6,HNF4A,NR0B2,NR1I3,NR1I2,TERT,TP53,CTNNB1,ARID1A
0,ACH-000242,0.084460,-0.339696,-0.292394,-0.076083,-0.281702,1.039246,0.182461,0.600038,-0.420221
1,ACH-000327,-1.254940,-0.356454,-0.319693,1.631255,-0.281702,-0.873671,-1.622676,-1.925177,-1.787363
2,ACH-000233,-0.281768,-0.399746,-0.213599,1.946252,-0.319826,-0.428671,0.126360,-0.213359,0.298233
3,ACH-000528,-0.899328,-0.399746,-0.292394,-0.443109,-0.300669,-0.263979,1.043562,-0.540049,0.345198
4,ACH-001655,-0.938608,-0.426741,-0.292394,-1.270218,-0.319826,-0.444463,0.566481,-2.075458,-2.946114
...,...,...,...,...,...,...,...,...,...,...
689,ACH-000114,0.602366,0.321948,-0.319693,0.072166,0.874908,-0.881538,0.215055,-0.359713,-0.048524
690,ACH-001578,-0.406301,-0.315122,-0.252434,0.330533,-0.319826,0.446126,-1.506524,1.192738,1.418009
691,ACH-000973,0.058135,-0.417654,-0.319693,-0.443109,-0.319826,0.289434,0.960171,1.121598,0.045837
692,ACH-000750,-0.958484,-0.373526,-0.319693,-0.695836,-0.319826,0.985835,-0.248514,0.127027,-0.223498


In [32]:
# Join the nine-gene expression columns with merged_ccle on CELLNAME.
# This is an inner join (the pandas default), so cell lines present in only
# one of the two frames are dropped.
nine_genes_combined_dataset = nine_genes_exp.merge(
    merged_ccle,
    how='inner',
    on='CELLNAME',
)

In [48]:
# Display the GROWTH column of merged_ccle.
merged_ccle.loc[:, 'GROWTH']

0         0.701901
1         0.705002
2        -1.656763
3        -1.655299
4        -0.318435
            ...   
446109   -1.878165
446110   -1.878975
446111    0.976627
446112   -1.007117
446113   -1.005705
Name: GROWTH, Length: 446114, dtype: float32

In [52]:
# Display only the key, dose and response columns of the combined frame.
nine_genes_combined_dataset.loc[
    :, ['CELLNAME', 'LOG_CONCENTRATION', 'GROWTH', 'GDSC']
]

Unnamed: 0,CELLNAME,LOG_CONCENTRATION,GROWTH,GDSC
0,ACH-000242,-0.026016,1.084975,133
1,ACH-000242,-0.018060,1.089918,133
2,ACH-000242,5.897957,1.302678,134
3,ACH-000242,4.088079,1.306007,134
4,ACH-000242,2.780388,1.605856,135
...,...,...,...,...
320179,ACH-000052,-2.699995,-1.317056,1529
320180,ACH-000052,-2.037279,-1.471810,1529
320181,ACH-000052,-1.872448,-1.345048,1529
320182,ACH-000052,6.165053,-0.521059,1530


In [35]:
# Write the combined nine-gene frame to a tab-separated file, omitting the row
# index.
# NOTE(review): the destination is an absolute path under one user's home
# directory; consider a configurable data directory.
nine_genes_combined_dataset.to_csv(
    '/global/homes/v/vineethg/NCI-DOE-Collab-Pilot1-Single-Drug-Response-Predictor/Data/Pilot1/ccle_nine_genes_exp_dose_desc.tsv',
    sep='\t',
    index=False,
)

In [30]:
# Number of distinct column labels in the NCI expression frame.
len({*cell_exp_nci.columns})

25723

In [31]:
# Number of distinct column labels in the CCLE expression frame.
len({*cell_exp_ccle.columns})

19222

In [32]:
# Number of column labels present in both expression frames. Iterating a
# DataFrame yields its column labels, so `.columns` is spelled out here only
# for readability.
len(set(cell_exp_nci.columns) & set(cell_exp_ccle.columns))

16847

In [33]:
# Number of column labels present in at least one of the two expression
# frames, i.e. the width an outer concat of the two would have.
len(set(cell_exp_nci.columns) | set(cell_exp_ccle.columns))

28098

In [34]:
#merged_cell_exp = pd.concat([cell_exp_ccle, cell_exp_nci], join="outer", axis=0,ignore_index=True)

In [35]:
#merged_cell_exp

Unnamed: 0,CELLNAME,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,ZNRF2P3,ZRANB2-AS1,ZRF1PS,ZRSR1,ZSCAN12P1,ZSCAN5D,ZSWIM5P1,ZSWIM5P3,ZUFSP,ZYG11AP1
0,ACH-000242,2.100918,-0.198153,0.076835,0.057863,0.078078,0.271306,2.535806,0.226084,2.074861,...,,,,,,,,,,
1,ACH-000327,0.030025,-0.198153,-0.920456,-0.836327,-1.613391,-0.357171,0.564983,0.474014,-1.014511,...,,,,,,,,,,
2,ACH-000233,-1.973512,-0.198153,-0.648568,2.702364,-0.116275,-0.345788,2.002142,-1.198447,-1.209990,...,,,,,,,,,,
3,ACH-000528,0.747213,-0.198153,0.995368,0.734125,1.221806,-0.357171,-0.522803,0.865979,-0.712993,...,,,,,,,,,,
4,ACH-001655,0.185397,-0.198153,0.419131,-2.615911,-2.804861,-0.345788,-0.885939,0.102865,-2.634536,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,RE:SN12C,0.113298,-0.197840,0.225205,-0.221576,-0.353150,-0.008979,-0.269384,-0.047904,-0.173807,...,0.496325,-0.112011,-3.192253e-01,1.329817,-3.983890e-01,4.329318e-01,-3.719890e-01,-0.054847,-8.058684e-01,-0.254365
752,RE:TK_10,0.050984,-0.144938,0.210843,-0.171937,-0.027016,-0.143931,-0.291041,0.412112,-0.170658,...,0.227233,-0.095134,6.516904e-01,-0.316365,1.023923e+00,1.790955e-01,1.927172e-01,-0.189251,-1.750942e-01,-0.200934
753,RE:UO_31,0.135958,-0.283805,0.103126,-0.254669,-0.306971,-0.177670,-0.235913,0.082782,-0.200566,...,-0.602467,-0.145767,1.000337e-01,-0.987032,-6.531315e-01,-1.875567e-01,-3.585436e-01,-0.148523,-1.533434e-01,-0.265051
754,Number good probes,2.577535,7.585340,5.575154,7.377408,7.511572,2.487647,7.572569,5.832977,7.723248,...,-6.275826,7.796860,-5.990256e+00,1.431433,5.237789e+00,6.271164e+00,-4.997202e+00,7.764994,3.392043e+00,7.530478


In [37]:
# Display the combined NCI + CCLE expression frame.
# NOTE(review): the only assignment to cell_exp_nci_ccle visible nearby is
# commented out; confirm which cell defines it before a Restart & Run All.
cell_exp_nci_ccle

Unnamed: 0,CELLNAME,5-HT3C2,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A4GALT,A4GNT,...,SPDYE11,AP001453.3,H3C2,H3C3,AC098582.1,DUS4L-BCAP29,C8orf44-SGK3,ELOA3B,ELOA3D,ELOA3
0,BR:MCF7,-0.145120,0.977077,-0.132770,-0.198819,-0.167883,-0.117512,0.412067,1.278977,0.309817,...,,,,,,,,,,
1,BR:MDA_MB_231,1.197110,-0.074454,-0.132770,-0.198819,-0.265530,-0.126441,0.551906,0.343324,0.190204,...,,,,,,,,,,
2,BR:HS578T,0.541974,0.574363,-0.074628,-0.167592,-0.119059,-0.128673,1.006384,-0.349419,0.768332,...,,,,,,,,,,
3,BR:BT_549,-0.544593,1.167247,-0.135830,-0.200306,-0.302149,-0.126441,0.167349,0.190381,0.568978,...,,,,,,,,,,
4,BR:T47D,0.302290,1.021823,-0.126650,-0.197332,-0.302149,-0.135370,-0.287129,-0.007546,-0.188570,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,ACH-000114,,,-0.262176,-0.359265,,0.421968,,0.701213,0.760532,...,0.607875,1.634615,-1.431989,-0.763272,-0.884313,-0.478100,0.155734,-0.405567,-0.376551,-0.374661
752,ACH-001578,,,-0.314786,0.301365,,2.766113,,0.464002,-0.368467,...,-0.277428,-1.159061,-0.965580,-1.020572,-0.246460,-0.201856,-0.361552,-0.405567,-0.376551,-0.374661
753,ACH-000973,,,-0.314786,-0.441951,,-0.361814,,-0.698049,0.050499,...,-0.018373,-0.168789,0.279673,2.147732,0.436603,0.432026,-0.532544,0.253926,-0.376551,0.072806
754,ACH-000750,,,-0.279540,-0.328259,,-0.361814,,-1.296773,-0.245842,...,-0.147266,1.075962,0.256926,0.397189,-1.186591,1.319960,-0.237167,-0.405567,-0.376551,-0.374661


### Merge drug data with expression data separately for NCI and CCLE to avoid memory issues

In [38]:
# Attach the CCLE expression columns to merged_ccle by CELLNAME (inner join,
# the pandas default). Needs cell_exp_ccle from the load_cellline_expressions
# cell; running this first on a fresh kernel raises NameError.
# The cell rebinds merged_ccle, so running it twice merges the expression
# columns a second time.
merged_ccle = merged_ccle.merge(
    cell_exp_ccle,
    how='inner',
    on='CELLNAME',
)

NameError: name 'cell_exp_ccle' is not defined

In [None]:
# Attach the NCI expression columns to merged_nci by CELLNAME (inner join, the
# pandas default). The cell rebinds merged_nci, so running it twice merges the
# expression columns a second time.
merged_nci = merged_nci.merge(
    cell_exp_nci,
    how='inner',
    on='CELLNAME',
)

In [None]:
# Display merged_ccle to inspect the result of the expression merge.
merged_ccle

### Final merge between the merged dose-response/drug-descriptor set and the merged expression set

In [39]:
# Join the combined dose-response/descriptor frame with the combined
# expression frame on CELLNAME (inner join, the pandas default).
# Known problems:
#   * this merge runs out of memory;
#   * merged_cell_exp is only created by the commented-out pd.concat cell, so
#     this raises NameError unless that cell is re-enabled and run first.
merged = merged.merge(
    merged_cell_exp,
    how='inner',
    on='CELLNAME',
)

NameError: name 'merged_cell_exp' is not defined

In [36]:
# Display merged (pandas renders a truncated preview).
merged

Unnamed: 0,GDSC,CELLNAME,GROWTH,LOG_CONCENTRATION,MW,AMW,Sv,Se,Sp,Si,...,Psychotic-50,Hypertens-80,Hypertens-50,Hypnotic-80,Hypnotic-50,Neoplastic-80,Neoplastic-50,Infective-80,Infective-50,NSC
0,1,ACH-000002,0.701901,4.834112,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,5.95186,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059,
1,1,ACH-000002,0.705002,3.350517,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,5.95186,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059,
2,1,ACH-000004,-1.656763,1.199252,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,5.95186,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059,
3,1,ACH-000004,-1.655299,0.831239,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,5.95186,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059,
4,1,ACH-000006,-0.318435,3.261706,-0.425792,-0.344160,-0.294356,-0.267755,-0.302332,-0.257365,...,5.95186,1.092858,-0.471195,-0.338355,-0.12674,1.034038,-0.393896,1.128206,-0.331059,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1597666,,LE:MOLT_4,95.000000,-5.000000,0.053360,-0.532934,0.394089,0.199215,0.379374,0.175546,...,,,,,,,,,,661212
1597667,,BR:MDA_MB_231,92.000000,-5.000000,-0.823669,0.440049,-1.044057,-0.844298,-1.095882,-0.843958,...,,,,,,,,,,661803
1597668,,RE:SN12C,93.000000,-5.000000,-1.004372,-0.281393,-1.007402,-0.903175,-1.053436,-0.871074,...,,,,,,,,,,169547
1597669,,LE:K_562,91.000000,-5.000000,-1.004372,-0.281393,-1.007402,-0.903175,-1.053436,-0.871074,...,,,,,,,,,,169547


### cuDF version

In [40]:
import cudf as cu

ModuleNotFoundError: No module named 'cudf'

#### What is P1B3 doing?

In [28]:
# Alias dose_response_gdsc under a short name for the walkthrough below.
# Plain assignment binds the same object; no copy is made.
df = dose_response_gdsc

In [29]:
# Turn the current index into regular column(s) and renumber the rows from 0.
# reset_index returns a new frame, so the object df previously pointed to is
# left unchanged.
df = df.reset_index(drop=False)

In [30]:
# Inner-join against a key-only slice of the CCLE expression frame: rows whose
# CELLNAME does not appear in cell_exp_ccle are dropped, and no expression
# columns are added. Needs cell_exp_ccle from the load_cellline_expressions
# cell; running this first on a fresh kernel raises NameError.
df = df.merge(
    cell_exp_ccle.loc[:, ['CELLNAME']],
    how='inner',
    on='CELLNAME',
)

NameError: name 'cell_exp_ccle' is not defined

In [59]:
# Display df after the CELLNAME merge.
df

Unnamed: 0,GDSC,CELLNAME,GROWTH,LOG_CONCENTRATION
0,1,ACH-000002,0.701901,4.834112
1,1,ACH-000002,0.705002,3.350517
2,3,ACH-000002,0.381402,-2.102626
3,3,ACH-000002,0.381976,-1.457461
4,5,ACH-000002,-0.050878,3.494382
...,...,...,...,...
320179,1529,ACH-001653,0.631494,3.424556
320180,1529,ACH-001653,-0.586940,-0.715543
320181,1529,ACH-001653,0.708302,2.373593
320182,1530,ACH-001653,-0.024736,6.645494


In [60]:
# Inner-join against a key-only slice of the drug frame: rows whose GDSC id
# does not appear in drug_desc_ccle are dropped, and no further columns are
# added.
df = df.merge(
    drug_desc_ccle.loc[:, ['GDSC']],
    how='inner',
    on='GDSC',
)

In [61]:
# Display df after the GDSC merge.
df

Unnamed: 0,GDSC,CELLNAME,GROWTH,LOG_CONCENTRATION
0,1,ACH-000002,0.701901,4.834112
1,1,ACH-000002,0.705002,3.350517
2,1,ACH-000004,-1.656763,1.199252
3,1,ACH-000004,-1.655299,0.831239
4,1,ACH-000006,-0.318435,3.261706
...,...,...,...,...
320179,1203,ACH-000895,-0.270720,3.428846
320180,1203,ACH-000995,-1.853437,2.203236
320181,1203,ACH-000995,-1.851544,1.527169
320182,1203,ACH-001653,0.136026,5.662258
