In [1]:
import pandas as pd
import keras
import numpy as np
import time

Using Theano backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Parsl backend init

Here we load a barebones remote execution backend for parsl to form a decent baseline that includes all the costs of remote function instantiation.

In [2]:
import parsl
from parsl import python_app
from parsl.configs.htex_local import config

# Most of the app that hit the timeout will complete if retried.
# but for this demo, I'm not setting retries.
# config.retries = 2
parsl.load(config)

<parsl.dataflow.dflow.DataFlowKernel at 0x7fa8a23ba908>

In [3]:
# We do not need this import here, but we should ideally be loading 
# all the apps from a separate module
# import main

### Load the Smiles data
Here we load only 158 lines from the csv file, to avoid burning the laptop. Once we replace the config
with a config for theta, we can load and launch the whole file

In [4]:
print("Loading all data available")
smiles = pd.read_csv("train.csv", nrows=158).iloc[:,0].tolist()
print("Total of {} available".format(len(smiles)))

Loading all data available
Total of 158 available


### Update descript to process batches

We want the descript step to consume batches of smiles to minimize the task launch costs.
Here we add a `@python_app` decorator that marks this function for remote/distributed execution.

Key point to note is that we add a special `walltime=<int:seconds>` kwarg, that causes the function to raise a `parsl.app.errors.AppTimeout` exception if the function runs beyond the set walltime.

In [5]:
@python_app
def app_compute_descript_batches(smile_list, walltime=1):
    """ Takes a list of smiles and returns a corresponding list of descs.
    """
    from mordred import Calculator, descriptors
    from rdkit import Chem
    import numpy as np
    import pickle
    # this object doesn't need to be created everytime. Can make global I think?                                                                                                                                    
    calc = Calculator(descriptors, ignore_3D=True)

    results_list = []
    for smile in smile_list:
        #read smiles                                                                                                                                                                                                    
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            print("Error processing mol")
            result = None
        else:
            descs = calc(mol)
            result = pickle.dumps(np.array(descs).flatten().astype(np.float32))
            
        results_list.append(result)

    return results_list


# This will change, but the interface will not.
@python_app
def combine_drug_features_with_cell_features(vec_list):
    from sklearn.preprocessing import Imputer
    import numpy as np
    import pickle
    results = []
    for b_vec in vec_list:
        vec = pickle.loads(b_vec)
        vec_prime = np.zeros((60, vec.shape[0]))
        vec_prime[0] = vec

        #will need to impute missing values                                                                                
        imp = Imputer()
        vec_prime = imp.fit_transform(vec_prime)
        results.append(pickle.dumps(vec_prime)) # <-- Another serialization pain point

    return results

### Launch tasks on chunks of data

Parsl does batching internally, but we can do better!

We have an estimate of the runtime for a batch of N tasks, and we use that to our advantage by creating
chunks of "smiles" that are dispatched to the now batched, `app_compute_descript_batches` function.

`chunksize` is configurable. In a smarter version we could tie `chunksize` and `walltime` together.

In [6]:
def launch_tasks(data, chunksize=10):
    proc_chunks = {}
    result_chunks = {}
    for i in range(1, len(data), chunksize):    
        chunk = data[i:i+chunksize]
        descript_vecs_list = app_compute_descript_batches(chunk)
        training_batch_list = combine_drug_features_with_cell_features(descript_vecs_list)
        proc_chunks[i] = descript_vecs_list
        result_chunks[i] = training_batch_list
    return proc_chunks, result_chunks

In [7]:
# Initial launch of all tasks
proc_chunks, result_chunks = launch_tasks(smiles)

### Capture and report messages on failed chunks

This is just a demonstration of how batches of smiles that exceed the runtime limits will simply raise a python exception when the "future" that represents the batch is asked to produce the result.

In [8]:
# Wait for the results
from parsl.app.errors import AppTimeout
chunksize=10
for key in proc_chunks:
    try:
        x = proc_chunks[key].result()
    except AppTimeout as e:
        print("Caught timeout for chunk index: {}:{}".format(key,key+chunksize))

Caught timeout for chunk index: 71:81
Caught timeout for chunk index: 111:121
Caught timeout for chunk index: 131:141


In [9]:
print(proc_chunks)

{1: <AppFuture super=<AppFuture at 0x7fa89f3faa90 state=finished returned list>>, 11: <AppFuture super=<AppFuture at 0x7fa89cadaa58 state=finished returned list>>, 21: <AppFuture super=<AppFuture at 0x7fa89cada6a0 state=finished returned list>>, 31: <AppFuture super=<AppFuture at 0x7fa89cada710 state=finished returned list>>, 41: <AppFuture super=<AppFuture at 0x7fa89cadab70 state=finished returned list>>, 51: <AppFuture super=<AppFuture at 0x7fa89cadad30 state=finished returned list>>, 61: <AppFuture super=<AppFuture at 0x7fa89cadada0 state=finished returned list>>, 71: <AppFuture super=<AppFuture at 0x7fa89cadaa20 state=finished raised AppTimeout>>, 81: <AppFuture super=<AppFuture at 0x7fa89cb084a8 state=finished returned list>>, 91: <AppFuture super=<AppFuture at 0x7fa89cb084e0 state=finished returned list>>, 101: <AppFuture super=<AppFuture at 0x7fa89cb08cf8 state=finished returned list>>, 111: <AppFuture super=<AppFuture at 0x7fa89cb08208 state=finished raised AppTimeout>>, 121: <

### Handle the failed batches

For the failed batches, we call `launch_tasks` again, but in this instance we call it with a `chunksize=1` so that
we can pinpoint which `smile` is non-convergent.

In [10]:
unpacked = {}
unpacked_tail = {}
for key in proc_chunks:
    try:
        x = proc_chunks[key].result()
    except AppTimeout as e:        
        print("Launching unpacked tasks: {}:{}".format(key,key+chunksize))
        unpacked[key], unpacked_tail[key] = launch_tasks(smiles[key:key+chunksize], chunksize=1)

Launching unpacked tasks: 71:81
Launching unpacked tasks: 111:121
Launching unpacked tasks: 131:141


### Display specific smile that failed to terminate

In [11]:
for key in unpacked:
    print("Peeking inside batch {}:{} ------------".format(key, key+chunksize))
    for item in unpacked[key]:
        print("   Item {}".format(item))
        print(unpacked[key][item])
    print("---------------------------------------")

Peeking inside batch 71:81 ------------
   Item 1
<AppFuture super=<AppFuture at 0x7fa89ca62cf8 state=pending>>
   Item 2
<AppFuture super=<AppFuture at 0x7fa89ca76c18 state=pending>>
   Item 3
<AppFuture super=<AppFuture at 0x7fa89ca62f98 state=pending>>
   Item 4
<AppFuture super=<AppFuture at 0x7fa89ca76278 state=pending>>
   Item 5
<AppFuture super=<AppFuture at 0x7fa89ca761d0 state=pending>>
   Item 6
<AppFuture super=<AppFuture at 0x7fa89ca80d68 state=pending>>
   Item 7
<AppFuture super=<AppFuture at 0x7fa89ca76908 state=pending>>
   Item 8
<AppFuture super=<AppFuture at 0x7fa89ca800f0 state=pending>>
   Item 9
<AppFuture super=<AppFuture at 0x7fa89ca76e48 state=pending>>
---------------------------------------
Peeking inside batch 111:121 ------------
   Item 1
<AppFuture super=<AppFuture at 0x7fa89ca80908 state=pending>>
   Item 2
<AppFuture super=<AppFuture at 0x7fa89ca8b940 state=pending>>
   Item 3
<AppFuture super=<AppFuture at 0x7fa89ca80748 state=pending>>
   Item 4
<App

In [14]:
print(unpacked_tail)
for batch in unpacked_tail:
    for item in unpacked_tail[batch]:
        print(unpacked_tail[batch][item])

{71: {1: <AppFuture super=<AppFuture at 0x7fa89ca76c50 state=finished returned list>>, 2: <AppFuture super=<AppFuture at 0x7fa89ca62c50 state=finished returned list>>, 3: <AppFuture super=<AppFuture at 0x7fa89ca76048 state=finished returned list>>, 4: <AppFuture super=<AppFuture at 0x7fa89ca76860 state=finished returned list>>, 5: <AppFuture super=<AppFuture at 0x7fa89ca800b8 state=finished returned list>>, 6: <AppFuture super=<AppFuture at 0x7fa89ca76518 state=finished returned list>>, 7: <AppFuture super=<AppFuture at 0x7fa89ca806a0 state=finished returned list>>, 8: <AppFuture super=<AppFuture at 0x7fa89ca76080 state=finished returned list>>, 9: <AppFuture super=<AppFuture at 0x7fa89ca80080 state=finished returned list>>}, 111: {1: <AppFuture super=<AppFuture at 0x7fa89ca8b0f0 state=finished returned list>>, 2: <AppFuture super=<AppFuture at 0x7fa89ca80a90 state=finished returned list>>, 3: <AppFuture super=<AppFuture at 0x7fa89ca8b0b8 state=finished returned list>>, 4: <AppFuture s

In [17]:
x = unpacked_tail[71][1].result()

In [24]:
import pickle
r = pickle.loads(x[0])
print(r)

[[ 14.15871429  13.76257515   0.         ... 107.           8.66666698
    4.97222233]
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 ...
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]]
