In [1]:
import pandas as pd
import keras
import numpy as np
import time

Using Theano backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Parsl backend init

Here we load a barebones remote execution backend for parsl to form a decent baseline that includes all the costs of remote function instantiation.

In [2]:
import parsl
from parsl import python_app
from parsl.configs.htex_local import config
parsl.load(config)

<parsl.dataflow.dflow.DataFlowKernel at 0x7efcf3991780>

In [3]:
# We do not need this import here, but we should ideally be loading 
# all the apps from a separate module
# import main

### Load the Smiles data
Here we load only 158 lines from the csv file, to avoid burning the laptop. Once we replace the config
with a config for theta, we can load and launch the whole file

In [4]:
print("Loading all data available")
smiles = pd.read_csv("train.csv", nrows=158).iloc[:,0].tolist()
print("Total of {} available".format(len(smiles)))

Loading all data available
Total of 158 available


### Update descript to process batches

We want the descript step to consume batches of smiles to minimize the task launch costs.
Here we add a `@python_app` decorator that marks this function for remote/distributed execution.

Key point to note is that we add a special `walltime=<int:seconds>` kwarg, that causes the function to raise a `parsl.app.errors.AppTimeout` exception if the function runs beyond the set walltime.

In [5]:
@python_app
def app_compute_descript_batches(smile_list, walltime=1):
    """ Takes a list of smiles and returns a corresponding list of descs.
    """
    from mordred import Calculator, descriptors
    from rdkit import Chem
    import numpy as np
    import pickle
    # this object doesn't need to be created everytime. Can make global I think?                                                                                                                                    
    calc = Calculator(descriptors, ignore_3D=True)

    results_list = []
    for smile in smile_list:
        #read smiles                                                                                                                                                                                                    
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            print("Error processing mol")
            result = None
        else:
            descs = calc(mol)
            result = pickle.dumps(np.array(descs).flatten().astype(np.float32))
            
        results_list.append(result)

    return results_list

### Launch tasks on chunks of data

Parsl does batching internally, but we can do better!

We have an estimate of the runtime for a batch of N tasks, and we use that to our advantage by creating
chunks of "smiles" that are dispatched to the now batched, `app_compute_descript_batches` function.

`chunksize` is configurable. In a smarter version we could tie `chunksize` and `walltime` together.

In [12]:
def launch_tasks(data, chunksize=10):
    proc_chunks = {}

    for i in range(1, len(data), chunksize):    
        chunk = data[i:i+chunksize]
        r = app_compute_descript_batches(chunk)
        proc_chunks[i] = r
    return proc_chunks

In [13]:
# Initial launch of all tasks
proc_chunks = launch_tasks(smiles)

### Capture and report messages on failed chunks

This is just a demonstration of how batches of smiles that exceed the runtime limits will simply raise a python exception when the "future" that represents the batch is asked to produce the result.

In [14]:
# Wait for the results
from parsl.app.errors import AppTimeout

for key in proc_chunks:
    try:
        x = proc_chunks[key].result()
    except AppTimeout as e:
        print("Caught timeout for chunk index: {}:{}".format(key,key+chunksize))

Caught timeout for chunk index: 61:71
Caught timeout for chunk index: 81:91
Caught timeout for chunk index: 91:101
Caught timeout for chunk index: 131:141
Caught timeout for chunk index: 151:161


In [16]:
print(proc_chunks)

{1: <AppFuture super=<AppFuture at 0x7efd2c1345f8 state=finished returned list>>, 11: <AppFuture super=<AppFuture at 0x7efcf00a6898 state=finished returned list>>, 21: <AppFuture super=<AppFuture at 0x7efcf00a65f8 state=finished returned list>>, 31: <AppFuture super=<AppFuture at 0x7efcf00a6cf8 state=finished returned list>>, 41: <AppFuture super=<AppFuture at 0x7efcf00a6eb8 state=finished returned list>>, 51: <AppFuture super=<AppFuture at 0x7efcf00a6ba8 state=finished returned list>>, 61: <AppFuture super=<AppFuture at 0x7efcf003b208 state=finished raised AppTimeout>>, 71: <AppFuture super=<AppFuture at 0x7efcf00a6fd0 state=finished returned list>>, 81: <AppFuture super=<AppFuture at 0x7efcf00a68d0 state=finished raised AppTimeout>>, 91: <AppFuture super=<AppFuture at 0x7efcf003be10 state=finished raised AppTimeout>>, 101: <AppFuture super=<AppFuture at 0x7efcf003bba8 state=finished returned list>>, 111: <AppFuture super=<AppFuture at 0x7efcf003ba20 state=finished returned list>>, 12

### Handle the failed batches

For the failed batches, we call `launch_tasks` again, but in this instance we call it with a `chunksize=1` so that
we can pinpoint which `smile` is non-convergent.

In [17]:
unpacked = {}
for key in proc_chunks:
    try:
        x = proc_chunks[key].result()
    except AppTimeout as e:        
        print("Launching unpacked tasks: {}:{}".format(key,key+chunksize))
        unpacked[key] = launch_tasks(smiles[key:key+chunksize], chunksize=1)

Launching unpacked tasks: 61:71
Launching unpacked tasks: 81:91
Launching unpacked tasks: 91:101
Launching unpacked tasks: 131:141
Launching unpacked tasks: 151:161


### Display specific smile that failed to terminate

In [24]:
for key in unpacked:
    print("Peeking inside batch {}:{} ------------".format(key, key+chunksize))
    for item in unpacked[key]:
        print("   Item {}".format(item))
        print(unpacked[key][item])
    print("---------------------------------------")

Peeking inside batch 61:71 ------------
   Item 1
<AppFuture super=<AppFuture at 0x7efcf0076da0 state=finished returned list>>
   Item 2
<AppFuture super=<AppFuture at 0x7efce9fa1518 state=finished returned list>>
   Item 3
<AppFuture super=<AppFuture at 0x7efcf0076ef0 state=finished returned list>>
   Item 4
<AppFuture super=<AppFuture at 0x7efce9fa1860 state=finished raised AppTimeout>>
   Item 5
<AppFuture super=<AppFuture at 0x7efce9fa8d68 state=finished returned list>>
   Item 6
<AppFuture super=<AppFuture at 0x7efcf0076f60 state=finished returned list>>
   Item 7
<AppFuture super=<AppFuture at 0x7efce9fa8908 state=finished returned list>>
   Item 8
<AppFuture super=<AppFuture at 0x7efce9fa13c8 state=finished returned list>>
   Item 9
<AppFuture super=<AppFuture at 0x7efce9fa8ac8 state=finished returned list>>
---------------------------------------
Peeking inside batch 81:91 ------------
   Item 1
<AppFuture super=<AppFuture at 0x7efce9fa1908 state=finished returned list>>
   Ite