In [1]:
import pandas as pd
import keras
import numpy as np
import time

Using Theano backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Parsl backend init

Here we load a barebones remote execution backend for parsl to form a decent baseline that includes all the costs of remote function instantiation.

In [2]:
import parsl
from parsl import python_app
from parsl.configs.htex_local import config
parsl.load(config)

<parsl.dataflow.dflow.DataFlowKernel at 0x7f5318979780>

In [3]:
# We do not need this import here, but we should ideally be loading all the apps from a separate
# module
import main

In [4]:
print("Loading all data available")
smiles = pd.read_csv("train.csv", nrows=1584).iloc[:,0].tolist()
print("Total of {} available".format(len(smiles)))

Loading all data available
Total of 1584 available


In [5]:
@python_app
def app_compute_descript_batches(smile_list):
    """ Takes a list of smiles and returns a corresponding list of descs.
    """
    from mordred import Calculator, descriptors
    from rdkit import Chem
    import numpy as np
    import pickle
    # this object doesn't need to be created everytime. Can make global I think?                                                                                                                                    
    calc = Calculator(descriptors, ignore_3D=True)

    results_list = []
    for smile in smile_list:
        #read smiles                                                                                                                                                                                                    
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            print("Error processing mol")
            result = None
        else:
            descs = calc(mol)
            result = pickle.dumps(np.array(descs).flatten().astype(np.float32))
            
        results_list.append(result)

    return results_list

In [7]:
chunksize = 100
proc_chunks = {}

for i in range(1, len(smiles), chunksize):    
    chunk = smiles[i:i+chunksize]
    r = app_compute_descript_batches(chunk)
    proc_chunks[i] = r        

In [8]:
print(proc_chunks)

{1: <AppFuture super=<AppFuture at 0x7f53140b3c18 state=pending>>, 101: <AppFuture super=<AppFuture at 0x7f53140b3a20 state=pending>>, 201: <AppFuture super=<AppFuture at 0x7f53140b3908 state=pending>>, 301: <AppFuture super=<AppFuture at 0x7f53140b3da0 state=pending>>, 401: <AppFuture super=<AppFuture at 0x7f53140b3fd0 state=pending>>, 501: <AppFuture super=<AppFuture at 0x7f5359597358 state=pending>>, 601: <AppFuture super=<AppFuture at 0x7f5314067128 state=pending>>, 701: <AppFuture super=<AppFuture at 0x7f5314067240 state=pending>>, 801: <AppFuture super=<AppFuture at 0x7f53140672b0 state=pending>>, 901: <AppFuture super=<AppFuture at 0x7f53140673c8 state=pending>>, 1001: <AppFuture super=<AppFuture at 0x7f5314067518 state=pending>>, 1101: <AppFuture super=<AppFuture at 0x7f5314067668 state=pending>>, 1201: <AppFuture super=<AppFuture at 0x7f53140677b8 state=pending>>, 1301: <AppFuture super=<AppFuture at 0x7f5314067908 state=pending>>, 1401: <AppFuture super=<AppFuture at 0x7f5314

In [None]:
# Let's first time the cost of running num_items_to_load in sequence
start = time.time()
all_results = []
for smile in smiles:
    r = compute_descript(smile)
    all_results.append(r)
    
delta = time.time() - start
print(f"Seconds elapsed : {delta}")

In [None]:
start = time.time()
x = smiles[0]
r = compute_descript_batches(smiles)
print("Completed ", len(r))

delta = time.time() - start
print(f"Seconds elapsed : {delta}")

In [None]:
# Looks like there's some small gain from batching, this is most likely from avoiding the 
# module load costs

In [None]:
start = time.time()
x = smiles[0]
r = app_compute_descript_batches(smiles).result()

print("Completed ", len(r))

delta = time.time() - start
print(f"Seconds elapsed : {delta}")

In [None]:
# Cost of shipping serialized buffers appears to only add a ~4% overhead