In [1]:
import pandas as pd
import keras
import numpy as np
import time

Using Theano backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [8]:
# Parsl setup: bring in the `python_app` decorator used by the app definitions
# in this notebook and the `htex_local` configuration from `parsl.configs`.
import parsl
from parsl import python_app
from parsl.configs.htex_local import config
# Kept as the cell's last expression so the object returned by `parsl.load`
# is displayed as the cell output.
parsl.load(config)

<parsl.dataflow.dflow.DataFlowKernel at 0x7f92650fad30>

In [None]:
import main

In [2]:
# Number of rows to read from the training CSV for this benchmark.
num_items_to_load = 100
print(f"Loading {num_items_to_load} csv items as data")

# Read only the first `num_items_to_load` rows, then keep the first column
# as a plain Python list (used as the SMILES inputs by the cells below).
train_head = pd.read_csv("train.csv", nrows=num_items_to_load)
first_column = train_head.iloc[:, 0]
smiles = first_column.tolist()

Loading 100 csv items as data


In [3]:
def compute_descript(smile):
    """Compute the Mordred descriptor vector for a single SMILES string.

    Parameters
    ----------
    smile
        SMILES string handed to ``Chem.MolFromSmiles``.

    Returns
    -------
    numpy.ndarray or None
        The calculator output flattened and cast to ``float32``, or ``None``
        (after printing a message) when RDKit returns no molecule.
    """
    from mordred import Calculator, descriptors
    from rdkit import Chem
    import numpy as np

    # A fresh calculator is built on every call (3D descriptors disabled).
    # It could be shared between calls, but this per-call cost is part of
    # what the sequential benchmark in this notebook measures.
    descriptor_calc = Calculator(descriptors, ignore_3D=True)

    # Parse the SMILES string; RDKit signals a parse failure with None.
    molecule = Chem.MolFromSmiles(smile)
    if molecule is not None:
        raw_values = descriptor_calc(molecule)
        flat_values = np.array(raw_values).flatten()
        return flat_values.astype(np.float32)

    print("Error processing mol")
    return None


In [10]:
def compute_descript_batches(smile_list):
    """Compute pickled descriptor vectors for a list of SMILES strings.

    This is the plain, synchronous batch baseline: it runs in the calling
    process and returns the list directly. It is intentionally NOT a Parsl
    app -- the timing cell that calls it takes ``len()`` of the return value,
    which fails on the future an app would return.
    ``app_compute_descript_batches`` is the Parsl-wrapped counterpart with
    the same body.

    Parameters
    ----------
    smile_list
        Iterable of SMILES strings.

    Returns
    -------
    list
        One entry per input, in order: the ``pickle.dumps`` bytes of the
        flattened ``float32`` descriptor array, or ``None`` for a string
        RDKit could not parse (a message is printed for each failure).
    """
    from mordred import Calculator, descriptors
    from rdkit import Chem
    import numpy as np
    import pickle

    # Built once per batch rather than once per molecule.
    calc = Calculator(descriptors, ignore_3D=True)

    results_list = []
    for smile in smile_list:
        # Parse the SMILES string; RDKit returns None on failure.
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            print("Error processing mol")
            result = None
        else:
            descs = calc(mol)
            # Serialized exactly as in the Parsl app so the two timings
            # differ only in where the work runs.
            result = pickle.dumps(np.array(descs).flatten().astype(np.float32))

        results_list.append(result)

    return results_list

@python_app
def app_compute_descript_batches(smile_list):
    """Parsl app: compute pickled descriptor vectors for a list of SMILES.

    The imports live inside the body so the function carries everything it
    needs when it is executed as a ``python_app``.

    Returns a list aligned with ``smile_list``: the ``pickle.dumps`` bytes of
    each flattened ``float32`` descriptor array, or ``None`` where RDKit
    could not parse the SMILES string (a message is printed in that case).
    """
    from mordred import Calculator, descriptors
    from rdkit import Chem
    import numpy as np
    import pickle

    # One calculator for the whole batch (3D descriptors disabled).
    descriptor_calc = Calculator(descriptors, ignore_3D=True)

    serialized = []
    for smiles_string in smile_list:
        molecule = Chem.MolFromSmiles(smiles_string)
        if molecule is None:
            # Keep a placeholder so the output stays aligned with the input.
            print("Error processing mol")
            serialized.append(None)
            continue

        values = np.array(descriptor_calc(molecule)).flatten().astype(np.float32)
        serialized.append(pickle.dumps(values))

    return serialized

In [4]:
# Baseline: time computing the descriptors for all num_items_to_load SMILES
# strings one at a time, in sequence.
start = time.time()
all_results = [compute_descript(smile) for smile in smiles]
delta = time.time() - start
print(f"Seconds elapsed : {delta}")



Seconds elapsed : 29.05618143081665


In [7]:
# Time the batched computation: a single call covering the whole SMILES list.
# The return value is used directly as a list (see len(r) below), so
# compute_descript_batches must be a plain, synchronous function for this cell.
start = time.time()
x = smiles[0]  # NOTE(review): x is not used in this cell -- confirm it can be dropped
r = compute_descript_batches(smiles)
print("Completed ", len(r))

# The print above falls inside the measured interval.
delta = time.time() - start
print(f"Seconds elapsed : {delta}")

Completed  100
Seconds elapsed : 24.257925987243652


In [None]:
# Batching gives a small gain over the sequential run (~24.3 s vs ~29.1 s).
# This most likely comes from avoiding the per-call module load costs.

In [11]:
# Time the same batch when it is submitted as a Parsl app. Calling the app
# returns a future; .result() waits for it and yields the returned list, so
# the measured interval covers submission, execution and result transfer.
start = time.time()
x = smiles[0]  # NOTE(review): x is not used in this cell -- confirm it can be dropped
r = app_compute_descript_batches(smiles).result()

print("Completed ", len(r))

# The print above falls inside the measured interval.
delta = time.time() - start
print(f"Seconds elapsed : {delta}")

Completed  100
Seconds elapsed : 25.2049822807312


In [None]:
# Running the batch as a Parsl app (which ships the serialized buffers back) appears to add only ~4% overhead (~25.2 s vs ~24.3 s)