Skip to content
Browse files

added script

  • Loading branch information
undeadpixel committed Jul 1, 2019
1 parent eb26691 commit f0951de3b11ab376b6790d226cc122c3218a95f6
Showing with 98 additions and 4 deletions.
  1. +53 −0
  2. +4 −4 utils/chem.py
  3. +41 −0 utils/spark.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python

import argparse
import os
import functools

import utils.log as ul
import utils.chem as uc
import utils.spark as us

def parse_args():
    """
    Parses input arguments.

    The options are read from ``sys.argv``. ``--input-smi-path`` and
    ``--output-smi-folder-path`` are required; the remaining options have defaults.

    :return : An ``argparse.Namespace`` with the attributes ``input_smi_path``,
        ``output_smi_folder_path``, ``random_type``, ``num_files`` and ``num_partitions``.
    """
    parser = argparse.ArgumentParser(description="Creates many datasets.")
    parser.add_argument("--input-smi-path", "-i", help="Path to a SMILES file to convert.", type=str, required=True)
    parser.add_argument("--output-smi-folder-path", "-o",
                        help="Path to a folder that will have the converted SMILES files.", type=str, required=True)
    # Adjacent literals instead of a backslash inside the string, so that the
    # source indentation never leaks into the help text.
    parser.add_argument("--random-type", "-r",
                        help="Type of the converted SMILES TYPES=(restricted,unrestricted) "
                             "[DEFAULT: restricted].",
                        type=str, default="restricted")
    # The opening ``parser.add_argument(`` of this option was missing, which left
    # its arguments dangling and the whole function unparsable.
    parser.add_argument("--num-files", "-n",
                        help="Number of SMILES files to create (numbered from 000 ...) [DEFAULT: 1]",
                        type=int, default=1)
    parser.add_argument("--num-partitions", "-p", help="Number of SPARK partitions to use [DEFAULT: 1000]",
                        type=int, default=1000)

    return parser.parse_args()

def main():
"""
Main function.

Parses the command-line arguments, loads the input SMILES file as a Spark RDD
repartitioned to ``--num-partitions``, makes sure the output folder exists and
then opens one ``NNN.smi`` file per requested output (``--num-files``).

NOTE(review): the block is incomplete as shown. The RDD expression ends in a
dangling line continuation and the innermost loop stops at ``for smi in``, so
the steps that transform the RDD and write the output are not visible here and
must be restored from the original script.
"""
args = parse_args()

# SC is a module-level name; it is bound inside the ``__main__`` guard of this script.
# NOTE(review): the trailing backslash suggests further chained calls followed
# ``repartition`` -- confirm against the original script.
mols_rdd = SC.textFile(args.input_smi_path) \
.repartition(args.num_partitions) \

# Create the output folder; an already existing folder is not an error.
os.makedirs(args.output_smi_folder_path, exist_ok=True)

# Bind the randomization type once so the callable only needs the molecule argument.
smiles_func = functools.partial(uc.randomize_smiles, random_type=args.random_type)
for i in range(args.num_files):
# File names are the zero-padded index: 000.smi, 001.smi, ...
with open("{}/{:03d}.smi".format(args.output_smi_folder_path, i), "w+") as out_file:
# NOTE(review): the iterable and the body of this loop are missing from this view.
for smi in


LOG = ul.get_logger("create_randomized_smiles")
if __name__ == "__main__":
    # main() reads SC as a module-level global, so the Spark session and context
    # have to be bound before it is called.
    SPARK, SC = us.SparkSessionSingleton.get("create_randomized_smiles")
    # The guard previously bound the session but never started the script.
    main()
@@ -85,19 +85,19 @@ def to_smiles(mol):
return rkc.MolToSmiles(mol, isomericSmiles=False)

def randomize_smiles(mol, random_type="order"):
def randomize_smiles(mol, random_type="restricted"):
Returns a random SMILES given a SMILES of a molecule.
:param mol: A Mol object
:param random_type: The type (branching, order) of randomization performed.
:param random_type: The type (unrestricted, restricted) of randomization performed.
:return : A random SMILES string of the same molecule or None if the molecule is invalid.
if not mol:
return None

if random_type == "branching":
if random_type == "unrestricted":
return rkc.MolToSmiles(mol, canonical=False, doRandom=True, isomericSmiles=False)
if random_type == "order":
if random_type == "restricted":
new_atom_order = list(range(mol.GetNumHeavyAtoms()))
random_mol = rkc.RenumberAtoms(mol, newOrder=new_atom_order)
@@ -0,0 +1,41 @@
"""Spark util functions."""

import pyspark.sql as ps

class SparkSessionSingleton:
    """
    Manages unique spark sessions for each app name.

    The class is used as a namespace only: it cannot be instantiated and all
    state lives in the class-level ``SESSIONS`` cache.
    """

    # Cache of the sessions created so far: app name -> (session, context).
    SESSIONS = {}

    def __init__(self):
        raise NotImplementedError("SparkSessionSingleton is not instantiable.")

    @classmethod
    def get(cls, app_name, params_func=None):
        """
        Retrieves (or creates) a session with a given app name.

        :param app_name: Name of the Spark application; also used as the cache key.
        :param params_func: Optional callable that receives the session builder before
            the session is created. It may configure the builder in place or return
            a builder to use instead.
        :return : A tuple ``(session, context)`` for the given app name.
        """
        if app_name not in cls.SESSIONS:
            # NOTE(review): ``appName`` is reconstructed from the docstring; the
            # original continuation line after ``builder`` was lost.
            builder = ps.SparkSession.builder.appName(app_name)
            if params_func:
                # Accept both conventions: in-place configuration (returns None)
                # and returning the configured builder.
                configured = params_func(builder)
                if configured is not None:
                    builder = configured
            session = builder.getOrCreate()
            context = session.sparkContext

            cls.SESSIONS[app_name] = (session, context)
        return cls.SESSIONS[app_name]

    @classmethod
    def cleanup(cls):
        """
        Closes all sessions.

        Every cached session is stopped and the cache is emptied afterwards.
        """
        for session, _ in cls.SESSIONS.values():
            session.stop()
        cls.SESSIONS = {}

0 comments on commit f0951de

Please sign in to comment.
You can’t perform that action at this time.