In [3]:
import os
import gzip

from rdkit.Chem import ForwardSDMolSupplier, MolToSmiles, TDTMolSupplier
from chemreader.readers import Smiles
from chemreader.writers import GraphWriter

In [6]:
def determine_file_format(path):
    """Return the file format implied by the extension of ``path``.

    The format is the last dot-separated component of the base name. When
    that component is a gzip suffix (``gz`` or ``gzip``), the component
    before it is returned instead, so ``foo.sdf.gz`` yields ``"sdf"``.

    Args:
        path: Path to the file. Only the base name is inspected; the file is
            never opened.

    Returns:
        The format as a lower-case string, e.g. ``"sdf"`` or ``"smi"``.
    """
    # Lower-case first so that "X.SDF.GZ" and "x.sdf.gz" are treated alike.
    filename_components = os.path.basename(path).lower().split(".")
    format_ = filename_components[-1]
    # Only look through the compression suffix when something precedes it;
    # a bare name such as "gz" would otherwise raise IndexError.
    if format_ in ("gz", "gzip") and len(filename_components) > 1:
        format_ = filename_components[-2]
    return format_


def save_decoys(path):
    """Read decoy molecules from ``path`` and write them out with GraphWriter.

    Supported inputs are SDF (``.sdf``) and SMILES (``.smi``) files, either
    plain or gzip-compressed (``.gz`` / ``.gzip`` suffix). For SMILES files
    the first whitespace-separated token of every non-blank line is taken as
    the SMILES string. Duplicate SMILES are dropped, keeping the order in
    which they were first seen so repeated runs produce the same sequence.

    The output goes to ``<dirname(path)>/graphs/decoys`` and the writer is
    given the part of the file name before the first dot as ``prefix``.

    Args:
        path: Path to the decoy file.

    Raises:
        ValueError: If the file format is neither ``sdf`` nor ``smi``.
    """
    format_ = determine_file_format(path)
    # determine_file_format also accepts uncompressed names, so only go
    # through gzip when the file name actually carries a gzip suffix.
    is_gzipped = path.lower().endswith((".gz", ".gzip"))
    opener = gzip.open if is_gzipped else open

    smiles = []
    if format_ == "sdf":
        # The context manager closes the handle once the supplier is drained.
        with opener(path, "rb") as f:
            for mol in ForwardSDMolSupplier(f):
                # The supplier yields None for records it could not parse.
                if mol is not None:
                    smiles.append(MolToSmiles(mol))
    elif format_ == "smi":
        with opener(path, "rb") as f:
            for raw_line in f:
                fields = raw_line.decode().split()
                # Skip blank / whitespace-only lines instead of failing on
                # fields[0].
                if fields:
                    smiles.append(fields[0])
    else:
        raise ValueError(
            "Unsupported decoy file format {!r} for {!r}; "
            "expected 'sdf' or 'smi'.".format(format_, path)
        )

    # dict.fromkeys de-duplicates while preserving first-seen order, unlike
    # set(), whose iteration order for strings varies between interpreter runs.
    unique_smiles = list(dict.fromkeys(smiles))
    writer = GraphWriter([Smiles(s) for s in unique_smiles])
    prefix = os.path.basename(path).split(".")[0]
    save_path = os.path.join(os.path.dirname(path), "graphs", "decoys")
    writer.write(save_path, prefix=prefix)

In [7]:
# JAK1 decoys (gzipped SMILES file): save_decoys writes its output to the
# graphs/decoys directory next to the input file.
path = "../../data/JAK/JAK1_decoys.smi.gz"
save_decoys(path)

In [8]:
# JAK2 decoys (gzipped SDF file): save_decoys writes its output to the
# graphs/decoys directory next to the input file.
path = "../../data/JAK/JAK2_decoys.sdf.gz"
save_decoys(path)

In [None]:
# JAK3 decoys (gzipped SMILES file): save_decoys writes its output to the
# graphs/decoys directory next to the input file.
path = "../../data/JAK/JAK3_decoys.smi.gz"
save_decoys(path)