In [1]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://postgres@db:5432/postgres', echo=True)

In [2]:
from sqlalchemy.orm import sessionmaker
Session = sessionmaker(bind=engine)

In [3]:
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base(bind=engine)

In [4]:
from sqlalchemy import Column, Index, Integer, String
from razi.rdkit_postgresql.types import Mol, Bfp
from razi.rdkit_postgresql.functions import atompairbv_fp, torsionbv_fp, morganbv_fp
from rdkit import Chem


class Compound(Base):
    __tablename__ = 'compounds'

    id = Column(Integer, primary_key=True)
    name = Column(String)
    structure = Column(Mol)
    atompair = Column(Bfp)
    torsion = Column(Bfp)
    morgan = Column(Bfp)

    __table_args__ = (
        Index('compounds_structure', 'structure',
              postgresql_using='gist'),
        )

    def __init__(self, name, structure):
        self.name = name
        self.structure = structure
        if isinstance(self.structure, str):
            mol = Chem.MolFromSmiles(self.structure)
        else:
            mol = self.structure
        self.atompair = atompairbv_fp(self.structure)
        self.torsion = torsionbv_fp(self.structure)
        self.morgan = morganbv_fp(self.structure, 2)
#        self.atompair = atompairbv_fp(mol)
#        self.torsion = torsionbv_fp(mol)
#        self.morgan = morganbv_fp(mol, 2)

    def __repr__(self):
        if isinstance(self.structure, Chem.Mol):
            return '(%s) < %s >' % (self.name, Chem.MolToSmiles(self.structure))
        return '(%s) < %s >' % (self.name, self.structure)

In [5]:
Base.metadata.create_all()

2017-12-13 21:52:42,425 INFO sqlalchemy.engine.base.Engine select version()
2017-12-13 21:52:42,427 INFO sqlalchemy.engine.base.Engine {}
2017-12-13 21:52:42,432 INFO sqlalchemy.engine.base.Engine select current_schema()
2017-12-13 21:52:42,435 INFO sqlalchemy.engine.base.Engine {}
2017-12-13 21:52:42,442 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2017-12-13 21:52:42,446 INFO sqlalchemy.engine.base.Engine {}
2017-12-13 21:52:42,449 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2017-12-13 21:52:42,452 INFO sqlalchemy.engine.base.Engine {}
2017-12-13 21:52:42,455 INFO sqlalchemy.engine.base.Engine show standard_conforming_strings
2017-12-13 21:52:42,457 INFO sqlalchemy.engine.base.Engine {}
2017-12-13 21:52:42,463 INFO sqlalchemy.engine.base.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
20

In [None]:
!head -n3 chembl_23_chemreps.txt

In [None]:
from collections import namedtuple
Record = namedtuple('Record', 'chembl_id, smiles, inchi, inchi_key')

In [None]:
import csv
from rdkit import Chem

def read_chembldb(filepath, limit=0):

    with open(filepath, 'rt') as inputfile:
        reader = csv.reader(inputfile, delimiter='\t', skipinitialspace=True)
        next(reader) # skip header

        for count, record in enumerate(map(Record._make, reader), 1):

            smiles = record.smiles

            # skip problematic smiles
            if len(smiles) > 300: continue
            smiles = smiles.replace('=N#N','=[N+]=[N-]')
            smiles = smiles.replace('N#N=','[N-]=[N+]=')
            if not Chem.MolFromSmiles(smiles):
                continue

            yield count, record.chembl_id, smiles
            if count == limit:
                break

In [None]:
session = Session()
for count, chembl_id, smiles in read_chembldb('chembl_23_chemreps.txt', 100):
    compound = Compound(chembl_id, smiles)
    session.add(compound)
session.commit()

In [7]:
session = Session()

In [8]:
session.query(Compound).count()

2017-12-13 21:53:04,141 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2017-12-13 21:53:04,145 INFO sqlalchemy.engine.base.Engine SELECT count(*) AS count_1 
FROM (SELECT compounds.id AS compounds_id, compounds.name AS compounds_name, compounds.structure AS compounds_structure, compounds.atompair AS compounds_atompair, compounds.torsion AS compounds_torsion, compounds.morgan AS compounds_morgan 
FROM compounds) AS anon_1
2017-12-13 21:53:04,149 INFO sqlalchemy.engine.base.Engine {}


87

In [9]:
for compound in session.query(Compound)[:5]:
    print(compound)

2017-12-13 21:53:06,255 INFO sqlalchemy.engine.base.Engine SELECT compounds.id AS compounds_id, compounds.name AS compounds_name, mol_to_pkl(compounds.structure) AS compounds_structure, bfp_to_binary_text(compounds.atompair) AS compounds_atompair, bfp_to_binary_text(compounds.torsion) AS compounds_torsion, bfp_to_binary_text(compounds.morgan) AS compounds_morgan 
FROM compounds 
 LIMIT %(param_1)s
2017-12-13 21:53:06,258 INFO sqlalchemy.engine.base.Engine {'param_1': 5}
(CHEMBL153534) < Cc1cc(-c2csc(N=C(N)N)n2)cn1C >
(CHEMBL405398) < Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1 >
(CHEMBL503634) < COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O >
(CHEMBL503643) < CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1 >
(CHEMBL503865) < CC(=O)OC1C(C)=CC2OC(=O)C3(C)OC23C(OC(C)=O)C2C(C)(O)C(O)C=CC2(C)C(OC(C)=O)C1OC(C)=O >


In [10]:
subset = session.query(Compound)
subset = subset.filter(Compound.structure.hassubstruct('c1ccccc1'))
print(subset.count())
#for mol in subset[:5]:
#    print(mol)

2017-12-13 21:53:07,831 INFO sqlalchemy.engine.base.Engine SELECT count(*) AS count_1 
FROM (SELECT compounds.id AS compounds_id, compounds.name AS compounds_name, compounds.structure AS compounds_structure, compounds.atompair AS compounds_atompair, compounds.torsion AS compounds_torsion, compounds.morgan AS compounds_morgan 
FROM compounds 
WHERE compounds.structure @> mol_from_pkl(%(structure_1)s)) AS anon_1
2017-12-13 21:53:07,833 INFO sqlalchemy.engine.base.Engine {'structure_1': <memory at 0x7f9f8411c888>}
63


In [11]:
fp = morganbv_fp('c1ccccc1')
fp

<razi.rdkit_postgresql.functions.morganbv_fp at 0x7f9f71c356a0; morganbv_fp>

In [12]:
mol = Chem.MolFromSmiles('c1ccccc1')
bytea = memoryview(mol.ToBinary())
bytea

<memory at 0x7f9f8411c888>

In [13]:
subset = session.query(Compound)
subset = subset.filter(Compound.morgan.tanimoto_sml(bytea))
subset.count()
#type(subset)
#subset.count()
#type(subset)

2017-12-13 21:53:13,115 INFO sqlalchemy.engine.base.Engine SELECT count(*) AS count_1 
FROM (SELECT compounds.id AS compounds_id, compounds.name AS compounds_name, compounds.structure AS compounds_structure, compounds.atompair AS compounds_atompair, compounds.torsion AS compounds_torsion, compounds.morgan AS compounds_morgan 
FROM compounds 
WHERE compounds.morgan %% bfp_from_binary_text(%(morgan_1)s)) AS anon_1
2017-12-13 21:53:13,118 INFO sqlalchemy.engine.base.Engine {'morgan_1': <memory at 0x7f9f8411c888>}


0

In [14]:
subset = session.query(Compound)
subset = subset.filter(Compound.morgan.dice_sml(bytea))
subset.count()
#type(subset)
#subset.count()
#type(subset)

2017-12-13 21:54:06,223 INFO sqlalchemy.engine.base.Engine SELECT count(*) AS count_1 
FROM (SELECT compounds.id AS compounds_id, compounds.name AS compounds_name, compounds.structure AS compounds_structure, compounds.atompair AS compounds_atompair, compounds.torsion AS compounds_torsion, compounds.morgan AS compounds_morgan 
FROM compounds 
WHERE compounds.morgan # bfp_from_binary_text(%(morgan_1)s)) AS anon_1
2017-12-13 21:54:06,226 INFO sqlalchemy.engine.base.Engine {'morgan_1': <memory at 0x7f9f8411c888>}


0

In [None]:
session.s

In [None]:
Base.metadata.drop_all()