In [1]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://postgres@db:5432/postgres', echo=False)

In [2]:
from sqlalchemy.orm import sessionmaker
Session = sessionmaker(bind=engine)

In [3]:
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base(bind=engine)

In [4]:
from sqlalchemy import Column, Index, Integer, String
from razi.rdkit_postgresql.types import Mol, Bfp
from razi.rdkit_postgresql.functions import atompairbv_fp, torsionbv_fp, morganbv_fp
from rdkit import Chem


class Compound(Base):
    """ORM mapping for one row of the ``compounds`` table.

    A row holds a display name, the chemical structure in a ``Mol`` column,
    and three ``Bfp`` fingerprint columns (atom-pair, torsion, Morgan) that
    are derived from that structure when the object is constructed.
    """

    __tablename__ = 'compounds'

    id = Column(Integer, primary_key=True)
    name = Column(String)
    structure = Column(Mol)
    atompair = Column(Bfp)
    torsion = Column(Bfp)
    morgan = Column(Bfp)

    # GiST index on the structure column.
    # NOTE(review): presumably this backs the substructure queries -- confirm.
    __table_args__ = (
        Index('compounds_structure', 'structure',
              postgresql_using='gist'),
        )

    def __init__(self, name, structure):
        """Store the name and structure and derive the three fingerprints.

        Args:
            name: label stored in the ``name`` column.
            structure: value stored in the ``structure`` column; the loader
                cell in this notebook passes SMILES strings.
        """
        self.name = name
        self.structure = structure
        # The fingerprint functions receive the structure exactly as it was
        # passed in; no client-side RDKit parsing is needed for that.
        # NOTE(review): presumably razi turns these calls into SQL function
        # expressions that the database evaluates on INSERT -- confirm.
        self.atompair = atompairbv_fp(structure)
        self.torsion = torsionbv_fp(structure)
        # NOTE(review): the second argument is presumably the Morgan radius.
        self.morgan = morganbv_fp(structure, 2)

    def __repr__(self):
        """Return ``(name) < structure >``.

        An RDKit ``Chem.Mol`` structure is rendered through
        ``Chem.MolToSmiles``; any other value is formatted as-is.
        """
        if isinstance(self.structure, Chem.Mol):
            return '(%s) < %s >' % (self.name, Chem.MolToSmiles(self.structure))
        return '(%s) < %s >' % (self.name, self.structure)

In [5]:
# Create the tables declared on Base. The engine is passed explicitly so the
# call does not depend on the metadata being implicitly bound to an engine.
Base.metadata.create_all(engine)

In [6]:
!head -n3 chembl_23_chemreps.txt

chembl_id	canonical_smiles	standard_inchi	standard_inchi_key
CHEMBL153534	Cc1cc(cn1C)c2csc(N=C(N)N)n2	InChI=1S/C10H13N5S/c1-6-3-7(4-15(6)2)8-5-16-10(13-8)14-9(11)12/h3-5H,1-2H3,(H4,11,12,13,14)	MFRNFCWYPYSFQQ-UHFFFAOYSA-N
CHEMBL440060	CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@@H](N)CCSC)[C@@H](C)O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N[C@@H](CC(=O)N)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)NCC(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N2CCC[C@H]2C(=O)N3CCC[C@H]3C(=O)NCC(=O)N[C@@H](CO)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N	InChI=1S/C123H212N44O34S/c1-19-63(12)96(164-115(196)81(47-62(10)11)163-119(200)97(68(17)169)165-103(184)70(124)36-42-202-18)118(199)143-52-92(175)147-65(14)100(181)149-67(16)102(183)157-82(48-69-50-136-57-145-69)114(195)162-83(49-90(128)173)106(187)141-51-91(174)146-64(

In [7]:
from collections import namedtuple
Record = namedtuple('Record', 'chembl_id, smiles, inchi, inchi_key')

In [8]:
import csv
from rdkit import Chem

def read_chembldb(filepath, limit=0):
    """Yield the usable compounds from a tab-separated ChEMBL chemreps file.

    Args:
        filepath: path to the TSV file; its first line is treated as a
            header and skipped.
        limit: number of data rows to examine before stopping. A value no
            row number can equal (such as the default 0) reads the whole
            file.

    Yields:
        ``(count, chembl_id, smiles)`` tuples. ``count`` is the 1-based
        position of the row among the data rows (skipped rows still consume
        a number) and ``smiles`` is the SMILES after the diazo rewrite below.
    """
    with open(filepath, 'rt') as inputfile:
        reader = csv.reader(inputfile, delimiter='\t', skipinitialspace=True)
        # Skip the header. The default keeps an empty file from raising
        # StopIteration inside this generator, which Python would surface
        # as a RuntimeError.
        next(reader, None)

        for count, record in enumerate(map(Record._make, reader), 1):
            smiles = record.smiles

            # Rows with very long SMILES are treated as problematic and
            # skipped.
            if len(smiles) <= 300:
                # Rewrite the two diazo spellings into their charge-separated
                # form before asking RDKit to parse the string.
                smiles = smiles.replace('=N#N', '=[N+]=[N-]')
                smiles = smiles.replace('N#N=', '[N-]=[N+]=')
                # Only rows RDKit can parse are passed on.
                if Chem.MolFromSmiles(smiles):
                    yield count, record.chembl_id, smiles

            # Checked for every row, including skipped ones, so a rejected
            # row sitting exactly at the limit cannot let the loop run on to
            # the end of the file.
            if count == limit:
                break

In [9]:
# Insert the usable compounds found in the first 500 rows of the ChEMBL dump.
# NOTE: rows are added unconditionally, so re-running this cell inserts the
# same compounds a second time.
session = Session()
try:
    for _, chembl_id, smiles in read_chembldb('chembl_23_chemreps.txt', 500):
        session.add(Compound(chembl_id, smiles))
    session.commit()
except Exception:
    # Undo the partial batch so the session is not left in a failed
    # transaction, then let the error surface in the notebook.
    session.rollback()
    raise
finally:
    # Release the session's resources once the load is finished.
    session.close()

In [10]:
session = Session()

In [11]:
session.query(Compound).count()

439

In [12]:
# Print the first five stored compounds.
for compound in session.query(Compound).limit(5):
    print(compound)

(CHEMBL153534) < Cc1cc(-c2csc(N=C(N)N)n2)cn1C >
(CHEMBL405398) < Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1 >
(CHEMBL503634) < COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O >
(CHEMBL503643) < CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1 >
(CHEMBL503865) < CC(=O)OC1C(C)=CC2OC(=O)C3(C)OC23C(OC(C)=O)C2C(C)(O)C(O)C=CC2(C)C(OC(C)=O)C1OC(C)=O >


In [13]:
# Substructure search: count the compounds whose structure contains the
# query 'c1ccccc1', then print the first five hits.
subset = session.query(Compound).filter(
    Compound.structure.hassubstruct('c1ccccc1'))
print(subset.count())
for compound in subset.limit(5):
    print(compound)

319
(CHEMBL405398) < Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1 >
(CHEMBL503634) < COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O >
(CHEMBL503643) < CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1 >
(CHEMBL503870) < CC(NC(=O)OCc1ccccc1)C(=O)NC(C)C(=O)NN(CC(N)=O)C(=O)C=CC(=O)N(Cc1ccco1)Cc1ccco1 >
(CHEMBL503873) < COC1CC(COCC2C(C)OC(OC3CCC4(C)C(=CCC5(O)C4CC(OC(=O)C=Cc4ccccc4)C4(C)C(O)(C(C)=O)CCC54O)C3)CC2OC)OC(C)C1COCC1CC(OC)C(COCC2CC(OC)C(OC3OC(CO)C(O)C(O)C3O)C(C)O2)C(C)O1 >


In [14]:
# Count the compounds whose stored Morgan fingerprint satisfies the Dice
# similarity operator against the query fingerprint.
fp = morganbv_fp('COC(=O)Nc1nc2cc(C(=O)Nc3cccc(C)n3)ccc2[nH]1')
subset = session.query(Compound).filter(Compound.morgan.dice_sml(fp))
print(subset.count())

0


In [15]:
# Count the compounds whose stored Morgan fingerprint satisfies the Tanimoto
# similarity operator against the query fingerprint.
fp = morganbv_fp('COC(=O)Nc1nc2cc(C(=O)Nc3cccc(C)n3)ccc2[nH]1')
subset = session.query(Compound).filter(Compound.morgan.tanimoto_sml(fp))
print(subset.count())

0


In [None]:
from sqlalchemy import desc


# Rank the compounds that pass the Tanimoto operator by their similarity to
# the query fingerprint.
# NOTE(review): this cell has no execution count; confirm that the Bfp column
# really exposes `tanimoto_similar` before relying on it.
fp = morganbv_fp('COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1')
tanimoto_similar = Compound.morgan.tanimoto_similar(fp).label('tanimoto')
constraint = Compound.morgan.tanimoto_sml(fp)
# Most similar first, matching the ordering used by the Dice ranking cell
# (and putting the `desc` imported at the top of this cell to use).
rs = (
    session.query(Compound, tanimoto_similar)
    .filter(constraint)
    .order_by(desc(tanimoto_similar))
)
print(rs.count())
for i in rs:
    print(i)

In [None]:
from sqlalchemy import desc


# Rank the compounds that pass the Dice operator by their similarity to the
# query fingerprint, most similar first.
fp = morganbv_fp('COC(=O)Nc1nc2ccc(C(=O)c3cccs3)cc2[nH]1')
dice_similar = Compound.morgan.dice_similar(fp).label('dice')
constraint = Compound.morgan.dice_sml(fp)
ranked = (
    session.query(Compound, dice_similar)
    .filter(constraint)
    .order_by(desc(dice_similar))
)
print(ranked.count())
for row in ranked:
    print(row)

In [17]:
session.scalar('show rdkit.tanimoto_threshold')

'0.5'

In [18]:
session.scalar('show rdkit.dice_threshold')

'0.5'

In [19]:
session.execute('set rdkit.tanimoto_threshold=0.4')

<sqlalchemy.engine.result.ResultProxy at 0x7f57ba86fc88>

In [20]:
session.scalar('show rdkit.tanimoto_threshold')

'0.4'

In [21]:
# Re-run the fingerprint filter after the threshold change made above.
# NOTE(review): the preceding cells change rdkit.tanimoto_threshold, but this
# query filters with dice_sml -- confirm whether tanimoto_sml was intended.
# `fp` is whatever an earlier cell last assigned; it is not defined here.
subset = session.query(Compound)
subset = subset.filter(Compound.morgan.dice_sml(fp))
subset.count()

0

In [22]:
# Substructure search for 'c1ncncn1'; list the ids of the hits, highest id
# first.
constraint = Compound.structure.hassubstruct('c1ncncn1')
subset = (
    session.query(Compound)
    .filter(constraint)
    .order_by(Compound.id.desc())
)
print(subset.count())
for compound in subset:
    print(compound.id)

1
306


In [23]:
# Reverse search: print the stored compounds selected by `issubstruct`
# against the larger query molecule.
# NOTE(review): presumably these are compounds that are substructures of the
# query -- confirm against the razi documentation.
constraint = Compound.structure.issubstruct('c1ccc(C(COC(c2c(=O)oc3c(ccc(O)c3)c2)=O)=O)cc1')
matches = session.query(Compound).filter(constraint)
for compound in matches:
    print(compound)

In [24]:
compound_name = session.query(Compound.name).all()
compound_name[:5]

[('CHEMBL153534'),
 ('CHEMBL405398'),
 ('CHEMBL503634'),
 ('CHEMBL503643'),
 ('CHEMBL503865')]

In [25]:
compound_name = session.query(Compound).filter(Compound.name=='CHEMBL153534').all()
compound_name

[(CHEMBL153534) < Cc1cc(-c2csc(N=C(N)N)n2)cn1C >]

In [26]:
session.query(Compound).limit(10).all()

[(CHEMBL153534) < Cc1cc(-c2csc(N=C(N)N)n2)cn1C >,
 (CHEMBL405398) < Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1 >,
 (CHEMBL503634) < COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O >,
 (CHEMBL503643) < CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1 >,
 (CHEMBL503865) < CC(=O)OC1C(C)=CC2OC(=O)C3(C)OC23C(OC(C)=O)C2C(C)(O)C(O)C=CC2(C)C(OC(C)=O)C1OC(C)=O >,
 (CHEMBL503870) < CC(NC(=O)OCc1ccccc1)C(=O)NC(C)C(=O)NN(CC(N)=O)C(=O)C=CC(=O)N(Cc1ccco1)Cc1ccco1 >,
 (CHEMBL503873) < COC1CC(COCC2C(C)OC(OC3CCC4(C)C(=CCC5(O)C4CC(OC(=O)C=Cc4ccccc4)C4(C)C(O)(C(C)=O)CCC54O)C3)CC2OC)OC(C)C1COCC1CC(OC)C(COCC2CC(OC)C(OC3OC(CO)C(O)C(O)C3O)C(C)O2)C(C)O1 >,
 (CHEMBL1082532) < Oc1ccc(N=C(Cc2ccc(Cl)cc2)c2ccc(O)c(O)c2O)cc1 >,
 (CHEMBL504077) < COc1ccc2c(c1OC)C(CC1(C)C=Cc3c(c4cccc(OC)c4n(C)c3=O)O1)N(C)c1c-2ccc2cc3c(cc12)OCO3 >,
 (CHEMBL501567) < CC(C)CC(NC(=O)C(Cc1cnc[nH]1)NC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)C1CCCN1C(=O)C(N)CS)C(=O)NC(CC(C)C)C(=O)N1CCCC1C(=O)NC(Cc1ccccc1)C(=O)NC(CS)C(=O)O >]

In [27]:
session.query(Compound).order_by(Compound.name).limit(10).all()

[(CHEMBL1082532) < Oc1ccc(N=C(Cc2ccc(Cl)cc2)c2ccc(O)c(O)c2O)cc1 >,
 (CHEMBL1163216) < CN(C)CC(=O)Nc1ccc(C(=O)Nc2cc(C(=O)NCCn3nc4c5c(cccc53)C(=O)c3c(Cl)cccc3-4)n(C)c2)n1C >,
 (CHEMBL1163233) < COc1ccccc1C(=O)OC1CCC2(C)C(CCC3(C)C4CCC(C5(O)CC(C=C(C)C)OC5=O)C4CCC32)C1(C)C >,
 (CHEMBL1163234) < CC(C)(C)OC(=O)NC(CC(=O)OCc1ccccc1)C(=O)NC(COCc1ccccc1)C(=O)NCC(=O)OCc1ccccc1 >,
 (CHEMBL1163427) < CCCC1C(C(=O)NC(C)c2ccccc2)=CN(CC(=O)OC(C)C)C=C1C(=O)NC(Cc1ccccc1)C(O)CNC1CC1 >,
 (CHEMBL1203109) < CNC(=O)c1cc(C(O)CNC(C)CCc2ccc3c(c2)OCO3)ccc1O.Cl >,
 (CHEMBL1203132) < COc1ccc(OCC2=NC(c3ccccc3)c3ccccc3CN2C)cc1.Cl >,
 (CHEMBL1203140) < COc1ccccc1N1CCN(CC(O)c2ccc(O)c(C(N)=O)c2)CC1.Cl >,
 (CHEMBL1203155) < CC1CC(=O)NN=C1c1ccc(NC(=O)CCNCC(O)COc2cccc3[nH]ccc23)cc1.Cl >,
 (CHEMBL1203199) < CC(NC(=O)C1Cc2c(sc3ccccc23)CN1)c1ccccc1.Cl >]

In [28]:
# Show the SQL that SQLAlchemy generates for a lookup by compound name.
constraint = session.query(Compound).filter(Compound.name == 'CHEMBL153534')
print(constraint.statement)

SELECT compounds.id, compounds.name, mol_to_pkl(compounds.structure) AS structure, bfp_to_binary_text(compounds.atompair) AS atompair, bfp_to_binary_text(compounds.torsion) AS torsion, bfp_to_binary_text(compounds.morgan) AS morgan 
FROM compounds 
WHERE compounds.name = %(name_1)s


In [29]:
session.query(Compound).first()

(CHEMBL153534) < Cc1cc(-c2csc(N=C(N)N)n2)cn1C >

In [30]:
from razi.rdkit_postgresql import functions

In [31]:
session.query(functions.mol_amw(Compound.structure).label('MW')).limit(10).all()

[(235.316),
 (429.322),
 (289.287),
 (233.223),
 (580.583),
 (622.635),
 (1333.61),
 (369.804),
 (618.686),
 (1115.39)]

In [32]:
session.query(functions.tanimoto_sml(Compound.morgan, morganbv_fp('c1ccccc1', 2))).limit(10).all()

[(0.0263157894736842),
 (0.037037037037037),
 (0.0285714285714286),
 (0.0571428571428571),
 (0.0188679245283019),
 (0.05),
 (0.0306122448979592),
 (0.0277777777777778),
 (0.0273972602739726),
 (0.0365853658536585)]