In [1]:
from sqlalchemy import create_engine
# Engine for the PostgreSQL database `postgres` on host `db`, port 5432,
# connecting as user `postgres`.
# NOTE(review): later cells use an RDKit `Mol` column type and a GiST index on it,
# so this database presumably needs the RDKit cartridge installed — confirm.
engine = create_engine('postgresql://postgres@db:5432/postgres')

In [2]:
from sqlalchemy.orm import sessionmaker
# Session factory bound to `engine`; calling Session() creates a new ORM session.
Session = sessionmaker(bind=engine)

In [3]:
from sqlalchemy.ext.declarative import declarative_base
# Declarative base class for the mapped models. Its metadata is bound to `engine`,
# so metadata operations can run without an explicit engine argument.
# NOTE(review): binding metadata this way is deprecated in SQLAlchemy 1.4 and
# removed in 2.0 — confirm the installed version before upgrading.
Base = declarative_base(bind=engine)

In [4]:
from sqlalchemy import Column, Index, Integer, String
from razi.rdkit_postgresql.types import Mol

class Compound(Base):
    """ORM mapping for one row of the ``compounds`` table.

    Each row has an integer primary key, a string name and a structure
    stored in a ``Mol``-typed column.
    """

    __tablename__ = 'compounds'

    # GiST index over the structure column.
    __table_args__ = (
        Index(
            'compounds_structure',
            'structure',
            postgresql_using='gist',
        ),
    )

    id = Column(Integer, primary_key=True)
    name = Column(String)
    structure = Column(Mol)

    def __init__(self, name, structure):
        """Store the compound's name and its structure."""
        self.name = name
        self.structure = structure

    def __repr__(self):
        """Return the compound formatted as ``(<name>) < <structure> >``."""
        fields = (self.name, self.structure)
        return '(%s) < %s >' % fields

In [5]:
# Create the tables and indexes for every model registered on Base.
# The engine is passed explicitly: relying on metadata that was bound through
# declarative_base(bind=...) is deprecated in SQLAlchemy 1.4 and removed in 2.0,
# whereas an explicit engine argument works on every version.
Base.metadata.create_all(engine)

In [6]:
# Peek at the header line and the first two records of the tab-separated ChEMBL file.
!head -n3 chembl_23_chemreps.txt

chembl_id	canonical_smiles	standard_inchi	standard_inchi_key
CHEMBL153534	Cc1cc(cn1C)c2csc(N=C(N)N)n2	InChI=1S/C10H13N5S/c1-6-3-7(4-15(6)2)8-5-16-10(13-8)14-9(11)12/h3-5H,1-2H3,(H4,11,12,13,14)	MFRNFCWYPYSFQQ-UHFFFAOYSA-N
CHEMBL440060	CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@@H](N)CCSC)[C@@H](C)O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N[C@@H](CC(=O)N)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)NCC(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N2CCC[C@H]2C(=O)N3CCC[C@H]3C(=O)NCC(=O)N[C@@H](CO)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N	InChI=1S/C123H212N44O34S/c1-19-63(12)96(164-115(196)81(47-62(10)11)163-119(200)97(68(17)169)165-103(184)70(124)36-42-202-18)118(199)143-52-92(175)147-65(14)100(181)149-67(16)102(183)157-82(48-69-50-136-57-145-69)114(195)162-83(49-90(128)173)106(187)141-51-91(174)146-64(

In [7]:
from collections import namedtuple
# Row type for the ChEMBL chemreps file: one field per column, in file order.
Record = namedtuple(
    'Record',
    ['chembl_id', 'smiles', 'inchi', 'inchi_key'],
)

In [8]:
import csv
from rdkit import Chem

def read_chembldb(filepath, limit=0):
    """Yield usable compounds from a tab-separated ChEMBL chemreps file.

    The first line of the file is treated as a header and skipped. Every
    following line is parsed into a ``Record``; records whose SMILES is
    longer than 300 characters, or that RDKit fails to parse, are skipped.

    Args:
        filepath: Path of the chemreps file to read.
        limit: Maximum number of data lines to read from the file. Skipped
            records count towards the limit. ``0`` (the default) reads the
            whole file.

    Yields:
        ``(count, chembl_id, smiles)`` tuples, where ``count`` is the
        1-based position of the record among the data lines and ``smiles``
        is the SMILES string after the ``N#N`` rewrites applied below.
    """
    # newline='' is what the csv module asks for when it is given a file object.
    with open(filepath, 'rt', newline='') as inputfile:
        reader = csv.reader(inputfile, delimiter='\t', skipinitialspace=True)
        # Skip the header. The default keeps an empty file from raising
        # StopIteration inside the generator (a RuntimeError under PEP 479).
        next(reader, None)

        for count, record in enumerate(map(Record._make, reader), 1):
            smiles = record.smiles

            # skip problematic smiles
            if len(smiles) <= 300:
                # Rewrite the `=N#N` / `N#N=` notation into the explicitly
                # charged form before handing the string to RDKit.
                smiles = smiles.replace('=N#N', '=[N+]=[N-]')
                smiles = smiles.replace('N#N=', '[N-]=[N+]=')
                if Chem.MolFromSmiles(smiles):
                    yield count, record.chembl_id, smiles

            # Checked for every record, skipped or not. Previously the check
            # only ran after a yield, so a skipped record at position `limit`
            # let the loop run on to the end of the file.
            if count == limit:
                break

In [9]:
# Load the compounds yielded by read_chembldb (called with a limit of 25000)
# in a single transaction. `session` is left open on purpose: the cells below
# run their queries through it.
session = Session()
try:
    for count, chembl_id, smiles in read_chembldb('chembl_23_chemreps.txt', 25000):
        compound = Compound(chembl_id, smiles)
        session.add(compound)
    session.commit()
except Exception:
    # Undo the partial load so the session is usable again, then surface the
    # original error instead of hiding it.
    session.rollback()
    raise

In [10]:
# Number of rows in the compounds table.
# NOTE(review): this can be lower than the limit of 25000 passed to read_chembldb,
# because that reader skips records it considers problematic.
session.query(Compound).count()

24637

In [11]:
# Print the first five compounds returned by the query; each line is the
# Compound repr, i.e. `(<name>) < <structure> >`.
# NOTE(review): no explicit ordering is requested, so the row order is presumably
# whatever the database happens to return — confirm if a stable order matters.
for compound in session.query(Compound)[:5]:
    print(compound)

(CHEMBL153534) < <rdkit.Chem.rdchem.Mol object at 0x7f9a336ae960> >
(CHEMBL405398) < <rdkit.Chem.rdchem.Mol object at 0x7f9a336ae5a8> >
(CHEMBL503634) < <rdkit.Chem.rdchem.Mol object at 0x7f9a336ae3b0> >
(CHEMBL503643) < <rdkit.Chem.rdchem.Mol object at 0x7f9a336ae4c8> >
(CHEMBL503865) < <rdkit.Chem.rdchem.Mol object at 0x7f9a336ae570> >


In [12]:
# Restrict the compounds query with the `hassubstruct` operator on the
# structure column, print how many rows match, then print at most five of them.
subset = session.query(Compound).filter(
    Compound.structure.hassubstruct('c1cccc2c1nncc2')
)
print(subset.count())
for compound in subset[:5]:
    print(compound)

2
(CHEMBL26025) < <rdkit.Chem.rdchem.Mol object at 0x7f9a33db13e8> >
(CHEMBL12112) < <rdkit.Chem.rdchem.Mol object at 0x7f9a33db1298> >
