In [1]:
import _pickle as cPickle
import gzip
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from collections import defaultdict
import random
import createFingerprintsReaction

In [2]:
# Directory (relative to the kernel's working directory) that holds the
# gzip-compressed pickle inputs; the fingerprint files generated by the cells
# below are written to the same directory.
dataDir = "../data/"

Create different transformation FPs (AP3, MG2 and TT) as SparseIntVect

In [3]:
# Read (smi, lbl, klass) records one by one from the input pickle stream, build
# three transformation fingerprints per reaction (AtomPairFP, MorganFP and
# TopologicalTorsion) and append (lbl, klass, fp_AP3, fp_MG2, fp_TT) to the
# output pickle stream.
#
# Both gzip handles are managed by `with` so the output is flushed and closed
# even if the loop is interrupted; an unclosed GzipFile can leave a truncated
# archive on disk.
#
# NOTE(security): unpickling can execute arbitrary code -- only run this on
# data files from a trusted source.
lineNo = 0
with gzip.open(dataDir+'training_test_set_patent_data.pkl.gz', 'rb') as infile:
    with gzip.open(dataDir+'transformationFPs_test_set_patent_data.pkl.gz','wb+') as pklfile:
        while True:
            # Counts every record read, including those skipped below.
            lineNo += 1
            try:
                smi,lbl,klass = cPickle.load(infile)
            except EOFError:
                # End of the pickle stream: no more records.
                break
            try:
                rxn = AllChem.ReactionFromSmarts(smi,useSmiles=True)
                fp_AP3 = createFingerprintsReaction.create_transformation_FP(rxn,AllChem.FingerprintType.AtomPairFP)
                fp_MG2 = createFingerprintsReaction.create_transformation_FP(rxn,AllChem.FingerprintType.MorganFP)
                fp_TT = createFingerprintsReaction.create_transformation_FP(rxn,AllChem.FingerprintType.TopologicalTorsion)
            except Exception:
                # Best effort: report and skip reactions that cannot be parsed or
                # fingerprinted. `Exception` (rather than a bare except) keeps
                # Ctrl-C / kernel interrupts working.
                print("Cannot build fingerprint/reaction of: %s\n"%smi)
                continue
            # Third argument selects pickle protocol 2.
            cPickle.dump((lbl,klass,fp_AP3, fp_MG2, fp_TT),pklfile,2)
            if not lineNo%5000:
                print("Done: %d"%lineNo)

Done: 5000
Done: 10000
Done: 15000
Done: 20000
Done: 25000
Done: 30000
Done: 35000
Done: 40000
Done: 45000
Done: 50000


In [4]:
# Sanity check: show the last record unpickled by the loop in the previous
# cell (smi, lbl and klass keep the values from its final iteration).
print(smi, lbl, klass)

# Bare expression so the notebook displays the most recently built reaction
# object as the cell's output.
rxn
# fp_AP3

[CH3:2][CH2:1][O:3][C:4](=[O:5])[C:6]1([C:14]#[N:15])[CH2:7][C:8]12[CH2:9][CH2:10][CH2:11][CH2:12][CH2:13]2.CCN(CC)CC>CCO>[CH3:2][CH2:1][O:3][C:4](=[O:5])[C:6]1([CH2:14][NH2:15])[CH2:7][C:8]12[CH2:9][CH2:10][CH2:11][CH2:12][CH2:13]2 US07030267 7.3.1


<rdkit.Chem.rdChemReactions.ChemicalReaction at 0x1d71f460990>

Combine AP3 fingerprint with agent feature and Morgan2 FPs

In [12]:
# Read (smi, lbl, klass) records from the input pickle stream and, for each
# reaction, store (lbl, klass, fp_AP3, fp_featureAgent, fp_MG2_agents): the
# AtomPairFP transformation fingerprint plus the agent feature fingerprint and
# the agent Morgan2 fingerprint.
#
# Both gzip handles are managed by `with` so the output is flushed and closed
# even if the loop is interrupted; an unclosed GzipFile can leave a truncated
# archive on disk.
#
# NOTE(security): unpickling can execute arbitrary code -- only run this on
# data files from a trusted source.
lineNo = 0
with gzip.open(dataDir+'training_test_set_patent_data.pkl.gz', 'rb') as infile:
    with gzip.open(dataDir+'transformationFPs_MG2_agentFPs_test_set_patent_data.pkl.gz','wb+') as pklfile:
        while True:
            # Counts every record read, including those skipped below.
            lineNo += 1
            try:
                smi,lbl,klass = cPickle.load(infile)
            except EOFError:
                # End of the pickle stream: no more records.
                break
            try:
                rxn = AllChem.ReactionFromSmarts(smi,useSmiles=True)
                fp_AP3 = createFingerprintsReaction.create_transformation_FP(rxn,AllChem.FingerprintType.AtomPairFP)
                fp_MG2_agents = createFingerprintsReaction.create_agent_morgan2_FP(rxn)
                if fp_MG2_agents is None:
                    # The helper returned nothing; substitute an empty sparse
                    # vector so every stored record has the same tuple shape.
                    # NOTE(review): 4096 presumably matches the length the helper
                    # uses for real agent fingerprints -- confirm.
                    fp_MG2_agents = DataStructs.UIntSparseIntVect(4096)
                fp_featureAgent = createFingerprintsReaction.create_agent_feature_FP(rxn)
            except Exception:
                # Best effort: report and skip reactions that cannot be parsed or
                # fingerprinted. `Exception` (rather than a bare except) keeps
                # Ctrl-C / kernel interrupts working.
                print("Cannot build fingerprint/reaction of: %s\n"%smi)
                continue
            # Third argument selects pickle protocol 2.
            cPickle.dump((lbl,klass,fp_AP3,fp_featureAgent,fp_MG2_agents),pklfile,2)
            if not lineNo%5000:
                print("Done: %d"%lineNo)

Done: 5000
Done: 10000
Done: 15000
Done: 20000
Done: 25000
Done: 30000
Done: 35000
Done: 40000
Done: 45000
Done: 50000


Create transformation FP (AP3 + agent featureFP) for external test set A

In [13]:
# Read (smi, lbl, klass) records from the input pickle stream and store
# (lbl, klass, fp_AP3, fp_featureAgent) for each reaction: the AtomPairFP
# transformation fingerprint plus the agent feature fingerprint.
#
# NOTE(review): the output is named "external_test_set_A", but the input is
# the same 'training_test_set_patent_data.pkl.gz' file used by the earlier
# cells. Confirm this is the intended source for external test set A.
#
# Both gzip handles are managed by `with` so the output is flushed and closed
# even if the loop is interrupted; an unclosed GzipFile can leave a truncated
# archive on disk.
#
# NOTE(security): unpickling can execute arbitrary code -- only run this on
# data files from a trusted source.
lineNo = 0
with gzip.open(dataDir+'training_test_set_patent_data.pkl.gz', 'rb') as infile:
    with gzip.open(dataDir+'transformationFPs_agentFPs_external_test_set_A.pkl.gz','wb+') as pklfile:
        while True:
            # Counts every record read, including those skipped below.
            lineNo += 1
            try:
                smi,lbl,klass = cPickle.load(infile)
            except EOFError:
                # End of the pickle stream: no more records.
                break
            try:
                rxn = AllChem.ReactionFromSmarts(smi,useSmiles=True)
                fp_AP3 = createFingerprintsReaction.create_transformation_FP(rxn,AllChem.FingerprintType.AtomPairFP)
                fp_featureAgent = createFingerprintsReaction.create_agent_feature_FP(rxn)
            except Exception:
                # Best effort: report and skip reactions that cannot be parsed or
                # fingerprinted. `Exception` (rather than a bare except) keeps
                # Ctrl-C / kernel interrupts working.
                print("Cannot build fingerprint/reaction of: %s\n"%smi)
                continue
            # Third argument selects pickle protocol 2.
            cPickle.dump((lbl,klass,fp_AP3,fp_featureAgent),pklfile,2)
            if not lineNo%5000:
                print("Done: %d"%lineNo)

Done: 5000
Done: 10000
Done: 15000
Done: 20000
Done: 25000
Done: 30000
Done: 35000
Done: 40000
Done: 45000
Done: 50000


Create transformation FP (AP3 + agent featureFP) for external test set B (unclassified reactions)

In [14]:
# Read three-field records from the unclassified-reactions pickle stream and
# store (lbl, smi, fp_AP3, fp_featureAgent) for each reaction: the AtomPairFP
# transformation fingerprint plus the agent feature fingerprint.
#
# Unlike the other cells, the reaction SMILES (smi) is stored in the second
# tuple slot instead of klass; klass is unpacked but not written out.
#
# Both gzip handles are managed by `with` so the output is flushed and closed
# even if the loop is interrupted; an unclosed GzipFile can leave a truncated
# archive on disk.
#
# NOTE(security): unpickling can execute arbitrary code -- only run this on
# data files from a trusted source.
lineNo = 0
with gzip.open(dataDir+'unclassified_reactions_patent_data.pkl.gz', 'rb') as infile:
    with gzip.open(dataDir+'transformationFPs_agentFPs_external_test_set_B.pkl.gz','wb+') as pklfile:
        while True:
            # Counts every record read, including those skipped below.
            lineNo += 1
            try:
                smi,lbl,klass = cPickle.load(infile)
            except EOFError:
                # End of the pickle stream: no more records.
                break
            try:
                rxn = AllChem.ReactionFromSmarts(smi,useSmiles=True)
                fp_AP3 = createFingerprintsReaction.create_transformation_FP(rxn,AllChem.FingerprintType.AtomPairFP)
                fp_featureAgent = createFingerprintsReaction.create_agent_feature_FP(rxn)
            except Exception:
                # Best effort: report and skip reactions that cannot be parsed or
                # fingerprinted. `Exception` (rather than a bare except) keeps
                # Ctrl-C / kernel interrupts working.
                print("Cannot build fingerprint/reaction of: %s\n"%smi)
                continue
            # Third argument selects pickle protocol 2.
            cPickle.dump((lbl,smi,fp_AP3,fp_featureAgent),pklfile,2)
            if not lineNo%5000:
                print("Done: %d"%lineNo)

Done: 5000
Done: 10000
Done: 15000
Done: 20000
Done: 25000
Done: 30000
Done: 35000
Done: 40000
Done: 45000
Done: 50000
