In [98]:
import os
import requests
import numpy as np
import pandas as pd
from rdkit import Chem
import Levenshtein as le
from multiprocessing import Pool
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem import AllChem, rdMolDescriptors

In [106]:
# Benchmark table: one row per image, with the reference SMILES ("GD") and each tool's output.
PATENT_RESULTS_CSV = "/data/guoliming/project/gitlabl_project/patent_benchmark/Patent_res.csv"
df = pd.read_csv(PATENT_RESULTS_CSV)
df.head()

Unnamed: 0,key,Img_path,GD,OCMR,Is_OCMR_true,MolVec,Is_MolVec_true,OSRA,Is_OSRA_true,Imago,...,Imago_TanimotoSimilarity,OCMR_edit_distance,MolVec_edit_distance,OSRA_edit_distance,Imago_edit_distance,len_gd,len_ocmr,len_molvec,len_osra,len_imago
0,P001,./images/P001.png,CN(C/C=C/C(=O)N1C[C@H](CC1)OC(=O)N1CCC(CC1)NC1...,CC1=NC2=C(C(C)C)C=NN2C(NC2CCN(C(=O)O[C@H]3CCN(...,1,CC1=NC2=C(C(C)C)C=NN2C(NC2CCN(C(=O)O[C@H]3CCN(...,1,C*(CCCC(N1CC[C@@H](C1)OC(=O)N1CCC(CC1)Nc1cc(C)...,0,Cc1cc(NC2CCN(C(=O)OC3CCN(C(=O)/C=C/CN(C)C)C3)C...,...,0.913581,0,0,42,4,71,71,73,64,65
1,P002,./images/P002.png,C(C)(C)C=1C=NN2C1N=C(C=C2NC2CCN(CC2)C(=O)O[C@H...,CC1=NC2=C(C(C)C)C=NN2C(NC2CCN(C(=O)O[C@@H]3CCN...,1,CC1=NC2=C(C(C)C)C=NN2C(CC2CCN(C(=O)O[C@@H]3CCN...,0,CN(C/C=C/C(=O)N1CC[C@H](C1)OC(=O)N1CCC(CC1)*c1...,0,Cc1cc(NC2CCN(C(=O)O[C@@H]3CCN(C(=O)/C=C/CN(C)C...,...,1.0,0,1,1,0,73,72,74,67,70
2,P003,./images/P003.png,Cl.CN(C/C=C/C(=O)N1C[C@H](CC1)OC(=O)N1CCC(CC1)...,CC1=NC2=C(C(C)C)C=NN2C(NC2CCN(C(=O)O[C@H]3CCN(...,0,CC1=NC2=C(C(C)C)C=NN2C(NC2CCN(C(=O)O[C@H]3CCN(...,0,CN(C/C=C/C(=O)N1CC[C@@H](C1)OC(=O)N1CCC(CC1)Nc...,0,Cc1cc(NC2CCN(C(=O)O[C@H]3CCN(C(=O)/C=C/CN(C)C)...,...,0.844373,2,2,2,2,76,71,73,68,69
3,P004,./images/P004.png,NC=1C=C(C=CC1)C1=NC(=NC=C1)NC1=CC=C(C=C1)OCCOC,COCCOC1=CC=C(NC2=NC=CC(C3=CC(N)=CC=C3)=N2)C=C1,1,COCCOC1=CC=C(NC2=NC=CC(C3=CC(N)=CC=C3)=N2)C=C1,1,C/[O]=C/COc1ccc(cc1)Nc1nccc(n1)C1=C[C](#CC=C1)N,0,C=C/C(N)=C\C=C/Cc1ccnc(NCc2ccc(CCCOC)cc2)n1,...,0.365738,0,0,38,26,46,46,46,47,43
4,P005,./images/P005.png,C(C)N(C(C)=O)C1=CC(=CC=C1)C1=NC(=NC=C1)NC1=CC=...,CCN(C(C)=O)C1=CC=CC(C2=NC(NC3=CC=C(O)C=C3)=NC=...,1,CCN(C(C)=O)C1=CC=CC(C2=NC(NC3=CC=C(O)C=C3)=NC=...,1,CCN(c1cccc(c1)c1ccnc(n1)Nc1ccc(cc1)O)C(=O)C,1,CCN(C(C)=O)c1cccc(-c2ccnc(NC3=CC=C(O)C(C)C3)n2)c1,...,0.698125,0,0,0,11,54,52,52,43,49


In [100]:
# TanimotoSimilarity
def calc_fingerprints(mols, fp_type="mg", radius=2, bit_size=2048):
    """Compute one bit-vector fingerprint per molecule.

    Args:
        mols: a single RDKit ``Mol`` or an iterable of them. A single molecule
            is wrapped in a one-element list.
        fp_type: fingerprint family -- "mg" (Morgan, chirality-aware),
            "rdk" (RDKit), "tt" (hashed topological torsion) or
            "ap" (hashed atom pair, chirality-aware).
        radius: Morgan radius; only used when ``fp_type == "mg"``.
        bit_size: number of bits in every fingerprint.

    Returns:
        A list of fingerprints, in the same order as ``mols``.

    Raises:
        ValueError: if ``fp_type`` is not one of the supported options.
    """
    # isinstance (rather than an exact type comparison) also accepts Mol subclasses.
    if isinstance(mols, Chem.rdchem.Mol):
        mols = [mols]

    if fp_type == "mg":
        fps = [AllChem.GetMorganFingerprintAsBitVect(m, radius, bit_size, useChirality=True) for m in mols]

    elif fp_type == "rdk":
        fps = [AllChem.RDKFingerprint(m, fpSize=bit_size) for m in mols]

    elif fp_type == "tt":
        fps = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(m, nBits=bit_size) for m in mols]

    elif fp_type == "ap":
        fps = [rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(m, nBits=bit_size, includeChirality=True) for m in mols]

    else:
        # ValueError is still caught by callers that handle the generic Exception.
        raise ValueError("Supported options for 'fp_type': 'mg'/'rdk'/'tt'/'ap'")

    return fps

def calc_matrix(fps1, fps2):
    """Build the pairwise Tanimoto similarity matrix between two fingerprint lists.

    The result has shape ``(len(fps2), len(fps1))``; entry ``[i, j]`` is the
    similarity between ``fps2[i]`` and ``fps1[j]``.
    """
    n_rows, n_cols = len(fps2), len(fps1)
    simi_matrix = np.zeros((n_rows, n_cols), dtype=float)

    # Every cell is filled explicitly, row by row.
    for row, fp_row in enumerate(fps2):
        for col, fp_col in enumerate(fps1):
            simi_matrix[row, col] = TanimotoSimilarity(fp_row, fp_col)
    return simi_matrix

def calc_fp_similarity(x, name, fp_types=("mg", "ap"), is_smi=True, scaffold=False):
    """Mean Tanimoto similarity between the reference ("GD") and one prediction.

    Args:
        x: row-like mapping that holds the reference under ``"GD"`` and the
            prediction under ``name``.
        name: key/column of the prediction to compare against ``x["GD"]``.
        fp_types: one or more of "mg" (Morgan), "rdk" (RDKit),
            "tt" (topological torsion), "ap" (atom pair); the similarities
            obtained with each type are averaged.
        is_smi: when True the two values are SMILES strings and are parsed
            first; when False they are used as molecules directly.
        scaffold: when True both molecules are reduced to their Murcko
            scaffold before fingerprinting.

    Returns:
        0 if either molecule is ``None`` (e.g. the SMILES could not be
        parsed), otherwise the mean similarity over ``fp_types``.
    """
    ref_m, prb_m = x["GD"], x[name]
    if is_smi:
        ref_m = Chem.MolFromSmiles(ref_m)
        prb_m = Chem.MolFromSmiles(prb_m)
    if ref_m is None or prb_m is None:
        return 0
    if scaffold:
        # Each molecule is reduced to its *own* scaffold; deriving both from
        # ref_m would compare the reference with itself.
        ref_m = MurckoScaffold.GetScaffoldForMol(ref_m)
        prb_m = MurckoScaffold.GetScaffoldForMol(prb_m)
    simi_values = []
    for fp_type in fp_types:
        ref_fp, prb_fp = calc_fingerprints([ref_m, prb_m], fp_type=fp_type, radius=2, bit_size=2048)
        # Only the reference-vs-probe value is needed, so compare the pair directly.
        simi_values.append(TanimotoSimilarity(ref_fp, prb_fp))
    return np.mean(simi_values)
    
# Missing values are replaced with the string "c" before scoring.
df = df.fillna("c")

# One similarity column per recognition tool, each scored against the "GD" column.
for similarity_col, tool in (
    ("OCMR_TanimotoSimilarity", "OCMR"),
    ("MolVec_TanimotoSimilarity", "MolVec"),
    ("OSRA_TanimotoSimilarity", "OSRA"),
    ("Imago_TanimotoSimilarity", "Imago"),
):
    df[similarity_col] = df.apply(calc_fp_similarity, name=tool, axis=1)


[11:54:53] Explicit valence for atom # 11 C, 5, is greater than permitted
[11:54:53] Explicit valence for atom # 18 C, 5, is greater than permitted
[11:54:53] Explicit valence for atom # 13 C, 5, is greater than permitted
[11:54:53] Explicit valence for atom # 12 C, 5, is greater than permitted
[11:54:53] Explicit valence for atom # 3 C, 5, is greater than permitted
[11:54:53] Explicit valence for atom # 18 F, 2, is greater than permitted
[11:54:53] Explicit valence for atom # 10 C, 5, is greater than permitted
[11:54:53] Explicit valence for atom # 1 C, 6, is greater than permitted
[11:54:53] Explicit valence for atom # 1 O, 7, is greater than permitted
[11:54:53] Explicit valence for atom # 1 C, 6, is greater than permitted
[11:54:53] Explicit valence for atom # 1 C, 6, is greater than permitted
[11:54:53] Explicit valence for atom # 2 C, 6, is greater than permitted
[11:54:53] Explicit valence for atom # 9 C, 5, is greater than permitted
[11:54:53] Explicit valence for atom # 8 C, 5

In [101]:
# Edit distance
def norm_func(smile_str):
    """Normalize a SMILES string for string-level comparison.

    The input is parsed and re-written as canonical isomeric SMILES, the
    ``/`` and ``\\`` bond-direction markers are stripped, and for a
    multi-fragment result only the longest fragment is kept (the first one
    on ties).

    Args:
        smile_str: SMILES string to normalize.

    Returns:
        The normalized SMILES, or ``""`` when the input cannot be parsed or
        converted.
    """
    try:
        mol = Chem.MolFromSmiles(smile_str)
        if mol is None:
            # The parser signals an invalid SMILES by returning None.
            return ""
        smile = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
    except Exception:
        # Best-effort: e.g. a non-string input. Unlike a bare ``except``,
        # this lets KeyboardInterrupt/SystemExit propagate.
        return ""

    smile = smile.replace("\\", "").replace("/", "")

    if "." in smile:
        smile = max(smile.split("."), key=len)
    return smile

def edit_distance(x, name):
    """Edit distance between the normalized "GD" entry and the normalized ``name`` entry of ``x``."""
    reference, predicted = (norm_func(x[column]) for column in ("GD", name))
    return le.distance(reference, predicted)

# One edit-distance column per recognition tool, each measured against the "GD" column.
for distance_col, tool in (
    ("OCMR_edit_distance", "OCMR"),
    ("MolVec_edit_distance", "MolVec"),
    ("OSRA_edit_distance", "OSRA"),
    ("Imago_edit_distance", "Imago"),
):
    df[distance_col] = df.apply(edit_distance, name=tool, axis=1)




[11:54:55] Explicit valence for atom # 11 C, 5, is greater than permitted
[11:54:55] Explicit valence for atom # 18 C, 5, is greater than permitted
[11:54:55] Explicit valence for atom # 13 C, 5, is greater than permitted
[11:54:55] Explicit valence for atom # 12 C, 5, is greater than permitted
[11:54:55] Explicit valence for atom # 3 C, 5, is greater than permitted
[11:54:55] Explicit valence for atom # 18 F, 2, is greater than permitted
[11:54:55] Explicit valence for atom # 10 C, 5, is greater than permitted
[11:54:55] Explicit valence for atom # 1 C, 6, is greater than permitted
[11:54:55] Explicit valence for atom # 1 O, 7, is greater than permitted
[11:54:55] Explicit valence for atom # 1 C, 6, is greater than permitted
[11:54:55] Explicit valence for atom # 1 C, 6, is greater than permitted
[11:54:55] Explicit valence for atom # 2 C, 6, is greater than permitted
[11:54:55] Explicit valence for atom # 9 C, 5, is greater than permitted
[11:54:55] Explicit valence for atom # 8 C, 5

In [102]:
def len_str(x, name):
    """Return the length of the ``name`` entry of row ``x``."""
    return len(x[name])

# Raw string length of the reference and of each tool's output.
for length_col, source_col in (
    ("len_gd", "GD"),
    ("len_ocmr", "OCMR"),
    ("len_molvec", "MolVec"),
    ("len_osra", "OSRA"),
    ("len_imago", "Imago"),
):
    df[length_col] = df.apply(len_str, name=source_col, axis=1)



In [103]:
# Average only the numeric metric columns; the frame also holds string columns
# (SMILES, paths), which cannot be averaged.
df.mean(numeric_only=True)

  """Entry point for launching an IPython kernel.


Is_OCMR_true                  0.900000
Is_MolVec_true                0.611538
Is_OSRA_true                  0.657692
Is_Imago_true                 0.230769
OCMR_TanimotoSimilarity       0.962395
MolVec_TanimotoSimilarity     0.805560
OSRA_TanimotoSimilarity       0.818060
Imago_TanimotoSimilarity      0.522837
OCMR_edit_distance            1.705769
MolVec_edit_distance         10.923077
OSRA_edit_distance            9.705769
Imago_edit_distance          28.067308
len_gd                       65.771154
len_ocmr                     64.578846
len_molvec                   66.792308
len_osra                     57.867308
len_imago                    41.305769
dtype: float64

In [104]:
# Write the table without the row index (``index`` is a boolean flag).
df.to_csv("Patent_res.csv", index=False)