In [1]:
import numpy as np
import pandas as pd
from rdkit import DataStructs
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors
from sklearn.datasets import dump_svmlight_file

In [2]:
# Load the input table from the CSV under ../output (the file name suggests cleaned
# ChEMBL bioactivity records for SIRT1); every later cell derives from `df`.
df = pd.read_csv("../output/Chembl_bioactivity_SIRT1_cleaned.csv")

In [3]:
"""
feature
* ECFP4
* rdkit descriptor
"""
def get_desc_names():
    """Return the names of the RDKit descriptors used as features.

    Walks ``Descriptors.descList`` in order and keeps every descriptor name
    that does not contain the substring ``"Charge"``.

    Returns
    -------
    list of str
        Descriptor names, in ``Descriptors.descList`` order.
    """
    # NOTE(review): the reason for excluding "Charge" descriptors is not stated
    # in this notebook -- confirm with the author.
    return [name for name, _func in Descriptors.descList if "Charge" not in name]

def get_feature_names():
    """Return every feature column name: fingerprint bits, then descriptors.

    Returns
    -------
    list of str
        ``"ECFP4_1"`` .. ``"ECFP4_2048"`` followed by the names returned by
        ``get_desc_names()``.
    """
    names = []
    # Bit columns are numbered from 1, not 0.
    for bit_index in range(2048):
        names.append("ECFP4_{}".format(bit_index + 1))
    names.extend(get_desc_names())
    return names

def smiles2feature(smiles, desc_names):
    """Build the feature dict (ECFP4 bits + RDKit descriptors) for one molecule.

    Parameters
    ----------
    smiles : str
        SMILES string, parsed with ``Chem.MolFromSmiles``.
    desc_names : iterable of str
        Names of the ``Descriptors.descList`` entries to compute. Entries of
        ``descList`` whose name is not listed here are skipped.

    Returns
    -------
    dict
        ``"ECFP4_<i>"`` (``i`` starting at 1) mapped to the integer bit value,
        followed by descriptor name mapped to the descriptor value.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with the offending input instead of an opaque argument error
        # from the fingerprint call below.
        raise ValueError("Could not parse SMILES: {!r}".format(smiles))

    # ECFP4
    ecfp4 = Chem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    # The destination array is passed in and filled by ConvertToNumpyArray; it is
    # expected to be resized in place to the fingerprint length -- see RDKit docs.
    ecfp4_arr = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(ecfp4, ecfp4_arr)
    ret = {"ECFP4_{}".format(i + 1): int(bit) for i, bit in enumerate(ecfp4_arr)}

    # Descriptor
    # Use the caller-supplied names rather than recomputing the list on every call.
    desc_name_set = set(desc_names)
    for desc_name, desc_func in Descriptors.descList:
        if desc_name in desc_name_set:
            ret[desc_name] = desc_func(mol)

    return ret

In [4]:
# Display the column labels of the loaded frame.
df.columns

Index(['CMPD_CHEMBLID', 'DOC_CHEMBLID', 'CANONICAL_SMILES', 'STANDARD_TYPE',
       'RELATION', 'STANDARD_VALUE', 'STANDARD_UNITS', 'relevance', 'qid'],
      dtype='object')

In [5]:
# Compute one feature dict per row, keyed by compound id. Rows that share a
# CMPD_CHEMBLID overwrite each other, so the last occurrence supplies the value.
desc_names = get_desc_names()
feature_data_dict = {
    chembl_id: smiles2feature(smiles, desc_names)
    for chembl_id, smiles in zip(df["CMPD_CHEMBLID"], df["CANONICAL_SMILES"])
}

In [6]:
# One row per compound id (the dict keys become the index); the id is also
# copied into a regular column so it can be used as a merge key.
df_feature = (
    pd.DataFrame.from_dict(feature_data_dict, orient="index")
    .assign(CMPD_CHEMBLID=lambda frame: frame.index)
)

In [7]:
# Attach the per-compound features to every bioactivity row (default inner join
# on the compound id).
df_merged = df.merge(df_feature, on="CMPD_CHEMBLID")

In [8]:
# Preview the first rows of the merged frame (original columns followed by the features).
df_merged.head()

Unnamed: 0,CMPD_CHEMBLID,DOC_CHEMBLID,CANONICAL_SMILES,STANDARD_TYPE,RELATION,STANDARD_VALUE,STANDARD_UNITS,relevance,qid,ECFP4_1,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CHEMBL3311063,CHEMBL3351989,CCOc1ccccc1c2nn3c(nnc3s2)c4occc4,IC50,=,23600.0,nM,4.627088,1154,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL3233748,CHEMBL3232806,CC(C)c1ccc(cc1)C2=C(Cc3c(O)ccc4nc(Cl)ccc34)C(=...,IC50,=,21000.0,nM,4.677781,119,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL3233748,CHEMBL3232806,CC(C)c1ccc(cc1)C2=C(Cc3c(O)ccc4nc(Cl)ccc34)C(=...,Inhibition,=,85.0,%,85.0,129,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL3408982,CHEMBL3407343,CCCCCCCCn1c2CCN(CCO)Cc2c3cc(ccc13)c4cnc(N)nc4,IC50,,,,0.0,1148,0,...,0,0,0,0,0,0,0,0,4,0
4,CHEMBL1140,CHEMBL3390834,NC(=O)c1cccnc1,IC50,=,25000.0,nM,4.60206,1118,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Order the rows by query id before export -- presumably so that rows of the
# same query are contiguous in the svmlight file written later (confirm).
df_merged_sorted_by_qid = df_merged.sort_values("qid")

In [10]:
# Extract the arrays for the svmlight export from the qid-sorted frame.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# .values returns the same ndarray and works on old and new pandas alike.
feature_names = get_feature_names()
X = df_merged_sorted_by_qid[feature_names].values                  # (n_rows, n_features)
ys = df_merged_sorted_by_qid[["relevance"]].values.flatten()       # (n_rows,)
qid = df_merged_sorted_by_qid[["qid"]].values                      # (n_rows, 1)

In [11]:
# Sanity check: all three arrays must have the same number of rows.
print(X.shape, ys.shape, qid.shape)

(1131, 2240) (1131,) (1131, 1)


In [12]:
# Write features, relevance labels and query ids in svmlight format.
# query_id is documented as a 1-D array of shape (n_samples,), while `qid` is a
# column vector; ravel() passes the flat view (and is a no-op if qid is already 1-D).
dump_svmlight_file(X, ys, "notebook_test.svmlight", query_id=qid.ravel())

In [13]:
# Number of distinct (non-null) query ids in the exported data.
df_merged_sorted_by_qid["qid"].nunique()

92

In [22]:
# Display the column labels of the merged frame (original columns plus feature columns).
df_merged.columns

Index(['CMPD_CHEMBLID', 'DOC_CHEMBLID', 'CANONICAL_SMILES', 'STANDARD_TYPE',
       'RELATION', 'STANDARD_VALUE', 'STANDARD_UNITS', 'relevance', 'qid',
       'ECFP4_1',
       ...
       'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene',
       'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene',
       'fr_unbrch_alkane', 'fr_urea'],
      dtype='object', length=2249)

In [23]:
# Number of feature names, i.e. the names used to select the columns of X.
len(feature_names)

2240

In [26]:
# Summary statistics of a single descriptor column as a spot check of the features.
df_merged["fr_urea"].describe()

count    1131.000000
mean        0.022989
std         0.149933
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: fr_urea, dtype: float64