In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from rdkit import Chem 
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
import rdkit.Chem.rdMolDescriptors as d
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import Normalizer, StandardScaler

# 1. Read Training Dataset and Test Dataset

In [10]:
# Load the training set; later cells read its INDEX, SMILES and ACTIVE columns.
training_df= pd.read_csv("training_smiles.csv")

In [11]:
# Load the test set; later cells read its INDEX and SMILES columns.
test_df= pd.read_csv("test_smiles.csv")

# 2. Convert SMILES Strings into Molecule Objects

In [12]:
# Parse every training SMILES string into an RDKit molecule, stored in a new 'mol' column.
training_df['mol'] = training_df['SMILES'].apply(Chem.MolFromSmiles)



In [13]:
# Parse every test SMILES string into an RDKit molecule, stored in a new 'mol' column.
test_df['mol'] = test_df['SMILES'].apply(Chem.MolFromSmiles)



# 3. Extract Molecule Descriptor Features (with Feature Selection Algorithms)

In [14]:
def descriptor_features(df):
    """Append RDKit descriptor columns to ``df`` in place.

    ``df`` must contain a ``mol`` column of RDKit molecule objects. That
    column is first overwritten with ``Chem.AddHs`` applied to each molecule,
    and every descriptor below is computed from the hydrogen-added molecules.

    Columns added, in this order:
    ``num_of_atoms``, ``num_of_heavy_atoms``,
    ``num_of_C_atoms``, ``num_of_O_atoms``, ``num_of_N_atoms``,
    ``num_of_Cl_atoms`` (number of substructure matches of the single-atom
    SMILES pattern), ``tpsa``, ``mol_weight``, ``num_valence_electrons``,
    ``num_heteroatoms``, ``logp``, ``num_rings``, ``num_rotate_bonds``,
    ``num_h_acceptors``, ``num_h_donors``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``mol`` column; it is mutated, so pass a copy if the
        original must stay untouched.

    Returns
    -------
    None
    """
    df['mol'] = df['mol'].apply(Chem.AddHs)
    df['num_of_atoms'] = df['mol'].apply(lambda mol: mol.GetNumAtoms())
    df['num_of_heavy_atoms'] = df['mol'].apply(lambda mol: mol.GetNumHeavyAtoms())

    for symbol in ['C', 'O', 'N', 'Cl']:
        # Build the query molecule once per element instead of re-parsing the
        # SMILES pattern for every row of the frame.
        pattern = Chem.MolFromSmiles(symbol)
        df['num_of_{}_atoms'.format(symbol)] = df['mol'].apply(
            lambda mol, pattern=pattern: len(mol.GetSubstructMatches(pattern))
        )

    # Column name -> RDKit function taking a molecule. Dict order is the
    # column order, which downstream feature selection sees via X.columns.
    molecule_descriptors = {
        'tpsa': Descriptors.TPSA,
        'mol_weight': Descriptors.ExactMolWt,
        'num_valence_electrons': Descriptors.NumValenceElectrons,
        'num_heteroatoms': Descriptors.NumHeteroatoms,
        'logp': Descriptors.MolLogP,
        'num_rings': d.CalcNumRings,
        'num_rotate_bonds': d.CalcNumRotatableBonds,
        'num_h_acceptors': d.CalcNumHBA,
        'num_h_donors': d.CalcNumHBD,
    }
    for column, compute in molecule_descriptors.items():
        df[column] = df['mol'].apply(compute)

def feature_selection(X, y, k=1, random_state=None):
    """Return the names of the ``k`` most important columns of ``X``.

    An ``ExtraTreesClassifier`` with 50 trees is fitted on ``(X, y)`` and the
    columns are ranked by its ``feature_importances_``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; column labels are what gets returned.
    y : array-like
        Class labels aligned with the rows of ``X``.
    k : int, default 1
        Number of columns to keep. ``0`` yields an empty selection; values
        larger than the number of columns return every column.
    random_state : int or None, default None
        Forwarded to the classifier. Pass an integer to make the selection
        reproducible between runs; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    pandas.Index
        Selected column labels ordered by ascending importance (the most
        important feature is last).

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got {}".format(k))
    if k == 0:
        # idx[-0:] would be idx[0:], i.e. *all* features, so answer directly.
        return X.columns[:0]

    clf = ExtraTreesClassifier(n_estimators=50, random_state=random_state)
    clf.fit(X, y)
    idx = np.argsort(clf.feature_importances_)
    return X.columns[idx[-k:]]

In [15]:
# descriptor_features overwrites the 'mol' column and adds columns in place,
# so run it on a copy and leave training_df as loaded for the fingerprint step.
Descriptor_train = training_df.copy()
descriptor_features(Descriptor_train)

In [16]:
# Candidate features: every column except the identifiers, the molecule object and the label.
X = Descriptor_train.drop(["INDEX", "SMILES", "mol", "ACTIVE"], axis=1)
y = Descriptor_train["ACTIVE"].to_numpy()
# Keep the ten descriptors the tree ensemble ranks highest.
select_features = feature_selection(X, y, k=10)

In [17]:
# Training descriptor set: the label first, followed by the selected descriptor columns.
DL_train = pd.concat(
    [Descriptor_train[["ACTIVE"]], Descriptor_train[select_features]],
    axis="columns",
)

In [18]:
DL_train

Unnamed: 0,ACTIVE,num_of_O_atoms,num_of_N_atoms,num_of_C_atoms,num_h_acceptors,num_valence_electrons,num_rotate_bonds,num_of_atoms,mol_weight,tpsa,logp
0,0.0,1,4,14,4,98,5,35,256.132411,75.01,1.48330
1,0.0,4,2,23,7,152,10,50,420.114378,83.67,4.80230
2,0.0,4,2,21,5,138,8,47,364.142307,89.12,2.84420
3,0.0,4,3,18,5,144,5,53,381.172227,78.95,1.52200
4,0.0,2,1,18,3,110,9,42,283.157229,30.49,4.43310
...,...,...,...,...,...,...,...,...,...,...,...
156253,0.0,4,5,16,7,138,6,38,434.997844,115.56,3.09810
156254,0.0,6,3,29,7,206,13,72,551.209007,106.20,5.15540
156255,0.0,1,4,18,4,120,3,45,310.179361,52.23,1.58080
156256,0.0,3,3,16,5,110,5,35,295.095691,80.63,3.00692


In [19]:
# descriptor_features overwrites the 'mol' column and adds columns in place,
# so run it on a copy and leave test_df as loaded for the fingerprint step.
Descriptor_test = test_df.copy()
descriptor_features(Descriptor_test)

In [20]:
# Reuse the columns selected on the training data so both sets share the same features.
DL_test = Descriptor_test[select_features]

In [21]:
DL_test

Unnamed: 0,num_of_O_atoms,num_of_N_atoms,num_of_C_atoms,num_h_acceptors,num_valence_electrons,num_rotate_bonds,num_of_atoms,mol_weight,tpsa,logp
0,1,5,17,6,120,9,44,311.174610,64.86,2.88074
1,1,4,17,7,130,9,43,390.064274,66.91,4.36854
2,1,2,17,3,98,2,34,262.110613,42.85,3.30650
3,4,3,16,5,122,5,42,317.137556,108.29,1.61028
4,5,2,17,5,138,5,43,396.054670,88.85,2.60580
...,...,...,...,...,...,...,...,...,...,...
52081,2,3,11,5,90,5,30,251.072848,66.96,1.85710
52082,2,1,11,3,90,8,31,263.047984,41.49,2.34420
52083,6,1,26,6,172,10,60,449.183838,98.00,3.61240
52084,4,4,17,5,148,5,43,396.104540,113.06,0.62510


# 4. Extract Morgan Fingerprint Features (vectors)

In [22]:
# One row per training molecule, one column per bit of a radius-1, 124-bit Morgan fingerprint.
training_fp = np.zeros((len(training_df.index), 124))
for row, mol in enumerate(training_df['mol']):
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, 1, nBits=124)
    training_fp[row] = np.array(bit_vector)
print(training_fp)

[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 1. ... 0. 1. 0.]
 [0. 0. 1. ... 0. 1. 0.]]


In [23]:
# Wrap the fingerprint matrix in a frame whose columns are named fp0, fp1, ...
training_fp_df = pd.DataFrame(
    training_fp,
    columns=[f"fp{bit}" for bit in range(training_fp.shape[1])],
)

In [24]:
# Attach the fingerprint bits to the training rows, then discard the identifier
# and molecule columns so only the label and the fp* features remain.
Morgan_train = (
    pd.concat([training_df, training_fp_df], axis="columns")
    .drop(columns=["INDEX", "SMILES", "mol"])
)

In [25]:
Morgan_train

Unnamed: 0,ACTIVE,fp0,fp1,fp2,fp3,fp4,fp5,fp6,fp7,fp8,...,fp114,fp115,fp116,fp117,fp118,fp119,fp120,fp121,fp122,fp123
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156253,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
156254,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
156255,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
156256,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [26]:
# One row per test molecule, one column per bit of a radius-1, 124-bit Morgan fingerprint.
test_fp = np.zeros((len(test_df.index), 124))
for row, mol in enumerate(test_df['mol']):
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, 1, nBits=124)
    test_fp[row] = np.array(bit_vector)
print(test_fp)

[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 1. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


In [27]:
# Wrap the fingerprint matrix in a frame whose columns are named fp0, fp1, ...
test_fp_df = pd.DataFrame(
    test_fp,
    columns=[f"fp{bit}" for bit in range(test_fp.shape[1])],
)

In [28]:
# Attach the fingerprint bits to the test rows, then discard the identifier
# and molecule columns so only the fp* features remain.
Morgan_test = (
    pd.concat([test_df, test_fp_df], axis="columns")
    .drop(columns=["INDEX", "SMILES", "mol"])
)

In [29]:
Morgan_test

Unnamed: 0,fp0,fp1,fp2,fp3,fp4,fp5,fp6,fp7,fp8,fp9,...,fp114,fp115,fp116,fp117,fp118,fp119,fp120,fp121,fp122,fp123
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52081,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
52082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
52083,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
52084,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0


# 5. Feature Combination

In [30]:
# Descriptors and fingerprints side by side. Both inputs carry ACTIVE, so only
# the first occurrence of each column label is kept.
Both_train = (
    pd.concat([DL_train, Morgan_train], axis="columns")
    .pipe(lambda frame: frame.loc[:, ~frame.columns.duplicated()])
    .copy()
)

In [31]:
Both_train

Unnamed: 0,ACTIVE,num_of_O_atoms,num_of_N_atoms,num_of_C_atoms,num_h_acceptors,num_valence_electrons,num_rotate_bonds,num_of_atoms,mol_weight,tpsa,...,fp114,fp115,fp116,fp117,fp118,fp119,fp120,fp121,fp122,fp123
0,0.0,1,4,14,4,98,5,35,256.132411,75.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,4,2,23,7,152,10,50,420.114378,83.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,4,2,21,5,138,8,47,364.142307,89.12,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,4,3,18,5,144,5,53,381.172227,78.95,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,2,1,18,3,110,9,42,283.157229,30.49,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156253,0.0,4,5,16,7,138,6,38,434.997844,115.56,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
156254,0.0,6,3,29,7,206,13,72,551.209007,106.20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
156255,0.0,1,4,18,4,120,3,45,310.179361,52.23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
156256,0.0,3,3,16,5,110,5,35,295.095691,80.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [32]:
# Descriptors and fingerprints side by side for the test set. Unlike the
# training frames, neither input carries a label column, so no de-duplication
# step follows.
Both_test = pd.concat([DL_test, Morgan_test], axis = 1)
Both_test

Unnamed: 0,num_of_O_atoms,num_of_N_atoms,num_of_C_atoms,num_h_acceptors,num_valence_electrons,num_rotate_bonds,num_of_atoms,mol_weight,tpsa,logp,...,fp114,fp115,fp116,fp117,fp118,fp119,fp120,fp121,fp122,fp123
0,1,5,17,6,120,9,44,311.174610,64.86,2.88074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,4,17,7,130,9,43,390.064274,66.91,4.36854,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,1,2,17,3,98,2,34,262.110613,42.85,3.30650,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,3,16,5,122,5,42,317.137556,108.29,1.61028,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5,2,17,5,138,5,43,396.054670,88.85,2.60580,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52081,2,3,11,5,90,5,30,251.072848,66.96,1.85710,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
52082,2,1,11,3,90,8,31,263.047984,41.49,2.34420,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
52083,6,1,26,6,172,10,60,449.183838,98.00,3.61240,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
52084,4,4,17,5,148,5,43,396.104540,113.06,0.62510,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0


# 6. Save Feature Sets into CSV Files

In [33]:
# Default to_csv settings: the row index is written as an unnamed first column.
DL_train.to_csv("DL_train.csv")

In [34]:
# Default to_csv settings: the row index is written as an unnamed first column.
DL_test.to_csv("DL_test.csv")

In [35]:
# Default to_csv settings: the row index is written as an unnamed first column.
Morgan_train.to_csv("Morgan_train.csv")

In [36]:
# Default to_csv settings: the row index is written as an unnamed first column.
Morgan_test.to_csv("Morgan_test.csv")

In [37]:
# Default to_csv settings: the row index is written as an unnamed first column.
Both_train.to_csv("Both_train.csv")

In [38]:
# Default to_csv settings: the row index is written as an unnamed first column.
Both_test.to_csv("Both_test.csv")