In [1]:
# Installing numpy version 1.23. This version is specifically desired for calculating descriptors using Mordred.
!pip install numpy==1.23



In [2]:
import numpy
# Display the NumPy version the kernel actually imported, to confirm the pin above took effect.
numpy.__version__

'1.23.0'

In [3]:
# Installing rdkit and mordred
!pip install rdkit==2023.09.3
!pip install mordred



In [4]:
# Importing modules
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, AllChem

import mordred
from mordred import Calculator, descriptors

import pandas as pd
from tqdm import tqdm

Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


In [19]:
# Loading the raw dataset containing SMILES and Hammett's Sigma constant values.
# 'data.csv' is a relative path, so it is resolved against the current working directory.
df = pd.read_csv('data.csv')

## **Data Generation**

In [20]:
# Preview the first rows of the raw data (SMILES and Sigma columns).
df.head()

Unnamed: 0,SMILES,Sigma
0,O=C(O)c1cccc(B(F)F)c1,0.32
1,O=C(O)c1cccc(Br)c1,0.39
2,O=C(O)c1cccc([Ge](Br)(Br)Br)c1,0.66
3,O=C(O)c1cccc([Si](Br)(Br)Br)c1,0.48
4,O=C(O)c1cccc(Cl)c1,0.37


In [21]:
# Generating column containing molecular objects (explicit hydrogens added).
mol_list = []
invalid_smiles = []

for smile in df['SMILES']:
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None instead of
        # raising; collect the offenders so the error below can name them.
        invalid_smiles.append(smile)
        continue
    mol_list.append(Chem.AddHs(mol))

if invalid_smiles:
    raise ValueError(
        f"{len(invalid_smiles)} SMILES could not be parsed by RDKit "
        f"(first few: {invalid_smiles[:5]})"
    )

# assign() overwrites an existing 'Mol' column, so re-running this cell does
# not append a second, duplicate 'Mol' column the way pd.concat(axis=1) would.
df = df.assign(Mol=pd.Series(mol_list, index=df.index, dtype=object))

In [25]:
# Creating a descriptor calculator from the full Mordred descriptor set;
# ignore_3D=True leaves the 3D descriptors out, so only 2D descriptors are calculated.
calc_2d = Calculator(descriptors, ignore_3D=True)

In [26]:
# Preview the frame again to confirm the 'Mol' column was added.
df.head()

Unnamed: 0,SMILES,Sigma,Mol
0,O=C(O)c1cccc(B(F)F)c1,0.32,<rdkit.Chem.rdchem.Mol object at 0x7a03a01d5e00>
1,O=C(O)c1cccc(Br)c1,0.39,<rdkit.Chem.rdchem.Mol object at 0x7a03a01d4900>
2,O=C(O)c1cccc([Ge](Br)(Br)Br)c1,0.66,<rdkit.Chem.rdchem.Mol object at 0x7a03a01d5af0>
3,O=C(O)c1cccc([Si](Br)(Br)Br)c1,0.48,<rdkit.Chem.rdchem.Mol object at 0x7a03a01d47b0>
4,O=C(O)c1cccc(Cl)c1,0.37,<rdkit.Chem.rdchem.Mol object at 0x7a03a01d5310>


In [27]:

# Iterate through molecules and calculate the descriptors for each one
# (tqdm shows progress over the molecules).
d = [calc_2d(mol) for mol in tqdm(df['Mol'])]

# Column names are taken from the calculator itself instead of from the last
# loop result, so they do not depend on a leaked loop variable and are still
# defined when df has no rows.
descriptor_names = [str(desc) for desc in calc_2d.descriptors]

final_df_2d = pd.concat([df, pd.DataFrame(d, columns=descriptor_names)], axis=1)

100%|██████████| 982/982 [01:46<00:00,  9.21it/s]


In [28]:
# Preview the combined frame: the original columns followed by the descriptor columns.
final_df_2d.head()

Unnamed: 0,SMILES,Sigma,Mol,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,O=C(O)c1cccc(B(F)F)c1,0.32,<rdkit.Chem.rdchem.Mol object at 0x7a03a01d5e00>,8.84196,8.356358,1,0,14.379234,2.288246,4.576491,...,9.089866,42.251151,170.035066,10.002063,197,15,56.0,62.0,5.444444,2.722222
1,O=C(O)c1cccc(Br)c1,0.39,<rdkit.Chem.rdchem.Mol object at 0x7a03a01d4900>,7.358797,6.989468,1,0,11.908678,2.242838,4.485676,...,8.830689,39.160939,199.947292,13.329819,117,11,46.0,50.0,4.333333,2.277778
2,O=C(O)c1cccc([Ge](Br)(Br)Br)c1,0.66,<rdkit.Chem.rdchem.Mol object at 0x7a03a01d5af0>,9.785874,9.203562,1,0,14.978073,2.344507,4.689013,...,9.380674,44.214989,431.705144,23.983619,240,17,64.0,71.0,6.395833,2.777778
3,O=C(O)c1cccc([Si](Br)(Br)Br)c1,0.48,<rdkit.Chem.rdchem.Mol object at 0x7a03a01d47b0>,9.785874,9.203562,1,0,14.978073,2.344507,4.689013,...,9.380674,44.214989,385.760892,21.431161,240,17,64.0,71.0,6.395833,2.777778
4,O=C(O)c1cccc(Cl)c1,0.37,<rdkit.Chem.rdchem.Mol object at 0x7a03a01d5310>,7.358797,6.989468,1,0,11.908678,2.242838,4.485676,...,8.830689,39.160939,155.997807,10.399854,117,11,46.0,50.0,4.333333,2.277778


In [29]:
# The RDKit Mol objects were only needed to compute the descriptors; drop the column.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second run is a no-op rather than a KeyError.
final_df_2d = final_df_2d.drop(columns='Mol', errors='ignore')

# **Data Preprocessing**

In [30]:
# Keep only the descriptor columns: the target (Sigma) and the SMILES string are set aside.
final_df_2d_desc = final_df_2d.drop(columns=['Sigma', 'SMILES'])

In [32]:
# Checking for NaN values: total count of missing cells over the whole descriptor frame.
# NOTE(review): this only counts NaN/None. Descriptors that could not be computed may be
# stored as non-numeric objects rather than NaN, which this check would not see — TODO confirm.
final_df_2d_desc.isna().sum().sum()

0

In [33]:
# Preview the descriptor-only frame (Sigma and SMILES removed).
final_df_2d_desc.head()

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,8.84196,8.356358,1,0,14.379234,2.288246,4.576491,14.379234,1.19827,3.37579,...,9.089866,42.251151,170.035066,10.002063,197,15,56.0,62.0,5.444444,2.722222
1,7.358797,6.989468,1,0,11.908678,2.242838,4.485676,11.908678,1.190868,3.197246,...,8.830689,39.160939,199.947292,13.329819,117,11,46.0,50.0,4.333333,2.277778
2,9.785874,9.203562,1,0,14.978073,2.344507,4.689013,14.978073,1.152159,3.4646,...,9.380674,44.214989,431.705144,23.983619,240,17,64.0,71.0,6.395833,2.777778
3,9.785874,9.203562,1,0,14.978073,2.344507,4.689013,14.978073,1.152159,3.4646,...,9.380674,44.214989,385.760892,21.431161,240,17,64.0,71.0,6.395833,2.777778
4,7.358797,6.989468,1,0,11.908678,2.242838,4.485676,11.908678,1.190868,3.197246,...,8.830689,39.160939,155.997807,10.399854,117,11,46.0,50.0,4.333333,2.277778


In [34]:
# Grabbing only the columns containing numerical and boolean values (True/False).
# Object-typed columns end up in neither list; every column that is neither
# object nor bool is treated as numerical.
bool_cols = [
    name
    for name, kind in final_df_2d_desc.dtypes.items()
    if kind == 'bool'
]
num_cols = [
    name
    for name, kind in final_df_2d_desc.dtypes.items()
    if not (kind == 'object' or kind == 'bool')
]

In [35]:
# (rows, columns) of the descriptor frame before the dtype-based column selection.
final_df_2d_desc.shape

(982, 1613)

In [36]:
# Creating a dataset containing only numerical and boolean columns.
# Column order is all numerical columns first, followed by the boolean ones.
final_df_2d_desc_ = final_df_2d_desc[num_cols + bool_cols]

In [37]:
# (rows, columns) after keeping only the numerical and boolean columns.
final_df_2d_desc_.shape

(982, 1336)

In [38]:
# Defining a function that finds the constant (zero-variance) columns of the dataset
def remove_constant_values(data):
    """Return the names of the columns of ``data`` that carry no information.

    Despite its name, this function does not modify ``data``; it only reports
    which columns the caller should drop.

    A column is reported when it has at most one distinct non-null value. That
    covers constant columns and also columns that are entirely NaN, which have
    zero distinct values and would otherwise slip through an ``== 1`` test.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Column labels, in the order they appear in ``data``.
    """
    distinct_counts = data.nunique()  # per-column count of distinct non-null values
    return [name for name, count in distinct_counts.items() if count <= 1]

# Names of the constant columns reported by remove_constant_values.
drop_col = remove_constant_values(final_df_2d_desc_)

# Every remaining column, kept in its original order.
new_df_columns = final_df_2d_desc_.columns[
    ~final_df_2d_desc_.columns.isin(drop_col)
].tolist()
new_df = final_df_2d_desc_.loc[:, new_df_columns]
new_df

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2,Lipinski,GhoseFilter
0,8.841960,8.356358,1,0,14.379234,2.288246,4.576491,14.379234,1.198270,3.375790,...,170.035066,10.002063,197,15,56.0,62.0,5.444444,2.722222,True,False
1,7.358797,6.989468,1,0,11.908678,2.242838,4.485676,11.908678,1.190868,3.197246,...,199.947292,13.329819,117,11,46.0,50.0,4.333333,2.277778,True,False
2,9.785874,9.203562,1,0,14.978073,2.344507,4.689013,14.978073,1.152159,3.464600,...,431.705144,23.983619,240,17,64.0,71.0,6.395833,2.777778,True,False
3,9.785874,9.203562,1,0,14.978073,2.344507,4.689013,14.978073,1.152159,3.464600,...,385.760892,21.431161,240,17,64.0,71.0,6.395833,2.777778,True,False
4,7.358797,6.989468,1,0,11.908678,2.242838,4.485676,11.908678,1.190868,3.197246,...,155.997807,10.399854,117,11,46.0,50.0,4.333333,2.277778,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,22.620928,17.887780,1,0,38.870645,2.530735,5.061470,38.870645,1.340367,4.300253,...,397.123166,8.104554,2034,49,152.0,181.0,7.729167,6.486111,False,True
978,21.852211,17.533757,1,0,37.448307,2.565975,5.131949,37.448307,1.337440,4.269147,...,380.123256,7.919235,1767,50,148.0,179.0,7.479167,6.277778,True,True
979,21.852211,17.533757,1,0,37.448307,2.565975,5.131949,37.448307,1.337440,4.269147,...,472.048525,9.834344,1767,50,148.0,179.0,7.479167,6.277778,True,True
980,21.852211,17.533757,1,0,37.448307,2.565975,5.131949,37.448307,1.337440,4.269147,...,364.146330,7.586382,1767,50,148.0,179.0,7.479167,6.277778,False,False


In [54]:
# Loading the dataset containing quantum chemical descriptors.
# 'data_QC.csv' is a relative path, so it is resolved against the current working directory.
df_2 = pd.read_csv('data_QC.csv')

In [60]:
# Preview the first two rows of the quantum chemical descriptor table.
df_2.head(2)

Unnamed: 0,SMILES,Sigma,NBO_CC,NBO_SC,E(HOMO),E(LUMO)
0,O=C(O)c1cccc(B(F)F)c1,0.32,-0.183,-0.489,-0.28817,-0.08233
1,O=C(O)c1cccc(Br)c1,0.39,-0.162,-0.13,-0.26324,-0.07616


In [61]:
# Concatenating the two datasets, one with quantum chemical descriptors and the other with Mordred descriptors.
# new_df no longer carries SMILES, so the rows are paired purely by position/index.
# Verify that both tables describe the same molecules in the same order first:
# a mismatch would otherwise silently attach descriptors to the wrong molecule.
if len(df_2) != len(new_df):
    raise ValueError(
        f"Row count mismatch: data_QC.csv has {len(df_2)} rows, "
        f"Mordred descriptors have {len(new_df)} rows"
    )
if df_2['SMILES'].tolist() != df['SMILES'].tolist():
    raise ValueError(
        "SMILES in data_QC.csv do not match the SMILES used for the Mordred "
        "descriptors row by row; the two tables cannot be concatenated positionally"
    )

new_df_ = pd.concat([df_2, new_df], axis=1)
# Dropping SMILES and Sigma columns for further processing
new_df_final = new_df_.drop(['SMILES', 'Sigma'], axis=1)

In [63]:
# (rows, columns) of the combined feature table before correlation filtering.
new_df_final.shape

(982, 1210)

In [64]:
# To calculate correlation and find the highly correlated columns
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``DataFrame.corr()``. A column is flagged when the absolute correlation
    with at least one column positioned before it is strictly greater than
    ``threshold``. The first column of each correlated pair is never flagged
    on account of that pair.

    The function only reports the names; it does not modify ``dataset``.

    NOTE(review): a column is compared against every earlier column, including
    ones that are themselves flagged, so a chain A~B~C flags both B and C even
    if A and C are not correlated with each other.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table.
    threshold : float
        Absolute correlation above which a column is flagged.

    Returns
    -------
    set
        Labels of the flagged columns.
    """
    corr_matrix = dataset.corr()
    # Vectorized equivalent of looping over every (i, j) pair with j < i:
    # NaN correlations compare as False, exactly like abs(nan) > threshold.
    exceeds = numpy.abs(corr_matrix.to_numpy()) > threshold
    # k=-1 keeps only the strictly-lower triangle, i.e. "earlier" columns.
    flagged = numpy.tril(exceeds, k=-1).any(axis=1)
    return set(corr_matrix.columns[flagged])

In [65]:
# Dropping highly correlated features (absolute correlation above 0.80)
corr_features = correlation(new_df_final, 0.80)
print("No. of features to drop : ", len(corr_features))

new_df_final.drop(columns=list(corr_features), inplace=True)

No. of features to drop :  927


In [66]:
# (rows, columns) after the highly correlated features were dropped.
new_df_final.shape

(982, 283)

In [68]:
# Preview the remaining features; Lipinski and GhoseFilter are still boolean here.
new_df_final.head(2)

Unnamed: 0,NBO_CC,NBO_SC,E(HOMO),ABC,nAcid,nBase,SpMAD_A,nAromAtom,nBridgehead,nHetero,...,JGI3,JGI4,JGI5,JGI6,JGI7,JGI8,JGI9,TopoShapeIndex,Lipinski,GhoseFilter
0,-0.183,-0.489,-0.28817,8.84196,1,0,1.19827,6,0,5,...,0.05,0.049293,0.059028,0.0,0.0,0.0,0.0,1.0,True,False
1,-0.162,-0.13,-0.26324,7.358797,1,0,1.190868,6,0,3,...,0.056818,0.066032,0.03125,0.0,0.0,0.0,0.0,0.666667,True,False


In [70]:
# Converting boolean descriptors to numbers 0s and 1s.
# The boolean columns are found by dtype rather than by hard-coded name, so the
# cell keeps working if the correlation filter removed one of them.
bool_feature_cols = new_df_final.select_dtypes(include='bool').columns
new_df_final = new_df_final.astype({col: int for col in bool_feature_cols})

In [71]:
# Preview again to confirm Lipinski and GhoseFilter now hold 0/1 integers.
new_df_final.head(2)

Unnamed: 0,NBO_CC,NBO_SC,E(HOMO),ABC,nAcid,nBase,SpMAD_A,nAromAtom,nBridgehead,nHetero,...,JGI3,JGI4,JGI5,JGI6,JGI7,JGI8,JGI9,TopoShapeIndex,Lipinski,GhoseFilter
0,-0.183,-0.489,-0.28817,8.84196,1,0,1.19827,6,0,5,...,0.05,0.049293,0.059028,0.0,0.0,0.0,0.0,1.0,1,0
1,-0.162,-0.13,-0.26324,7.358797,1,0,1.190868,6,0,3,...,0.056818,0.066032,0.03125,0.0,0.0,0.0,0.0,0.666667,1,0


In [72]:
# The dtype conversion must not change the shape of the table.
new_df_final.shape

(982, 283)

In [73]:
# Look at df_2 again: its SMILES and Sigma columns are re-attached to the features next.
df_2.head()

Unnamed: 0,SMILES,Sigma,NBO_CC,NBO_SC,E(HOMO),E(LUMO)
0,O=C(O)c1cccc(B(F)F)c1,0.32,-0.183,-0.489,-0.28817,-0.08233
1,O=C(O)c1cccc(Br)c1,0.39,-0.162,-0.13,-0.26324,-0.07616
2,O=C(O)c1cccc([Ge](Br)(Br)Br)c1,0.66,-0.169,-0.515,-0.29236,-0.09336
3,O=C(O)c1cccc([Si](Br)(Br)Br)c1,0.48,-0.173,-0.577,-0.28562,-0.08182
4,O=C(O)c1cccc(Cl)c1,0.37,-0.162,-0.059,-0.26772,-0.07613


In [74]:
# Put the SMILES and Sigma columns (taken from df_2) in front of the selected features.
# concat(axis=1) pairs rows by index; the "283" in the name is the feature count
# reported by new_df_final.shape.
new_df_final_283 = pd.concat([df_2[['SMILES', 'Sigma']], new_df_final], axis=1)

In [75]:
# Preview the final table: SMILES, Sigma, then the selected features.
new_df_final_283.head(2)

Unnamed: 0,SMILES,Sigma,NBO_CC,NBO_SC,E(HOMO),ABC,nAcid,nBase,SpMAD_A,nAromAtom,...,JGI3,JGI4,JGI5,JGI6,JGI7,JGI8,JGI9,TopoShapeIndex,Lipinski,GhoseFilter
0,O=C(O)c1cccc(B(F)F)c1,0.32,-0.183,-0.489,-0.28817,8.84196,1,0,1.19827,6,...,0.05,0.049293,0.059028,0.0,0.0,0.0,0.0,1.0,1,0
1,O=C(O)c1cccc(Br)c1,0.39,-0.162,-0.13,-0.26324,7.358797,1,0,1.190868,6,...,0.056818,0.066032,0.03125,0.0,0.0,0.0,0.0,0.666667,1,0


In [77]:
# (rows, columns) of the final table: the selected features plus SMILES and Sigma.
new_df_final_283.shape

(982, 285)

In [76]:
# Saving the final dataset as final_data.csv (the row index is not written).
# index=False is the documented way to omit the index; index=None only worked
# because None happens to be falsy.
new_df_final_283.to_csv('final_data.csv', index=False)