# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Installation Of RDKit

In [1]:
pip install rdkit

Note: you may need to restart the kernel to use updated packages.


## Data Preprocessing

In [2]:
import numpy as np 
import pandas as pd
train_data = pd.read_csv('C:/Users/RAMZAN KHAN/Toxicity Prediction/train_II.csv')
test_data = pd.read_csv('C:/Users/RAMZAN KHAN/Toxicity Prediction/test_II.csv')

In [3]:
train_data[["Id", "Assay"]] = train_data["Id"].str.split(";", expand=True)
train_data.head()

Unnamed: 0,Id,Expected,Assay
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,2,1644
1,CCCCCCCCC(=O)C,2,2451
2,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],2,1384
3,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,2,16
4,[Na+].[I-],2,1856


In [4]:
test_data[["Id","Assay"]] = test_data["x"].str.split(";", expand=True)
test_data.head()

Unnamed: 0,x,Id,Assay
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30


## Generated RDKit Features

In [5]:
import pandas as pd
import numpy as np
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

def generate_rdkit_features(smiles):
    # Convert SMILES string to RDKit molecule object
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Return NaN if SMILES string is invalid
        return [np.nan]*265
    
    # Calculate additional RDKit descriptors
    MaxEStateIndex= Descriptors.MaxEStateIndex(mol)
    MinEStateIndex= Descriptors.MinEStateIndex(mol)
    MaxAbsEStateIndex= Descriptors.MaxAbsEStateIndex(mol)
    MinAbsEStateIndex= Descriptors.MinAbsEStateIndex(mol)
    qed= Descriptors.qed(mol)
    MolWt= Descriptors.MolWt(mol)
    HeavyAtomMolWt= Descriptors.HeavyAtomMolWt(mol)
    ExactMolWt= Descriptors.ExactMolWt(mol)
    NumValenceElectrons= Descriptors.NumValenceElectrons(mol)
    NumRadicalElectrons= Descriptors.NumRadicalElectrons(mol)
    MaxPartialCharge= Descriptors.MaxPartialCharge(mol)
    MinPartialCharge= Descriptors.MinPartialCharge(mol)
    MaxAbsPartialCharge= Descriptors.MaxAbsPartialCharge(mol)
    MinAbsPartialCharge= Descriptors.MinAbsPartialCharge(mol)
    FpDensityMorgan1= Descriptors.FpDensityMorgan1(mol)
    FpDensityMorgan2= Descriptors.FpDensityMorgan2(mol)
    FpDensityMorgan3= Descriptors.FpDensityMorgan3(mol)
    BalabanJ= Descriptors.BalabanJ(mol)
    BertzCT= Descriptors.BertzCT(mol)
    Chi0= Descriptors.Chi0(mol)
    Chi0n= Descriptors.Chi0n(mol)
    Chi0v= Descriptors.Chi0v(mol)
    Chi1= Descriptors.Chi1(mol)
    Chi1n= Descriptors.Chi1n(mol)
    Chi1v= Descriptors.Chi1v(mol)
    Chi2n= Descriptors.Chi2n(mol)
    Chi2v= Descriptors.Chi2v(mol)
    Chi3n= Descriptors.Chi3n(mol)
    Chi3v= Descriptors.Chi3v(mol)
    Chi4n= Descriptors.Chi4n(mol)
    Chi4v= Descriptors.Chi4v(mol)
    HallKierAlpha= Descriptors.HallKierAlpha(mol)
    Ipc= Descriptors.Ipc(mol)
    Kappa1= Descriptors.Kappa1(mol)
    Kappa2= Descriptors.Kappa2(mol)
    Kappa3= Descriptors.Kappa3(mol)
    LabuteASA= Descriptors.LabuteASA(mol)
    PEOE_VSA1= Descriptors.PEOE_VSA1(mol)
    PEOE_VSA10= Descriptors.PEOE_VSA10(mol)
    PEOE_VSA11= Descriptors.PEOE_VSA11(mol)
    PEOE_VSA12= Descriptors.PEOE_VSA12(mol)
    PEOE_VSA13= Descriptors.PEOE_VSA13(mol)
    PEOE_VSA14= Descriptors.PEOE_VSA14(mol)
    PEOE_VSA2= Descriptors.PEOE_VSA2(mol)
    PEOE_VSA3 = Descriptors.PEOE_VSA3(mol)
    PEOE_VSA4 = Descriptors.PEOE_VSA4(mol)
    PEOE_VSA5 = Descriptors.PEOE_VSA5(mol)
    PEOE_VSA6 = Descriptors.PEOE_VSA6(mol)
    PEOE_VSA7 = Descriptors.PEOE_VSA7(mol)
    PEOE_VSA8 = Descriptors.PEOE_VSA8(mol)
    PEOE_VSA9 = Descriptors.PEOE_VSA9(mol)
    SMR_VSA1 = Descriptors.SMR_VSA1(mol)
    SMR_VSA10 = Descriptors.SMR_VSA10(mol)
    SMR_VSA2 = Descriptors.SMR_VSA2(mol)
    SMR_VSA3 = Descriptors.SMR_VSA3(mol)
    SMR_VSA4 = Descriptors.SMR_VSA4(mol)
    SMR_VSA5 = Descriptors.SMR_VSA5(mol)
    SMR_VSA6 = Descriptors.SMR_VSA6(mol)
    SMR_VSA7 = Descriptors.SMR_VSA7(mol)
    SMR_VSA8 = Descriptors.SMR_VSA8(mol)
    SMR_VSA9 = Descriptors.SMR_VSA9(mol)
    SlogP_VSA1 = Descriptors.SlogP_VSA1(mol)
    SlogP_VSA10 = Descriptors.SlogP_VSA10(mol)
    SlogP_VSA11 = Descriptors.SlogP_VSA11(mol)
    SlogP_VSA12 = Descriptors.SlogP_VSA12(mol)
    SlogP_VSA2 = Descriptors.SlogP_VSA2(mol)
    SlogP_VSA3 = Descriptors.SlogP_VSA3(mol)
    SlogP_VSA4 = Descriptors.SlogP_VSA4(mol)
    SlogP_VSA5 = Descriptors.SlogP_VSA5(mol)
    SlogP_VSA6 = Descriptors.SlogP_VSA6(mol)
    SlogP_VSA7 = Descriptors.SlogP_VSA7(mol)
    SlogP_VSA8 = Descriptors.SlogP_VSA8(mol)
    SlogP_VSA9 = Descriptors.SlogP_VSA9(mol)
    TPSA = Descriptors.TPSA(mol)
    EState_VSA1 = Descriptors.EState_VSA1(mol)
    EState_VSA10 = Descriptors.EState_VSA10(mol)
    EState_VSA11 = Descriptors.EState_VSA11(mol)
    EState_VSA2 = Descriptors.EState_VSA2(mol)
    EState_VSA3 = Descriptors.EState_VSA3(mol)
    EState_VSA4 = Descriptors.EState_VSA4(mol)
    EState_VSA5 = Descriptors.EState_VSA5(mol)
    EState_VSA6 = Descriptors.EState_VSA6(mol)
    EState_VSA7 = Descriptors.EState_VSA7(mol)
    EState_VSA8 = Descriptors.EState_VSA8(mol)
    EState_VSA9 = Descriptors.EState_VSA9(mol)
    VSA_EState1 = Descriptors.VSA_EState1(mol)
    VSA_EState10 = Descriptors.VSA_EState10(mol)
    VSA_EState2 = Descriptors.VSA_EState2(mol)
    VSA_EState3 = Descriptors.VSA_EState3(mol)
    VSA_EState4 = Descriptors.VSA_EState4(mol)
    VSA_EState5 = Descriptors.VSA_EState5(mol)
    VSA_EState6 = Descriptors.VSA_EState6(mol)
    VSA_EState7 = Descriptors.VSA_EState7(mol)
    VSA_EState8 = Descriptors.VSA_EState8(mol)
    VSA_EState9 = Descriptors.VSA_EState9(mol)
    FractionCSP3 = Descriptors.FractionCSP3(mol)
    HeavyAtomCount = Descriptors.HeavyAtomCount(mol)
    NHOHCount = Descriptors.NHOHCount(mol)
    NOCount = Descriptors.NOCount(mol)
    NumAliphaticCarbocycles = Descriptors.NumAliphaticCarbocycles(mol)
    NumAliphaticHeterocycles = Descriptors.NumAliphaticHeterocycles(mol)
    NumAliphaticRings = Descriptors.NumAliphaticRings(mol)
    NumAromaticCarbocycles = Descriptors.NumAromaticCarbocycles(mol)
    NumAromaticHeterocycles = Descriptors.NumAromaticHeterocycles(mol)
    NumAromaticRings = Descriptors.NumAromaticRings(mol)
    NumHAcceptors = Descriptors.NumHAcceptors(mol)
    NumHDonors = Descriptors.NumHDonors(mol)
    NumHeteroatoms = Descriptors.NumHeteroatoms(mol)
    NumRotatableBonds = Descriptors.NumRotatableBonds(mol)
    NumSaturatedCarbocycles = Descriptors.NumSaturatedCarbocycles(mol)
    NumSaturatedHeterocycles = Descriptors.NumSaturatedHeterocycles(mol)
    NumSaturatedRings = Descriptors.NumSaturatedRings(mol)   
    RingCount = Descriptors.RingCount(mol)
    MolLogP = Descriptors.MolLogP(mol)
    MolMR = Descriptors.MolMR(mol) 
    
    # Calculate MACCS keys
    maccs_keys = MACCSkeys.GenMACCSKeys(mol)
    maccs_list = list(maccs_keys.ToBitString())

    # Return a list of features
    features = [MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,LabuteASA,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SMR_VSA10,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA6,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState1,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState6,VSA_EState7,VSA_EState8,VSA_EState9,VSA_EState10,FractionCSP3,HeavyAtomCount,NHOHCount,NOCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,MolMR] + maccs_list

    return features
    
smiles = train_data['Id']
data = []
for s in smiles:
    data.append(generate_rdkit_features(s))

# Define the column names
columns = ['MaxEStateIndex','MinEStateIndex','MaxAbsEStateIndex','MinAbsEStateIndex','QED','MolWt','HeavyAtomMolWt','ExactMolWt','NumValenceElectrons','NumRadicalElectrons','MaxPartialCharge','MinPartialCharge','MaxAbsPartialCharge','MinAbsPartialCharge','FpDensityMorgan1','FpDensityMorgan2','FpDensityMorgan3','BalabanJ','BertzCT','Chi0','Chi0n','Chi0v','Chi1','Chi1n','Chi1v','Chi2n','Chi2v','Chi3n','Chi3v','Chi4n','Chi4v','HallKierAlpha','Ipc','Kappa1','Kappa2','Kappa3','LabuteASA','PEOE_VSA1','PEOE_VSA10','PEOE_VSA11','PEOE_VSA12','PEOE_VSA13','PEOE_VSA14','PEOE_VSA2','PEOE_VSA3','PEOE_VSA4','PEOE_VSA5','PEOE_VSA6','PEOE_VSA7','PEOE_VSA8','PEOE_VSA9','SMR_VSA1','SMR_VSA2','SMR_VSA3','SMR_VSA4','SMR_VSA5','SMR_VSA6','SMR_VSA7','SMR_VSA8','SMR_VSA9','SMR_VSA10','SlogP_VSA1','SlogP_VSA10','SlogP_VSA11','SlogP_VSA12','SlogP_VSA2','SlogP_VSA3','SlogP_VSA4','SlogP_VSA5','SlogP_VSA6','SlogP_VSA7','SlogP_VSA8','SlogP_VSA9','TPSA','EState_VSA1','EState_VSA10','EState_VSA11','EState_VSA2','EState_VSA3','EState_VSA4','EState_VSA5','EState_VSA6','EState_VSA7','EState_VSA8','EState_VSA9','VSA_EState1','VSA_EState2','VSA_EState3','VSA_EState4','VSA_EState5','VSA_EState6','VSA_EState7','VSA_EState8','VSA_EState9','VSA_EState10','FractionCSP3','HeavyAtomCount','NHOHCount','NOCount','NumAliphaticCarbocycles','NumAliphaticHeterocycles','NumAliphaticRings','NumAromaticCarbocycles','NumAromaticHeterocycles','NumAromaticRings','NumHAcceptors','NumHDonors','NumHeteroatoms','NumRotatableBonds','NumSaturatedCarbocycles','NumSaturatedHeterocycles','NumSaturatedRings','RingCount','MolLogP','MolMR'] + ['MACCS_'+str(i) for i in range(167)]

#Create a pandas DataFrame with the molecular descriptors as rows
df = pd.DataFrame(data, columns=columns)

# Extract the desired column
desired_column = train_data['Assay']

# Merge the two dataframes
df = pd.concat([df, desired_column], axis=1)

# Display the DataFrame
print(df)

[14:17:27] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:20:31] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:23:29] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:24:19] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:27:10] Explicit valence for atom # 1 Si, 8, is greater than permitted
[14:28:53] Explicit valence for atom # 1 Si, 8, is greater than permitted


       MaxEStateIndex  MinEStateIndex  MaxAbsEStateIndex  MinAbsEStateIndex  \
0            9.316200       -1.533785           9.316200           0.150485   
1           10.532611        0.333788          10.532611           0.333788   
2            2.433032        0.000000           2.433032           0.000000   
3           10.355080       -0.613825          10.355080           0.282361   
4            0.000000        0.000000           0.000000           0.000000   
...               ...             ...                ...                ...   
75378       11.460021       -3.868472          11.460021           0.053611   
75379        5.928972       -2.841623           5.928972           0.082346   
75380        4.975926        0.848333           4.975926           0.848333   
75381       10.241948        0.324028          10.241948           0.324028   
75382       12.552405       -0.188577          12.552405           0.048913   

            QED    MolWt  HeavyAtomMolWt  ExactMolW

## Applying Variance Thresold

In [6]:
import pandas as pd
import numpy as np
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.feature_selection import VarianceThreshold
# Define the threshold
threshold = 0.05

# Calculate the variance of each feature
variances = np.var(df, axis=0)

# Get the indices of the selected features
selected_indices = np.where(variances > threshold)[0]

# Get the names of the selected columns
selected_columns = df.columns[selected_indices]

# Create a new DataFrame with selected features
df_new = df[selected_columns]
df_new

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,MaxPartialCharge,MaxAbsPartialCharge,...,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,MolMR
0,9.316200,-1.533785,9.316200,0.150485,317.599,306.511,315.982463,100.0,0.200859,0.507967,...,2.0,2.0,5.0,2.0,0.0,0.0,0.0,2.0,4.59990,78.3466
1,10.532611,0.333788,10.532611,0.333788,156.269,136.109,156.151415,66.0,0.129340,0.300031,...,1.0,0.0,1.0,7.0,0.0,0.0,0.0,0.0,3.32600,48.6740
2,2.433032,0.000000,2.433032,0.000000,362.086,313.702,361.347528,148.0,0.078199,1.000000,...,0.0,0.0,2.0,18.0,0.0,0.0,0.0,0.0,4.34820,107.0624
3,10.355080,-0.613825,10.355080,0.282361,255.665,245.585,255.052302,90.0,0.256798,0.332260,...,5.0,1.0,8.0,3.0,0.0,0.0,0.0,2.0,0.68790,62.0891
4,0.000000,0.000000,0.000000,0.000000,149.894,149.894,149.894242,8.0,1.000000,1.000000,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,-5.99200,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75378,11.460021,-3.868472,11.460021,0.053611,230.245,220.165,230.036128,82.0,0.420384,0.452172,...,5.0,2.0,7.0,2.0,0.0,0.0,0.0,1.0,0.31350,53.5809
75379,5.928972,-2.841623,5.928972,0.082346,313.747,296.611,313.041677,104.0,0.382438,0.387901,...,7.0,0.0,9.0,7.0,0.0,0.0,0.0,1.0,3.18870,73.5820
75380,4.975926,0.848333,4.975926,0.848333,167.258,162.218,166.986341,50.0,0.158935,0.336924,...,2.0,1.0,3.0,0.0,0.0,0.0,0.0,2.0,2.95889,47.0097
75381,10.241948,0.324028,10.241948,0.324028,128.215,112.087,128.120115,54.0,0.122557,0.303089,...,1.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,2.40170,39.3700


In [7]:
# Get the names of all columns
column_names = df_new.columns.tolist()

# Print the column names
print(column_names)

#NO assay id here, so have to add it in df_new and df_new1

['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'MaxPartialCharge', 'MaxAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SMR_VSA10', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'ESta

In [8]:
# Extract the desired column
desired_column = train_data['Assay']

# Merge the two dataframes
df_new = pd.concat([df_new, desired_column], axis=1)

In [9]:
# Get the names of all columns
column_names = df_new.columns.tolist()

# Print the column names
print(column_names)

['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'MaxPartialCharge', 'MaxAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SMR_VSA10', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'ESta

## Generated RDKit Features for Test Data

In [10]:
import pandas as pd
import numpy as np
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

def generate_rdkit_features(smiles):
    # Convert SMILES string to RDKit molecule object
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Return NaN if SMILES string is invalid
        return [np.nan]*265
    
    # Calculate additional RDKit descriptors
    MaxEStateIndex= Descriptors.MaxEStateIndex(mol)
    MinEStateIndex= Descriptors.MinEStateIndex(mol)
    MaxAbsEStateIndex= Descriptors.MaxAbsEStateIndex(mol)
    MinAbsEStateIndex= Descriptors.MinAbsEStateIndex(mol)
    qed= Descriptors.qed(mol)
    MolWt= Descriptors.MolWt(mol)
    HeavyAtomMolWt= Descriptors.HeavyAtomMolWt(mol)
    ExactMolWt= Descriptors.ExactMolWt(mol)
    NumValenceElectrons= Descriptors.NumValenceElectrons(mol)
    NumRadicalElectrons= Descriptors.NumRadicalElectrons(mol)
    MaxPartialCharge= Descriptors.MaxPartialCharge(mol)
    MinPartialCharge= Descriptors.MinPartialCharge(mol)
    MaxAbsPartialCharge= Descriptors.MaxAbsPartialCharge(mol)
    MinAbsPartialCharge= Descriptors.MinAbsPartialCharge(mol)
    FpDensityMorgan1= Descriptors.FpDensityMorgan1(mol)
    FpDensityMorgan2= Descriptors.FpDensityMorgan2(mol)
    FpDensityMorgan3= Descriptors.FpDensityMorgan3(mol)
    BalabanJ= Descriptors.BalabanJ(mol)
    BertzCT= Descriptors.BertzCT(mol)
    Chi0= Descriptors.Chi0(mol)
    Chi0n= Descriptors.Chi0n(mol)
    Chi0v= Descriptors.Chi0v(mol)
    Chi1= Descriptors.Chi1(mol)
    Chi1n= Descriptors.Chi1n(mol)
    Chi1v= Descriptors.Chi1v(mol)
    Chi2n= Descriptors.Chi2n(mol)
    Chi2v= Descriptors.Chi2v(mol)
    Chi3n= Descriptors.Chi3n(mol)
    Chi3v= Descriptors.Chi3v(mol)
    Chi4n= Descriptors.Chi4n(mol)
    Chi4v= Descriptors.Chi4v(mol)
    HallKierAlpha= Descriptors.HallKierAlpha(mol)
    Ipc= Descriptors.Ipc(mol)
    Kappa1= Descriptors.Kappa1(mol)
    Kappa2= Descriptors.Kappa2(mol)
    Kappa3= Descriptors.Kappa3(mol)
    LabuteASA= Descriptors.LabuteASA(mol)
    PEOE_VSA1= Descriptors.PEOE_VSA1(mol)
    PEOE_VSA10= Descriptors.PEOE_VSA10(mol)
    PEOE_VSA11= Descriptors.PEOE_VSA11(mol)
    PEOE_VSA12= Descriptors.PEOE_VSA12(mol)
    PEOE_VSA13= Descriptors.PEOE_VSA13(mol)
    PEOE_VSA14= Descriptors.PEOE_VSA14(mol)
    PEOE_VSA2= Descriptors.PEOE_VSA2(mol)
    PEOE_VSA3 = Descriptors.PEOE_VSA3(mol)
    PEOE_VSA4 = Descriptors.PEOE_VSA4(mol)
    PEOE_VSA5 = Descriptors.PEOE_VSA5(mol)
    PEOE_VSA6 = Descriptors.PEOE_VSA6(mol)
    PEOE_VSA7 = Descriptors.PEOE_VSA7(mol)
    PEOE_VSA8 = Descriptors.PEOE_VSA8(mol)
    PEOE_VSA9 = Descriptors.PEOE_VSA9(mol)
    SMR_VSA1 = Descriptors.SMR_VSA1(mol)
    SMR_VSA10 = Descriptors.SMR_VSA10(mol)
    SMR_VSA2 = Descriptors.SMR_VSA2(mol)
    SMR_VSA3 = Descriptors.SMR_VSA3(mol)
    SMR_VSA4 = Descriptors.SMR_VSA4(mol)
    SMR_VSA5 = Descriptors.SMR_VSA5(mol)
    SMR_VSA6 = Descriptors.SMR_VSA6(mol)
    SMR_VSA7 = Descriptors.SMR_VSA7(mol)
    SMR_VSA8 = Descriptors.SMR_VSA8(mol)
    SMR_VSA9 = Descriptors.SMR_VSA9(mol)
    SlogP_VSA1 = Descriptors.SlogP_VSA1(mol)
    SlogP_VSA10 = Descriptors.SlogP_VSA10(mol)
    SlogP_VSA11 = Descriptors.SlogP_VSA11(mol)
    SlogP_VSA12 = Descriptors.SlogP_VSA12(mol)
    SlogP_VSA2 = Descriptors.SlogP_VSA2(mol)
    SlogP_VSA3 = Descriptors.SlogP_VSA3(mol)
    SlogP_VSA4 = Descriptors.SlogP_VSA4(mol)
    SlogP_VSA5 = Descriptors.SlogP_VSA5(mol)
    SlogP_VSA6 = Descriptors.SlogP_VSA6(mol)
    SlogP_VSA7 = Descriptors.SlogP_VSA7(mol)
    SlogP_VSA8 = Descriptors.SlogP_VSA8(mol)
    SlogP_VSA9 = Descriptors.SlogP_VSA9(mol)
    TPSA = Descriptors.TPSA(mol)
    EState_VSA1 = Descriptors.EState_VSA1(mol)
    EState_VSA10 = Descriptors.EState_VSA10(mol)
    EState_VSA11 = Descriptors.EState_VSA11(mol)
    EState_VSA2 = Descriptors.EState_VSA2(mol)
    EState_VSA3 = Descriptors.EState_VSA3(mol)
    EState_VSA4 = Descriptors.EState_VSA4(mol)
    EState_VSA5 = Descriptors.EState_VSA5(mol)
    EState_VSA6 = Descriptors.EState_VSA6(mol)
    EState_VSA7 = Descriptors.EState_VSA7(mol)
    EState_VSA8 = Descriptors.EState_VSA8(mol)
    EState_VSA9 = Descriptors.EState_VSA9(mol)
    VSA_EState1 = Descriptors.VSA_EState1(mol)
    VSA_EState10 = Descriptors.VSA_EState10(mol)
    VSA_EState2 = Descriptors.VSA_EState2(mol)
    VSA_EState3 = Descriptors.VSA_EState3(mol)
    VSA_EState4 = Descriptors.VSA_EState4(mol)
    VSA_EState5 = Descriptors.VSA_EState5(mol)
    VSA_EState6 = Descriptors.VSA_EState6(mol)
    VSA_EState7 = Descriptors.VSA_EState7(mol)
    VSA_EState8 = Descriptors.VSA_EState8(mol)
    VSA_EState9 = Descriptors.VSA_EState9(mol)
    FractionCSP3 = Descriptors.FractionCSP3(mol)
    HeavyAtomCount = Descriptors.HeavyAtomCount(mol)
    NHOHCount = Descriptors.NHOHCount(mol)
    NOCount = Descriptors.NOCount(mol)
    NumAliphaticCarbocycles = Descriptors.NumAliphaticCarbocycles(mol)
    NumAliphaticHeterocycles = Descriptors.NumAliphaticHeterocycles(mol)
    NumAliphaticRings = Descriptors.NumAliphaticRings(mol)
    NumAromaticCarbocycles = Descriptors.NumAromaticCarbocycles(mol)
    NumAromaticHeterocycles = Descriptors.NumAromaticHeterocycles(mol)
    NumAromaticRings = Descriptors.NumAromaticRings(mol)
    NumHAcceptors = Descriptors.NumHAcceptors(mol)
    NumHDonors = Descriptors.NumHDonors(mol)
    NumHeteroatoms = Descriptors.NumHeteroatoms(mol)
    NumRotatableBonds = Descriptors.NumRotatableBonds(mol)
    NumSaturatedCarbocycles = Descriptors.NumSaturatedCarbocycles(mol)
    NumSaturatedHeterocycles = Descriptors.NumSaturatedHeterocycles(mol)
    NumSaturatedRings = Descriptors.NumSaturatedRings(mol)   
    RingCount = Descriptors.RingCount(mol)
    MolLogP = Descriptors.MolLogP(mol)
    MolMR = Descriptors.MolMR(mol) 
    
    # Calculate MACCS keys
    maccs_keys = MACCSkeys.GenMACCSKeys(mol)
    maccs_list = list(maccs_keys.ToBitString())

    # Return a list of features
    features = [MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,LabuteASA,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA8,SMR_VSA9,SMR_VSA10,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA6,SlogP_VSA7,SlogP_VSA8,SlogP_VSA9,TPSA,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState1,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState6,VSA_EState7,VSA_EState8,VSA_EState9,VSA_EState10,FractionCSP3,HeavyAtomCount,NHOHCount,NOCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,MolMR] + maccs_list

    return features
    
smiles = test_data['Id']
data = []
for s in smiles:
    data.append(generate_rdkit_features(s))

# Define the column names
columns = ['MaxEStateIndex','MinEStateIndex','MaxAbsEStateIndex','MinAbsEStateIndex','QED','MolWt','HeavyAtomMolWt','ExactMolWt','NumValenceElectrons','NumRadicalElectrons','MaxPartialCharge','MinPartialCharge','MaxAbsPartialCharge','MinAbsPartialCharge','FpDensityMorgan1','FpDensityMorgan2','FpDensityMorgan3','BalabanJ','BertzCT','Chi0','Chi0n','Chi0v','Chi1','Chi1n','Chi1v','Chi2n','Chi2v','Chi3n','Chi3v','Chi4n','Chi4v','HallKierAlpha','Ipc','Kappa1','Kappa2','Kappa3','LabuteASA','PEOE_VSA1','PEOE_VSA10','PEOE_VSA11','PEOE_VSA12','PEOE_VSA13','PEOE_VSA14','PEOE_VSA2','PEOE_VSA3','PEOE_VSA4','PEOE_VSA5','PEOE_VSA6','PEOE_VSA7','PEOE_VSA8','PEOE_VSA9','SMR_VSA1','SMR_VSA2','SMR_VSA3','SMR_VSA4','SMR_VSA5','SMR_VSA6','SMR_VSA7','SMR_VSA8','SMR_VSA9','SMR_VSA10','SlogP_VSA1','SlogP_VSA10','SlogP_VSA11','SlogP_VSA12','SlogP_VSA2','SlogP_VSA3','SlogP_VSA4','SlogP_VSA5','SlogP_VSA6','SlogP_VSA7','SlogP_VSA8','SlogP_VSA9','TPSA','EState_VSA1','EState_VSA10','EState_VSA11','EState_VSA2','EState_VSA3','EState_VSA4','EState_VSA5','EState_VSA6','EState_VSA7','EState_VSA8','EState_VSA9','VSA_EState1','VSA_EState2','VSA_EState3','VSA_EState4','VSA_EState5','VSA_EState6','VSA_EState7','VSA_EState8','VSA_EState9','VSA_EState10','FractionCSP3','HeavyAtomCount','NHOHCount','NOCount','NumAliphaticCarbocycles','NumAliphaticHeterocycles','NumAliphaticRings','NumAromaticCarbocycles','NumAromaticHeterocycles','NumAromaticRings','NumHAcceptors','NumHDonors','NumHeteroatoms','NumRotatableBonds','NumSaturatedCarbocycles','NumSaturatedHeterocycles','NumSaturatedRings','RingCount','MolLogP','MolMR'] + ['MACCS_'+str(i) for i in range(167)]

#Create a pandas DataFrame with the molecular descriptors as rows
df1 = pd.DataFrame(data, columns=columns)

# Extract the desired column
desired_column1 = test_data['Assay']

# Merge the two dataframes
df1 = pd.concat([df1, desired_column1], axis=1)

# Display the DataFrame
print(df1)

       MaxEStateIndex  MinEStateIndex  MaxAbsEStateIndex  MinAbsEStateIndex  \
0            9.626968        0.025579           9.626968           0.025579   
1           12.473362       -4.605249          12.473362           0.061165   
2           14.627193       -4.140552          14.627193           0.064351   
3           10.420833       -3.973958          10.420833           0.000000   
4           12.865865       -0.601027          12.865865           0.094949   
...               ...             ...                ...                ...   
10989       12.851673       -4.443395          12.851673           0.048352   
10990       11.617073       -0.992448          11.617073           0.147894   
10991       12.406479       -4.356493          12.406479           0.049357   
10992       11.026605       -3.480301          11.026605           0.167292   
10993       10.933565        0.014444          10.933565           0.014444   

            QED    MolWt  HeavyAtomMolWt  ExactMolW

In [11]:
train_feats = df_new.columns
df_new1 = df1[train_feats]

## Imputing Null Values

In [12]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
df_new_imputed = imputer.fit_transform(df_new)
df_new1_imputed = imputer.transform(df_new1)

## Check if any values in df_new_imputed or df_new1_imputed are NaN or infinity

In [13]:
import numpy as np

print(np.isnan(df_new_imputed).sum())
print(np.isinf(df_new_imputed).sum())
print(np.isnan(df_new1_imputed).sum())
print(np.isinf(df_new1_imputed).sum())

0
0
0
0


In [14]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


## Trained using XGBoost Model

In [16]:
import xgboost as xgb
from imblearn.over_sampling import SMOTE

X = df_new_imputed
X_test = df_new1_imputed
y = train_data['Expected']

y_binary = y.map({1: 0, 2: 1})

# Apply SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y_binary)

model = xgb.XGBClassifier(
    booster='gbtree',
    n_estimators=850,
    learning_rate=0.1,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_res, y_res)

predictions = model.predict(X_test)

predictions_int = pd.Series(predictions).map({0: 1, 1: 2}).astype(int)

subm_df = pd.read_csv('C:/Users/RAMZAN KHAN/Toxicity Prediction/sample_submission.csv')
subm_df['Predicted'] = predictions_int
subm_df.to_csv('submission.csv', index=False)

print("Your submission was successfully saved!")

Your submission was successfully saved!
