### Calculating molecular descriptors and fingerprints
https://pubmed.ncbi.nlm.nih.gov/21425294/

In [3]:
#############
# LIBRARIES #
#############

import os
# Current working directory
curr_dir = os.getcwd()
data_dir = os.path.join('data')
notebook_dir = os.path.join('notebooks')
figures_dir = os.path.join('figures')
analyses_dir = os.path.join('analyses')
project_dir = os.path.join('..')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from padelpy import padeldescriptor, from_smiles

In [22]:
# Loading the PaDELPy Files
# 1. Downloading the XML data files
! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip
! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh

--2023-02-24 00:11:29--  https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/bioinformatics/master/padel.zip [following]
--2023-02-24 00:11:29--  https://raw.githubusercontent.com/dataprofessor/bioinformatics/master/padel.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8000::154, 2606:50c0:8001::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25768637 (25M) [application/zip]
Saving to: ‘padel.zip’


2023-02-24 00:11:30 (31.3 MB/s) - ‘padel.zip’ saved [25768637/25768637]

--2023-02-24 00:11:30--  https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh
Resol

In [24]:
! unzip padel.zip

Archive:  padel.zip
replace __MACOSX/._PaDEL-Descriptor? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [4]:
# Opening the preprocessed data
bioactivity_df = pd.read_csv(os.path.join(project_dir, data_dir, 'CASP2_bioactive_data_LIP_descriptors.csv'))
bioactivity_df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioavtivity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL366927,CCCCCCN(C)CC(=O)C(CC(=O)O)NC(=O)C(CC)n1cc(C(C)...,active,612.172,3.25792,3.0,11.0,8.0
1,CHEMBL179503,CCCCCN(C)CC(=O)C(CC(=O)O)NC(=O)C(CC)n1cc(C(C)(...,active,598.145,2.86782,3.0,11.0,8.0
2,CHEMBL203709,COc1ccccc1NC(=O)CCC(=O)Nc1ccc2c(c1)C(=O)C(=O)N...,active,395.371,1.5053,3.0,6.0,6.270026
3,CHEMBL438969,O=C(CCC(=O)N1CCCCC1)Nc1ccc2c(c1)C(=O)C(=O)NC2=O,active,357.366,0.8705,2.0,5.0,6.636388
4,CHEMBL202971,O=C(O)CCC(=O)Nc1ccc2c(c1)C(=O)C(=O)NC2=O,active,290.231,-0.0574,3.0,5.0,6.066007


In [5]:
# Creating a list of SMILES with the id
smiles_list = bioactivity_df[[ 'canonical_smiles', 'molecule_chembl_id']]
# Saving as a CSV file
smiles_list.to_csv(os.path.join(project_dir, data_dir, 'smiles_list.smi'), index=False, header=False, sep='\t')

In [8]:
! cat ../data/smiles_list.smi | wc -l

      50


In [9]:
! cat padel.sh

java -Xms1G -Xmx1G -Djava.awt.headless=true -jar ./PaDEL-Descriptor/PaDEL-Descriptor.jar -removesalt -standardizenitro -fingerprints -descriptortypes ./PaDEL-Descriptor/PubchemFingerprinter.xml -dir ./ -file descriptors_output.csv


In [12]:
! bash padel.sh

Processing CHEMBL366927 in smiles_list.smi (1/50). 
Processing CHEMBL179503 in smiles_list.smi (2/50). 
Processing CHEMBL203709 in smiles_list.smi (3/50). 
Processing CHEMBL438969 in smiles_list.smi (4/50). 
Processing CHEMBL202971 in smiles_list.smi (5/50). 
Processing CHEMBL382481 in smiles_list.smi (6/50). 
Processing CHEMBL439872 in smiles_list.smi (7/50). 
Processing CHEMBL1835209 in smiles_list.smi (8/50). 
Processing CHEMBL1835208 in smiles_list.smi (9/50). 
Processing CHEMBL1835210 in smiles_list.smi (10/50). 
Processing CHEMBL1835212 in smiles_list.smi (12/50). Average speed: 0.83 s/mol.
Processing CHEMBL1835211 in smiles_list.smi (11/50). Average speed: 0.79 s/mol.
Processing CHEMBL1835313 in smiles_list.smi (13/50). Average speed: 0.31 s/mol.
Processing CHEMBL1835314 in smiles_list.smi (14/50). Average speed: 0.24 s/mol.
Processing CHEMBL1835315 in smiles_list.smi (15/50). Average speed: 0.20 s/mol.
Processing CHEMBL1835316 in smiles_list.smi (16/50). Average speed: 0.21 s/m

In [14]:
! ls -l

total 51200
drwxrwxr-x  21 rogerlefort  staff       672 May 30  2020 [34mPaDEL-Descriptor[m[m
-rw-r--r--   1 rogerlefort  staff     18538 Feb 24 00:26 PaDEL-descriptor.ipynb
drwxr-xr-x   4 rogerlefort  staff       128 Feb 24 00:11 [34m__MACOSX[m[m
-rw-r--r--   1 rogerlefort  staff     31524 Feb 23 15:59 bioactivity_data.ipynb
-rw-r--r--   1 rogerlefort  staff    100241 Feb 24 00:25 descriptors_output.csv
-rw-r--r--   1 rogerlefort  staff    242552 Feb 23 23:52 eda.ipynb
-rw-r--r--   1 rogerlefort  staff       231 Feb 24 00:11 padel.sh
-rw-r--r--   1 rogerlefort  staff  25768637 Feb 24 00:11 padel.zip
-rw-r--r--   1 rogerlefort  staff      5473 Feb 24 00:14 smiles_list.smi


In [27]:
# Opening descriptor file
descriptor_file = pd.read_csv('descriptors_output.csv')
descriptor_file.head()

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL382481,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL202971,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL438969,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL1835209,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL439872,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
bioactivity_df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioavtivity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL366927,CCCCCCN(C)CC(=O)C(CC(=O)O)NC(=O)C(CC)n1cc(C(C)...,active,612.172,3.25792,3.0,11.0,8.0
1,CHEMBL179503,CCCCCN(C)CC(=O)C(CC(=O)O)NC(=O)C(CC)n1cc(C(C)(...,active,598.145,2.86782,3.0,11.0,8.0
2,CHEMBL203709,COc1ccccc1NC(=O)CCC(=O)Nc1ccc2c(c1)C(=O)C(=O)N...,active,395.371,1.5053,3.0,6.0,6.270026
3,CHEMBL438969,O=C(CCC(=O)N1CCCCC1)Nc1ccc2c(c1)C(=O)C(=O)NC2=O,active,357.366,0.8705,2.0,5.0,6.636388
4,CHEMBL202971,O=C(O)CCC(=O)Nc1ccc2c(c1)C(=O)C(=O)NC2=O,active,290.231,-0.0574,3.0,5.0,6.066007


In [29]:
X = descriptor_file.drop(['Name'], axis=1)
y = bioactivity_df['pIC50']

In [20]:
X.head()

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [30]:
y.head()

0    8.000000
1    8.000000
2    6.270026
3    6.636388
4    6.066007
Name: pIC50, dtype: float64

In [32]:
# Combining the two dataframes
dataset = pd.concat([X, y], axis=1)
dataset.head()

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8.0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8.0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.270026
3,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.636388
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.066007


In [33]:
dataset_to_csv = dataset.to_csv(os.path.join(project_dir, data_dir, 'CASP2_bioactivity_data_3class_pIC50_pubchem_fp.csv'), index=False)