# **Computational Drug Discovery [Part 3] Descriptor Calculation and Dataset Preparation**


## **Download PaDEL-Descriptor**

In [2]:
import requests
import os

# Download padel.zip
url_zip = "https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip"
url_sh = "https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh"

def download_file(url, filename):
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Successfully downloaded {filename}")
    else:
        print(f"Failed to download {filename}")

# Download both files
download_file(url_zip, "padel.zip")
download_file(url_sh, "padel.sh")

Successfully downloaded padel.zip
Successfully downloaded padel.sh


In [4]:
import zipfile

with zipfile.ZipFile('padel.zip', 'r') as zip_ref:
    zip_ref.extractall('padel')

## **Load bioactivity data**

In [5]:
import pandas as pd

In [7]:
df3 = pd.read_csv('enoyl_acyl_carrier_protein_reductase_04_bioactivity_data_3class_pIC50.csv') 

In [8]:
df3

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL217926,O=C(Nc1ccccc1)C1CC(=O)N(C2CCCCC2)C1,inactive,286.375,2.80630,1.0,2.0,4.972243
1,1,CHEMBL216547,O=C(Nc1ccccc1Br)C1CC(=O)N(C2CCCCC2)C1,inactive,365.271,3.56880,1.0,2.0,4.000000
2,2,CHEMBL213720,O=C(Nc1ccc2c(c1)OCCO2)C1CC(=O)N(C2CCCCC2)C1,inactive,344.411,2.57750,1.0,4.0,4.000000
3,3,CHEMBL217274,Cc1cccc(C)c1NC(=O)C1CC(=O)N(C2CCCCC2)C1,inactive,314.429,3.42314,1.0,2.0,4.000000
4,4,CHEMBL217773,O=C(Nc1ccc(Oc2ccccc2)cc1)C1CC(=O)N(C2CCCCC2)C1,inactive,378.472,4.59860,1.0,3.0,4.000000
...,...,...,...,...,...,...,...,...,...
335,335,CHEMBL4794589,O=[N+]([O-])c1ccc(NC(=S)NCc2ccccc2Br)cc1,inactive,366.240,3.84390,2.0,3.0,4.806875
336,336,CHEMBL5190515,CCS(=O)(=O)N1N=Cc2ccc(Oc3ccc(C(F)(F)F)cc3/C=N/...,active,441.196,1.99250,2.0,7.0,7.522879
337,337,CHEMBL5174930,CS(=O)(=O)N1N=Cc2ccc(Oc3ccc(C(F)(F)F)cc3/C=N/O...,active,427.169,1.60240,2.0,7.0,6.397940
338,338,CHEMBL5284077,C=C1C(=O)OC[C@H]1C/C=C(\C)CC(=O)C=C(C)C,active,248.322,2.97740,0.0,3.0,8.346787


In [9]:
selection = ['canonical_smiles','molecule_chembl_id']
df3_selection = df3[selection]
df3_selection.to_csv('molecule.smi', sep='\t', index=False, header=False)

In [11]:
# Read and display first 5 lines of the file
with open('molecule.smi', 'r') as file:
    for i, line in enumerate(file):
        if i < 5:  # Only show first 5 lines
            print(line.strip())

O=C(Nc1ccccc1)C1CC(=O)N(C2CCCCC2)C1	CHEMBL217926
O=C(Nc1ccccc1Br)C1CC(=O)N(C2CCCCC2)C1	CHEMBL216547
O=C(Nc1ccc2c(c1)OCCO2)C1CC(=O)N(C2CCCCC2)C1	CHEMBL213720
Cc1cccc(C)c1NC(=O)C1CC(=O)N(C2CCCCC2)C1	CHEMBL217274
O=C(Nc1ccc(Oc2ccccc2)cc1)C1CC(=O)N(C2CCCCC2)C1	CHEMBL217773


In [13]:
with open('molecule.smi', 'r') as file:
    line_count = sum(1 for line in file)
print(f"Number of lines in molecule.smi: {line_count}")

Number of lines in molecule.smi: 340


## **Calculate fingerprint descriptors**


### **Calculate PaDEL descriptors**

In [15]:
with open('padel.sh', 'r') as file:
    content = file.read()
print(content)

java -Xms1G -Xmx1G -Djava.awt.headless=true -jar ./PaDEL-Descriptor/PaDEL-Descriptor.jar -removesalt -standardizenitro -fingerprints -descriptortypes ./PaDEL-Descriptor/PubchemFingerprinter.xml -dir ./ -file descriptors_output.csv



In [19]:
import os
print(os.listdir('padel/PaDEL-Descriptor'))

['.DS_Store', 'AtomPairs2DFingerprintCount.xml', 'AtomPairs2DFingerprinter.xml', 'config', 'Descriptors.xls', 'descriptors.xml', 'EStateFingerprinter.xml', 'ExtendedFingerprinter.xml', 'Fingerprinter.xml', 'GraphOnlyFingerprinter.xml', 'KlekotaRothFingerprintCount.xml', 'KlekotaRothFingerprinter.xml', 'lib', 'license', 'MACCSFingerprinter.xml', 'PaDEL-Descriptor.jar', 'PubchemFingerprinter.xml', 'SubstructureFingerprintCount.xml', 'SubstructureFingerprinter.xml']


In [34]:
import os
import subprocess

# Construct the Java command for Windows with PubChem fingerprints
java_cmd = [
    'java',
    '-jar',
    'padel/PaDEL-Descriptor/PaDEL-Descriptor.jar',
    '-removesalt',
    '-standardizenitro',
    '-fingerprints',        # Enable fingerprint calculation
    '-file',               
    'pubchem_fingerprints.csv',  
    '-dir',                
    '.'                    
]

try:
    process = subprocess.run(java_cmd, capture_output=True, text=True)
    print(process.stdout)
    print(process.stderr)
except Exception as e:
    print(f"Error running PaDEL: {e}")

Processing CHEMBL217926 in molecule.smi (1/340). 
Processing CHEMBL216547 in molecule.smi (2/340). 
Processing CHEMBL213720 in molecule.smi (3/340). 
Processing CHEMBL217274 in molecule.smi (4/340). 
Processing CHEMBL217773 in molecule.smi (5/340). 
Processing CHEMBL217273 in molecule.smi (6/340). 
Processing CHEMBL265016 in molecule.smi (8/340). Average speed: 0.61 s/mol.
Processing CHEMBL216781 in molecule.smi (7/340). Average speed: 0.81 s/mol.
Processing CHEMBL217524 in molecule.smi (9/340). Average speed: 0.72 s/mol.
Processing CHEMBL384149 in molecule.smi (10/340). Average speed: 0.49 s/mol.
Processing CHEMBL216704 in molecule.smi (12/340). Average speed: 0.41 s/mol.
Processing CHEMBL216339 in molecule.smi (11/340). Average speed: 0.44 s/mol.
Processing CHEMBL386324 in molecule.smi (13/340). Average speed: 0.36 s/mol.
Processing CHEMBL216807 in molecule.smi (14/340). Average speed: 0.34 s/mol.
Processing CHEMBL385294 in molecule.smi (15/340). Average speed: 0.33 s/mol.
Processing

## **Preparing the X and Y Data Matrices**

### **X data matrix**

In [35]:
df3_X = pd.read_csv('pubchem_fingerprints.csv')

In [36]:
df3_X

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL217926,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL217274,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL213720,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL216547,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL217273,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,CHEMBL5284077,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
336,CHEMBL4794589,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
337,CHEMBL5174930,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
338,CHEMBL5190515,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
df3_X = df3_X.drop(columns=['Name'])
df3_X

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
336,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
337,1,1,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
338,1,1,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## **Y variable**

### **Convert IC50 to pIC50**

In [38]:
df3_Y = df3['pIC50']
df3_Y

0      4.972243
1      4.000000
2      4.000000
3      4.000000
4      4.000000
         ...   
335    4.806875
336    7.522879
337    6.397940
338    8.346787
339    5.327902
Name: pIC50, Length: 340, dtype: float64

## **Combining X and Y variable**

In [39]:
dataset3 = pd.concat([df3_X,df3_Y], axis=1)
dataset3

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.972243
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.000000
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.000000
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.000000
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.806875
336,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.522879
337,1,1,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,6.397940
338,1,1,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,8.346787


In [40]:
dataset3.to_csv('enoyl_acyl_carrier_protein_reductase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv', index=False)