### Import relevant libraries and read in data

In [1]:
# Import relevant libraries
import pandas as pd # for data manipulation
from padelpy import from_smiles # for calculation of PaDEL fingerprints
from tqdm.notebook import tqdm # for counter

In [2]:
# Load the processed bioactivity data from CSV into a dataframe
df = pd.read_csv(filepath_or_buffer="../data/processed/bioactivity_data_final.csv")

In [3]:
# Preview the first five rows of the freshly loaded dataframe
df.head(n=5)

Unnamed: 0,molecule_chembl_id,class_label,canonical_smiles,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL185698,inactive,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,421.19,2.6605,0.0,4.0,4.869666
1,CHEMBL426082,inactive,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,293.347,3.6308,0.0,3.0,4.882397
2,CHEMBL365134,active,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c(Br)cccc21,372.243,4.3933,0.0,3.0,6.008774
3,CHEMBL190743,active,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccc(I)cc21,419.243,4.2354,0.0,3.0,6.022276
4,CHEMBL365469,inactive,O=C1C(=O)N(Cc2cc3ccccc3s2)c2cccc(Cl)c21,327.792,4.2842,0.0,3.0,4.950782


### Calculate PaDEL fingerprint descriptors for all molecules

In [4]:
# Pull the canonical SMILES column out of the dataframe as a plain Python list
smiles_list = df["canonical_smiles"].tolist()

# Compute PaDEL fingerprints (descriptors switched off) for every SMILES string,
# one call per molecule, collecting the results in input order.
# tqdm wraps the list so a progress bar is shown while the work runs.
fingerprints = [
    from_smiles(smiles_string, fingerprints=True, descriptors=False)
    for smiles_string in tqdm(smiles_list, desc="Calculating fingerprints")
]

Calculating fingerprints:   0%|          | 0/148 [00:00<?, ?it/s]

In [5]:
# Build a dataframe from the list of per-molecule fingerprint dictionaries
df_fingerprints = pd.DataFrame(data=fingerprints)
# Preview the first five rows
df_fingerprints.head(n=5)

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Add the target variable pIC50 to the fingerprint dataframe.
# The fingerprints were computed from df's rows in order, so pIC50 has to be
# attached by position. pd.concat(axis=1) aligns on index labels and would
# silently introduce NaN rows if the two indexes ever differed, so verify the
# row counts match and drop df's index before concatenating.
assert len(df_fingerprints) == len(df), (
    f"Row count mismatch: {len(df_fingerprints)} fingerprint rows "
    f"vs {len(df)} rows in df"
)
df_final = pd.concat(
    [df_fingerprints, df["pIC50"].reset_index(drop=True)],
    axis=1,
)

In [11]:
# Preview the first five rows of the combined fingerprint + pIC50 dataframe
df_final.head(n=5)

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.869666
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.882397
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.008774
3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.022276
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.950782


In [13]:
# Write the combined fingerprint + pIC50 table to CSV, omitting the row index
df_final.to_csv(path_or_buf="../data/processed/fingerprints_pic50.csv", index=False)