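In this notebook, I generate RDKit fingerprints from the SMILES strings of a random 150-compound subset of the Lenselink dataset (after removing compounds with duplicate SMILES) and save the result as a CSV file.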

In [13]:
import pandas as pd
import rdkit.Chem as chem

### Loading the data

In [14]:
# Read the mapped Lenselink compound table from its CSV file into a DataFrame.
lense = pd.read_csv(
    filepath_or_buffer='../datasets/Lenselink_et_al/Mapped_Lenselink.csv',
)

### Removing duplicate SMILES

In [3]:
#lense.shape

(204085, 21)

In [15]:
# Flag every row whose SMILES occurs more than once. With keep=False all
# members of a duplicate group are marked, not only the later repeats.
smiles_column = lense['canonical_smiles']
bool_series = smiles_column.duplicated(keep=False)

# Display the flagged rows. There are 596 duplicates; they will be thrown out.
lense.loc[bool_series]

Unnamed: 0.1,Unnamed: 0,CMP_CLASS_SMALL_MOLECULE,CMP_CLASS_BIOLOGICAL,CMP_CLASS_PEPTIDE,CMP_CLASS_ORGANIC,CMP_CLASS_INORGANIC,CMP_CLASS_ACID,CMP_CLASS_BASE,CMP_CLASS_NEUTRAL,CMP_FORMALCHARGE,...,CMP_NUM_AROMATICRINGS,CMP_NUM_CHAINS,CMP_ATOMCOUNT_METALATOMS,CMP_MOLECULAR_SURFACEAREA,CMP_LOGP,CMP_SOLUBILITY,CMP_ATOMS_POSITIVE_FRAC,CMP_ATOMS_NEGATIVE_FRAC,CMP_MOLECULAR_WEIGHT,canonical_smiles
1416,1416,1,0,0,1,0,0,0,0,0,...,1,7,0,331.50,3.146,-4.389,0.0,0.0,313.39082,
1422,1422,1,0,0,1,0,0,0,0,0,...,1,7,0,314.08,2.690,-3.991,0.0,0.0,299.36424,
1566,1566,1,0,0,1,0,0,0,0,0,...,1,5,0,358.61,1.660,-6.342,0.0,0.0,368.45270,
2907,2907,1,0,0,1,0,0,0,0,0,...,1,5,0,322.05,3.648,-5.062,0.0,0.0,311.46106,
4044,4044,1,0,0,1,0,0,0,0,0,...,3,4,0,377.53,3.718,-5.604,0.0,0.0,386.48614,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201220,201220,1,0,0,1,0,0,0,0,0,...,0,15,0,469.31,2.081,-4.379,0.0,0.0,432.46361,
201223,201223,1,0,0,1,0,0,0,0,0,...,1,14,0,444.72,2.899,-3.316,0.0,0.0,418.43703,
201224,201224,1,0,0,1,0,0,0,0,0,...,1,14,0,462.00,3.566,-3.415,0.0,0.0,432.46361,
201229,201229,1,0,0,1,0,0,0,0,0,...,1,14,0,503.49,4.247,-4.302,0.0,0.0,460.51677,


In [None]:
#lense.shape

In [16]:
# Keep only the rows that were NOT flagged as having a duplicated SMILES.
unique_mask = ~bool_series
lense = lense.loc[unique_mask]

In [None]:
#lense.shape

### Take a subset

In [18]:
#lense.describe()

In [19]:
# Descriptor columns considered uninteresting for this notebook.
uninteresting_columns = [
    'CMP_CLASS_INORGANIC',
    'CMP_CLASS_ACID',
    'CMP_CLASS_BASE',
    'CMP_CLASS_NEUTRAL',
    'CMP_ATOMCOUNT_METALATOMS',
]

# Drop them; every other column is kept in its original order.
lense = lense.drop(columns=uninteresting_columns)

In [20]:
# Draw 150 random rows; the fixed random_state makes the draw repeatable.
lense_subset = lense.sample(n=150, random_state=42)

# Show (rows, columns) of the sample.
lense_subset.shape

(150, 16)

### Generate Fingerprints from SMILES

In [21]:
# SMILES strings of the sampled compounds, in row order.
smiles_list = lense_subset['canonical_smiles'].tolist()

# Attention: MolFromSmiles can give None for a SMILES it cannot parse,
# so the result has to be checked with "is None" before it is used.
mols_list = [chem.MolFromSmiles(smiles) for smiles in smiles_list]

# Fail early and name the offending SMILES instead of handing a None
# molecule to the fingerprint routine below.
unparsable_smiles = [
    smiles for smiles, mol in zip(smiles_list, mols_list) if mol is None
]
if unparsable_smiles:
    raise ValueError(
        f'RDKit could not parse {len(unparsable_smiles)} SMILES: {unparsable_smiles}'
    )

# One RDKit fingerprint per successfully parsed molecule, same order as smiles_list.
fingerprints_list = [chem.RDKFingerprint(mol) for mol in mols_list]

In [22]:
# Sanity check for missing values: evaluates to True if any entry of
# fingerprints_list is None. The recorded result is False, i.e. no None objects.
# NOTE(review): unparsable SMILES would show up as None in mols_list; whether
# RDKFingerprint can ever return None itself is not visible here - confirm.
any(fingerprint is None for fingerprint in fingerprints_list)

False

In [23]:
# Attach the fingerprint objects to the sample as a new last column named
# 'fingerprint' (one entry per row, in row order).
lense_subset = lense_subset.assign(fingerprint=fingerprints_list)

In [None]:
#lense_subset.iloc[3,16]

In [24]:
# Replace every fingerprint object in the 'fingerprint' column by the string
# returned from its ToBitString() method.
# The column is addressed by name rather than by the hard-coded position 16,
# which breaks as soon as the column layout of the frame changes.
# Entries that are already strings are left untouched, so re-running this
# cell does not fail on previously converted values.
lense_subset['fingerprint'] = [
    fp if isinstance(fp, str) else fp.ToBitString()
    for fp in lense_subset['fingerprint']
]

### Save the output of this notebook

In [25]:
# Write the sampled table to CSV for later use.
# NOTE(review): no index argument is passed, so the row index is presumably
# written as an extra leading column; the loaded input already carries
# 'Unnamed: 0' style columns - confirm whether that is intended.
lense_subset.to_csv(
    path_or_buf='../datasets/Lenselink_et_al/Lenselink_1_Molecular_Notation_Transformation_150samples.csv',
)