# GROVER
Generate GROVER fingerprints for the drugs (given as SMILES strings) from LINCS + SciPlex3.

Steps:
1. Load `lincs_trapnell.smiles` as the list of SMILES to be encoded
2. Generate fingerprints using GROVER
3. Save the SMILES -> fingerprint mapping as a pandas DataFrame.

In [1]:
import scanpy as sc
from rdkit import Chem
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
import rdkit
# Display the installed RDKit version so the environment is recorded in the notebook output.
rdkit.__version__

'2019.03.4'

In [4]:
# User-specific settings: adjust these paths before running the notebook.
# NOTE(review): `datasets_fpath` is an absolute, machine-specific path — confirm it is
# still needed; nothing in the cells visible here reads it.
datasets_fpath = Path("/home/icb/simon.boehm/Masters_thesis/MT_code/datasets")
# List of all SMILES to encode; expected one directory above the current working directory.
all_smiles_fpath = Path.cwd().parent.joinpath("lincs_trapnell.smiles")

## Step 1: Generate fingerprints

- TODO: Right now we generate `rdkit_2d_normalized` features. Are these the correct ones?
- TODO: Pretrained and fine-tuned models are also available, which may be useful for us:
    - SIDER: Drug side effect prediction task
    - ClinTox: Drug toxicity prediction task
    - ChEMBL log P prediction task

In [10]:
%%bash
set -euxo pipefail

# NOTE: all paths are relative, so the working directory must contain `main.py`,
# `scripts/save_features.py` and `data/model/grover_base.pt`
# (presumably the root of the GROVER checkout — verify before running).
emb_dir="data/embeddings"
file="$emb_dir/lincs_trapnell.csv"

# copy the list of all SMILES to be encoded into the current workdir (renamed to .csv);
# create the target directory first so that a fresh checkout does not fail on `cp`
mkdir -p "$emb_dir"
cp ../lincs_trapnell.smiles "$file"

# First we generate the feature embedding for the SMILES, which is an extra input
# into GROVER
echo "FILE: $file"
# Strip the literal `.csv` suffix via parameter expansion. The previous
# `sed 's:.csv:.npz:'` treated `.` as "any character" and was not anchored to the
# end of the string, so it could rewrite an unrelated part of the path.
features="${file%.csv}.npz"
# Existing files are reused as a cache: delete them to force regeneration.
if [[ ! -f "$features" ]]; then
    echo "Generating features: $features"
    python scripts/save_features.py --data_path "$file" \
                            --save_path "$features" \
                            --features_generator rdkit_2d_normalized \
                            --restart
fi

# Second we input SMILES + Features into grover and get the fingerprint out
# 'both' means we get a concatenated fingerprint of combined atoms + bonds features
outfile="${file%.csv}_grover_base_both.npz"
echo "EMB: $outfile"
if [[ ! -f "$outfile" ]]; then
    echo "Generating embedding: $outfile"
    python main.py fingerprint --data_path "$file" \
                       --features_path "$features" \
                       --checkpoint_path data/model/grover_base.pt \
                       --fingerprint_source both \
                       --output "$outfile"
fi

FILE: data/embeddings/lincs_trapnell.csv
Generating features: data/embeddings/lincs_trapnell.npz
EMB: data/embeddings/lincs_trapnell_grover_base_both.npz
Generating embedding: data/embeddings/lincs_trapnell_grover_base_both.npz
Loading data


+ mv ../lincs_trapnell.smiles data/embeddings/lincs_trapnell.csv
+ file=data/embeddings/lincs_trapnell.csv
+ echo 'FILE: data/embeddings/lincs_trapnell.csv'
++ echo data/embeddings/lincs_trapnell.csv
++ sed s:.csv:.npz:
+ features=data/embeddings/lincs_trapnell.npz
+ [[ ! -f data/embeddings/lincs_trapnell.npz ]]
+ echo 'Generating features: data/embeddings/lincs_trapnell.npz'
+ python scripts/save_features.py --data_path data/embeddings/lincs_trapnell.csv --save_path data/embeddings/lincs_trapnell.npz --features_generator rdkit_2d_normalized --restart
  0%|          | 0/17869 [00:00<?, ?it/s]  0%|          | 1/17869 [00:01<6:26:26,  1.30s/it]  0%|          | 9/17869 [00:01<4:36:05,  1.08it/s]  0%|          | 17/17869 [00:02<3:20:24,  1.48it/s]  0%|          | 51/17869 [00:02<2:20:31,  2.11it/s]  0%|          | 59/17869 [00:03<1:45:40,  2.81it/s]  0%|          | 81/17869 [00:03<1:15:03,  3.95it/s]  1%|          | 100/17869 [00:03<53:13,  5.56it/s]   1%|          | 108/17869 [00

In [13]:
# Sanity check: open the archive written by the fingerprint step above and report the
# dimensions of its 'fps' array.
lincs_trapnell_base = np.load("data/embeddings/lincs_trapnell_grover_base_both.npz")
fps_shape = lincs_trapnell_base["fps"].shape
print("Shape of GROVER_base embedding:", fps_shape)

Shape of GROVER_base embedding: (17869, 3400)


## Step 2: Generate DataFrame with SMILES -> Embedding mapping

In [14]:
def flatten(x: np.ndarray):
    """Return the single row of a ``(1, n)`` array.

    Args:
        x: Array that must be two-dimensional with exactly one row.

    Returns:
        ``x[0]``, i.e. the only row of ``x``.

    Raises:
        AssertionError: If ``x`` is not two-dimensional or has more or fewer than one row.
    """
    dims = x.shape
    is_single_row_matrix = len(dims) == 2 and dims[0] == 1
    assert is_single_row_matrix
    return x[0]

embeddings_fpath = Path("data/embeddings")
smiles_file = embeddings_fpath / "lincs_trapnell.csv"
emb_file = embeddings_fpath / "lincs_trapnell_grover_base_both.npz"

# read list of smiles (the file must provide a "smiles" column)
smiles_df = pd.read_csv(smiles_file)
assert "smiles" in smiles_df.columns, f"'smiles' column not found in {smiles_file}"

# read generated embedding (.npz has only one key, 'fps');
# the context manager closes the underlying file handle once the array is in memory
with np.load(emb_file) as emb_archive:
    emb = emb_archive["fps"]
assert emb.ndim == 2, f"Expected a 2D fingerprint matrix, got shape {emb.shape}"
assert len(smiles_df) == emb.shape[0], (
    f"Number of SMILES ({len(smiles_df)}) does not match "
    f"number of embeddings ({emb.shape[0]})"
)

# generate a DataFrame with SMILES and Embedding in each row
# NOTE(review): this pairs row i of the CSV with row i of `emb`, i.e. it assumes the
# fingerprint step preserves the input order — only the lengths are checked above.
latent_columns = [f"latent_{i+1}" for i in range(emb.shape[1])]
final_df = pd.DataFrame(emb, index=smiles_df["smiles"].values, columns=latent_columns)
# remove duplicated indices (=SMILES), keeping the first occurrence
# (probably a no-op, the SMILES list is expected to be unique already)
is_duplicate = final_df.index.duplicated(keep="first")
final_df = final_df[~is_duplicate]
final_df.to_parquet(embeddings_fpath / "grover_base.parquet")

In [15]:
# Read the saved mapping back from disk so the checks below run on the stored file.
df = pd.read_parquet(Path("data/embeddings") / "grover_base.parquet")

In [16]:
# Display the DataFrame reloaded from grover_base.parquet.
df

Unnamed: 0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,...,latent_3391,latent_3392,latent_3393,latent_3394,latent_3395,latent_3396,latent_3397,latent_3398,latent_3399,latent_3400
C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1,0.277825,-0.521337,0.039926,0.201971,-0.025537,0.149467,0.131638,-0.322467,-0.397910,-0.610647,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.380367
Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n3n2)[nH]n1,0.028639,-1.048076,0.018409,0.130289,-0.072761,0.171014,0.061133,-0.513127,-0.223625,0.369723,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.282335
Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4)cc[nH]c3=O)nc12,0.165422,-1.011326,0.057309,0.091816,-0.151365,0.221259,0.176238,0.507406,-0.169593,0.426839,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.162371
Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1,0.155207,-0.776301,0.101483,0.110460,-0.037038,0.164413,-0.066388,0.117138,-0.320254,0.458388,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.970045
O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1,0.201639,-0.486690,0.084306,0.112085,-0.117780,-0.014238,-0.002077,0.552523,-0.229843,-0.473847,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.802124
Cc1nnc(C(C)C)n1C1CC2CCC(C1)N2CCC(NC(=O)C1CCC(F)(F)CC1)c1ccccc1,0.159676,-0.576105,0.116720,0.180386,-0.096576,0.342556,0.057665,-0.126280,-0.195197,0.453579,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.321282
NC(=O)c1ncn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c1N,0.191298,-0.848057,0.003274,-0.051926,0.026726,0.185980,-0.152911,4.109511,-0.187004,0.572909,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.220589
N#CCNC(=O)c1ccc(-c2ccnc(Nc3ccc(N4CCOCC4)cc3)n2)cc1,0.180701,-0.642636,0.042626,0.136705,-0.113401,0.021887,0.107432,0.053742,-0.344426,-0.337761,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.502553
C#Cc1cccc(Nc2ncnc3cc(OC)c(OCCCCCCC(=O)NO)cc23)c1,0.126199,-0.809480,0.064750,0.071284,-0.170155,0.304349,0.164865,-0.573351,-0.190654,-0.289925,...,1.593061e-17,5.766101e-14,1.000000e+00,0.168378,0.16738,1.481515e-18,2.324150e-16,9.999997e-01,0.166633,0.048322
O=C1CCC(N2C(=O)c3ccccc3C2=O)C(=O)N1,0.229506,0.199712,0.138227,0.201938,-0.157882,0.216377,-0.047184,1.550199,-0.394683,0.077301,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.713343


## Step 3: Check
Make extra sure the index of the generated DataFrame is correct by reloading our list of canonical SMILES and comparing it with the index.

In [17]:
# Reload the original list of SMILES and verify that the saved DataFrame is indexed by
# exactly these SMILES.
all_smiles_fpath = Path.cwd().parent / "lincs_trapnell.smiles"
all_smiles = pd.read_csv(all_smiles_fpath)["smiles"].values

expected_smiles = pd.Index(all_smiles)
missing_smiles = expected_smiles.difference(df.index)
unexpected_smiles = df.index.difference(expected_smiles)
# Report the offending SMILES instead of failing with a bare AssertionError
assert missing_smiles.empty and unexpected_smiles.empty, (
    f"{len(missing_smiles)} SMILES missing from the embedding index "
    f"(e.g. {list(missing_smiles[:5])}), "
    f"{len(unexpected_smiles)} unexpected SMILES in the embedding index "
    f"(e.g. {list(unexpected_smiles[:5])})"
)
# Same set but different length means one side contains repeated SMILES
assert len(df.index) == len(expected_smiles), (
    f"Length mismatch: {len(df.index)} rows in the embedding DataFrame vs "
    f"{len(expected_smiles)} SMILES in {all_smiles_fpath} (duplicated SMILES?)"
)