# GROVER
Generate GROVER fingerprints for the drugs (given as SMILES strings) in the LINCS and SciPlex3 datasets.

Steps:
1. Load LINCS + SciPlex3, extract SMILES
2. Generate fingerprints using GROVER
3. Save SMILES -> fingerprint mapping as a pandas df.

## Step 1: Get all relevant SMILES from datasets

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
from rdkit import Chem

In [2]:
def canonicalize(smiles):
    """Return the canonical form of a SMILES string, or ``None`` if there is none.

    Args:
        smiles: SMILES string to canonicalize. Missing entries (``None``, NaN or
            any other non-string value) and the empty string count as "no SMILES".

    Returns:
        The result of ``Chem.CanonSmiles(smiles)``, or ``None`` for missing input.
    """
    # NaN is a truthy float, so a plain truthiness test would pass it on to RDKit
    if not isinstance(smiles, str) or not smiles:
        return None
    # this canonicalizes the SMILES while preserving the information about chirality
    return Chem.CanonSmiles(smiles)

In [3]:
# SET: directory that contains the input .h5ad datasets.
# Set the GROVER_DATASETS_DIR environment variable to point somewhere else without
# editing the notebook; if it is unset, the path below is used.
datasets_fpath = Path(
    os.environ.get(
        "GROVER_DATASETS_DIR",
        "/home/icb/simon.boehm/Masters_thesis/MT_code/datasets",
    )
)

In [4]:
# Collect the canonical SMILES of each dataset. `canon_smiles` gets one entry per
# dataset regardless of whether its CSV is generated now or was already on disk,
# so a re-run of the notebook still sees every dataset.
canon_smiles = []
for key, dataset in [("SMILES", "trapnell_cpa.h5ad"),
                     ("canonical_smiles", "lincs_full_smiles.h5ad")]:
    outpath = Path("data/embeddings/") / dataset.replace(".h5ad", ".csv")
    if outpath.exists():
        print(outpath, "found, skipping")
        # reuse the SMILES written on an earlier run instead of leaving this dataset out
        canonical_smiles = pd.read_csv(outpath)["smiles"]
    else:
        print("Generating", outpath)
        adata = sc.read(datasets_fpath / dataset)
        # load all unique SMILES in the dataset and canonicalize them
        canonical_smiles = (
            pd.Series(adata.obs[key].unique())
            .apply(canonicalize)
            .drop_duplicates(keep="first")
            .dropna()
        )
        # dump to csv; create the output directory first in case it does not exist yet
        outpath.parent.mkdir(parents=True, exist_ok=True)
        canonical_smiles.to_csv(outpath, index=False, header=["smiles"])
    canon_smiles.append(canonical_smiles)

Generating data/embeddings/trapnell_cpa.csv
Generating data/embeddings/lincs_full_smiles.csv


In [23]:
# Stack the per-dataset SMILES series into a single series
merged_smiles = pd.concat(canon_smiles)
non_dedup_length = len(merged_smiles)
# A SMILES may occur in more than one dataset; only its first occurrence is kept
all_smiles = merged_smiles.drop_duplicates()
n_dropped = non_dedup_length - len(all_smiles)
print(f"Dropped {n_dropped} SMILES, now {len(all_smiles)} remaining")
# Write the combined list as a single-column file with a "smiles" header
all_smiles.to_csv(Path("data/embeddings") / "lincs_trapnell.smiles", index=False, header=["smiles"])

Dropped 84 SMILES, now 17868 remaining


## Step 2: Generate fingerprints

- TODO: Right now we generate `rdkit_2d_normalized` features as the extra input to GROVER. Are these the correct ones?
- TODO: Pretrained and fine-tuned models are also available, which may be useful for us:
    - SIDER: Drug side effect prediction task
    - ClinTox: Drug toxicity prediction task
    - ChEMBL log P prediction task

In [6]:
%%bash
set -euox pipefail
for file in data/embeddings/*.csv; do
    # First we generate the feature embedding for the SMILES, which is an extra input
    # into GROVER
    echo "FILE: $file"
    # Strip only the trailing literal ".csv". A sed pattern such as 's:.csv:...:'
    # treats "." as a wildcard and matches the first hit anywhere in the path.
    base="${file%.csv}"
    features="${base}.npz"
    if [[ ! -f "$features" ]]; then
        echo "Generating features: $features"
        python scripts/save_features.py --data_path "$file" \
                                --save_path "$features" \
                                --features_generator rdkit_2d_normalized \
                                --restart
    fi;
    
    # Second we input SMILES + Features into grover and get the fingerprint out
    # 'both' means we get a concatenated fingerprint of combined atoms + bonds features
    outfile="${base}_grover_base_both.npz"
    echo "EMB: $outfile"
    if [[ ! -f "$outfile" ]]; then
        echo "Generating embedding: $outfile"
        python main.py fingerprint --data_path "$file" \
                           --features_path "$features" \
                           --checkpoint_path data/model/grover_base.pt \
                           --fingerprint_source both \
                           --output "$outfile"
    fi;
done;

FILE: data/embeddings/lincs_full_smiles.csv
Generating features: data/embeddings/lincs_full_smiles.npz
EMB: data/embeddings/lincs_full_smiles_grover_base_both.npz
Generating embedding: data/embeddings/lincs_full_smiles_grover_base_both.npz
Loading data
FILE: data/embeddings/trapnell_cpa.csv
Generating features: data/embeddings/trapnell_cpa.npz
EMB: data/embeddings/trapnell_cpa_grover_base_both.npz
Generating embedding: data/embeddings/trapnell_cpa_grover_base_both.npz
Loading data


+ for file in 'data/embeddings/*.csv'
+ echo 'FILE: data/embeddings/lincs_full_smiles.csv'
++ echo data/embeddings/lincs_full_smiles.csv
++ sed s:.csv:.npz:
+ features=data/embeddings/lincs_full_smiles.npz
+ [[ ! -f data/embeddings/lincs_full_smiles.npz ]]
+ echo 'Generating features: data/embeddings/lincs_full_smiles.npz'
+ python scripts/save_features.py --data_path data/embeddings/lincs_full_smiles.csv --save_path data/embeddings/lincs_full_smiles.npz --features_generator rdkit_2d_normalized --restart
  0%|          | 0/17763 [00:00<?, ?it/s]  0%|          | 1/17763 [00:00<1:57:59,  2.51it/s]  0%|          | 3/17763 [00:00<1:33:43,  3.16it/s]  0%|          | 22/17763 [00:00<1:06:27,  4.45it/s]  0%|          | 31/17763 [00:01<49:48,  5.93it/s]    0%|          | 36/17763 [00:01<36:42,  8.05it/s]  0%|          | 40/17763 [00:01<29:03, 10.17it/s]  0%|          | 52/17763 [00:01<21:44, 13.58it/s]  0%|          | 67/17763 [00:01<15:48, 18.66it/s]  0%|          | 75/17763 [00:01<

In [7]:
# Sanity check: look at the dimensions of the "fps" array stored for the Trapnell SMILES
trapnell_base = np.load("data/embeddings/trapnell_cpa_grover_base_both.npz")
fps_shape = trapnell_base["fps"].shape
print("Shape of GROVER_base embedding:", fps_shape)

Shape of GROVER_base embedding: (189, 3400)


## Step 3: Generate DataFrame with SMILES -> Embedding mapping

In [11]:
def flatten(x: np.ndarray):
    """Return the single row of a 2-D array whose first dimension has length 1.

    Raises:
        AssertionError: if ``x`` is not two-dimensional or has more than one row.
    """
    n_dims = x.ndim
    assert n_dims == 2 and x.shape[0] == 1
    only_row = x[0]
    return only_row

embeddings_fpath = Path("data/embeddings")
final_dfs = []
# sorted() fixes the processing order, so that which of two duplicated SMILES rows
# survives the de-duplication below does not depend on directory listing order
for file in sorted(embeddings_fpath.glob("*.csv")):
    # read original SMILES list
    smiles_df = pd.read_csv(file)
    # read generated embedding (.npz has only one key, 'fps'); build the name from the
    # file stem so that only the suffix of the file name is swapped
    emb_path = file.with_name(f"{file.stem}_grover_base_both.npz")
    with np.load(emb_path) as npz:
        emb = npz["fps"]
    assert len(smiles_df) == emb.shape[0], (
        f"{file}: {len(smiles_df)} SMILES but {emb.shape[0]} embeddings in {emb_path}"
    )
    # generate a DataFrame with SMILES and Embedding in each row
    file_df = pd.DataFrame(
        emb,
        index=smiles_df["smiles"].values,
        columns=[f"latent_{i+1}" for i in range(emb.shape[1])],
    )
    final_dfs.append(file_df)
# join into one dataframe
final_df = pd.concat(final_dfs)
# remove duplicates indices (=SMILES)
final_df = final_df[~final_df.index.duplicated(keep="first")]
final_df.to_parquet(embeddings_fpath / "grover_base.parquet")

In [12]:
# Load the saved SMILES -> embedding table back from the parquet file
df = pd.read_parquet("data/embeddings/grover_base.parquet")

In [13]:
# Display the loaded table (index: SMILES, columns: latent_1 ... latent_N)
df

Unnamed: 0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,...,latent_3391,latent_3392,latent_3393,latent_3394,latent_3395,latent_3396,latent_3397,latent_3398,latent_3399,latent_3400
CCC1(c2ccc(N)cc2)CCC(=O)NC1=O,0.207209,0.386310,0.139066,0.166840,-0.131474,0.453847,0.102388,2.153744,-0.350290,0.184963,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.501142
O=C(O)CCc1nc(-c2ccccc2)c(-c2ccccc2)o1,0.219819,-0.434457,0.109395,0.097777,-0.117656,0.179062,0.190519,-0.491964,-0.433697,-0.172840,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.795347
CNC(=O)Oc1ccc2c(c1)[C@]1(C)CCN(C)[C@@H]1N2C,0.173832,-0.588151,0.091966,0.099563,-0.144548,-0.029285,0.011446,-0.920698,-0.217348,-0.340028,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.916193
CC1(CN2CCC(n3c(=O)[nH]c4ccccc43)CC2)OCc2ccccc2-n2cccc21,0.169841,-1.065105,0.088725,0.165560,-0.129059,0.143544,0.124802,0.449721,-0.312256,-0.178750,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.402205
CC(C)C[C@@H](NC(=O)[C@@H](Cc1ccccc1)NC(=O)C1=CNC=CN1)B(O)O,0.035389,-0.221675,-0.138036,-0.063436,-0.091243,-0.092101,0.223962,0.949345,-0.121366,-0.257086,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.154427
CCC(=O)O[C@]1(C(=O)CCl)[C@@H](C)CC2C3CCC4=CC(=O)C=C[C@]4(C)[C@@]3(F)[C@@H](O)C[C@@]21C,0.205830,0.070576,0.165544,0.117492,-0.041380,0.558164,0.026682,-0.184504,-0.055854,1.378448,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.353871
CCCNCC(O)COc1ccccc1C(=O)CCc1ccccc1,0.228656,-0.473071,0.133487,0.129523,-0.120301,0.130344,0.116100,0.804645,-0.292522,0.392525,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,9.607067e-01,0.166633,0.338295
CC(C)Cn1cnc2c(N)nc3ccccc3c21,0.210773,-1.671837,0.042369,-0.040420,-0.042260,0.079906,0.222184,1.159440,-0.398418,-0.377411,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.757490
COC(=O)C1=C(C)NC(C)=C(C(=O)OCCN(C)Cc2ccccc2)C1c1cccc([N+](=O)[O-])c1,0.186040,-0.436950,0.210406,0.202431,-0.038688,0.022598,-0.038553,-0.955446,-0.094300,0.175068,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.157686
COc1c(O[C@@H]2O[C@H](CO)[C@@H](O)[C@H](O)[C@H]2O)cc2c(c1OC)-c1ccc(SC)c(=O)cc1[C@@H](NC(C)=O)CC2,0.121002,-0.640593,0.116678,0.082076,-0.115816,0.099828,-0.075212,0.053935,0.105860,1.249538,...,1.593061e-17,5.766101e-14,2.957989e-11,0.168378,0.16738,1.481515e-18,2.324150e-16,4.703598e-08,0.166633,0.133033


In [20]:
# The index of the saved table and the combined SMILES list must hold the same
# entries (compared as sorted lists)
embedded_smiles = sorted(df.index)
expected_smiles = sorted(all_smiles)
assert embedded_smiles == expected_smiles