# GROVER
Generate GROVER fingerprints for SMILES-drugs coming from LINCS + SciPlex3.

Steps:
1. Load LINCS + SciPlex3, extract SMILES
2. Generate fingerprints using GROVER
3. Save SMILES -> fingerprint mapping as a pandas df.

## Step 1: Get all relevant SMILES from datasets

In [24]:
import scanpy as sc
from rdkit import Chem
import pandas as pd
import numpy as np
from pathlib import Path

In [18]:
def canonicalize(smiles): 
    if smiles:
        return Chem.MolToSmiles(Chem.MolFromSmiles(smiles), canonical=True)
    else:
        return None

In [21]:
datasets_fpath = Path("/home/icb/simon.boehm/Masters_thesis/MT_code/datasets")

for key, dataset in [("SMILES", "trapnell_cpa.h5ad"),
                     ("SMILES", "trapnell_cpa_subset.h5ad"),
                     ("canonical_smiles", "lincs_full_smiles.h5ad")]:
    outpath = Path("data/embeddings/") / dataset.replace(".h5ad", ".csv")
    if outpath.exists():
        print(outpath, "found, skipping")
    else:
        print("Generating", outpath)
        df = sc.read(datasets_fpath / dataset)
        # load all unique SMILES in the dataset and canonicalize them
        canonical_smiles = pd.Series(df.obs[key].unique()).apply(canonicalize).dropna()
        # dump to csv
        canonical_smiles.to_csv(outpath, index=False, header=["smiles"])

data/embeddings/trapnell_cpa.csv found, skipping
Generating data/embeddings/trapnell_cpa_subset.csv
data/embeddings/lincs_full_smiles.csv found, skipping


## Step 2: Generate fingerprints

- TODO: Right now we generate `rdkit_2d_normalized` features. Are these the correct ones?
- TODO: There are pretrained & finetuned models also available, maybe that's useful for us:
    - SIDER: Drug side effect prediction task
    - ClinTox: Drug toxicity prediction task
    - ChEMBL log P prediction task

In [112]:
%%bash
set -euox pipefail
for file in data/embeddings/*.csv; do
    # First we generate the feature embedding for the SMILES, which is an extra input
    # into GROVER
    echo "FILE: $file"
    features=$(echo $file | sed 's:.csv:.npz:')
    if [[ ! -f $features ]]; then
        echo "Generating features: $features"
        python scripts/save_features.py --data_path "$file" \
                                --save_path "$features" \
                                --features_generator rdkit_2d_normalized \
                                --restart
    fi;
    
    # Second we input SMILES + Features into grover and get the fingerprint out
    # 'both' means we get a concatenated fingerprint of combined atoms + bonds features
    outfile=$(echo $file | sed 's:.csv:_grover_base_both.npz:')
    echo "EMB: $outfile"
    if [[ ! -f $outfile ]]; then
        echo "Generating embedding: $outfile"
        python main.py fingerprint --data_path "$file" \
                           --features_path "$features" \
                           --checkpoint_path data/model/base/grover_base.pt \
                           --fingerprint_source both \
                           --output "$outfile"
    fi;
done;

FILE: data/embeddings/lincs_full_smiles.csv
EMB: data/embeddings/lincs_full_smiles_grover_base_large.npz
Generating embedding: data/embeddings/lincs_full_smiles_grover_base_large.npz
Loading data
FILE: data/embeddings/trapnell_cpa.csv
EMB: data/embeddings/trapnell_cpa_grover_base_large.npz
Generating embedding: data/embeddings/trapnell_cpa_grover_base_large.npz
Loading data
FILE: data/embeddings/trapnell_cpa_subset.csv
EMB: data/embeddings/trapnell_cpa_subset_grover_base_large.npz
Generating embedding: data/embeddings/trapnell_cpa_subset_grover_base_large.npz
Loading data


+ for file in 'data/embeddings/*.csv'
+ echo 'FILE: data/embeddings/lincs_full_smiles.csv'
++ echo data/embeddings/lincs_full_smiles.csv
++ sed s:.csv:.npz:
+ features=data/embeddings/lincs_full_smiles.npz
+ [[ ! -f data/embeddings/lincs_full_smiles.npz ]]
++ echo data/embeddings/lincs_full_smiles.csv
++ sed s:.csv:_grover_base_large.npz:
+ outfile=data/embeddings/lincs_full_smiles_grover_base_large.npz
+ echo 'EMB: data/embeddings/lincs_full_smiles_grover_base_large.npz'
+ [[ ! -f data/embeddings/lincs_full_smiles_grover_base_large.npz ]]
+ echo 'Generating embedding: data/embeddings/lincs_full_smiles_grover_base_large.npz'
+ python main.py fingerprint --data_path data/embeddings/lincs_full_smiles.csv --features_path data/embeddings/lincs_full_smiles.npz --checkpoint_path data/model/large/grover_large.pt --fingerprint_source both --output data/embeddings/lincs_full_smiles_grover_base_large.npz
Total size = 17,767
Generating...
Loading pretrained parameter "grover.encoders.edge_blocks.

In [118]:
trapnell_base = np.load("data/embeddings/trapnell_cpa_grover_base_both.npz")
trapnell_large = np.load("data/embeddings/trapnell_cpa_grover_large_both.npz")

In [121]:
print("Shape of GROVER_base embedding:", trapnell_base["fps"].shape)
print("Shape of GROVER_large embedding:", trapnell_large["fps"].shape)

Shape of GROVER_base embedding: (188, 3400)
Shape of GROVER_large embedding: (188, 5000)


## Step 3: Generate DataFrame with SMILES -> Embedding mapping

In [179]:
def flatten(x: np.ndarray):
    assert len(x.shape) == 2 and x.shape[0] == 1
    return x[0]

embeddings_fpath = Path("data/embeddings")
for file in embeddings_fpath.iterdir():
    if str(file).endswith("csv"):
        # read original SMILES list
        df = pd.read_csv(file)
        # read generated embedding (.npz has only one key, 'fps')
        emb = np.load(str(file).replace(".csv", "_grover_base_both.npz"))["fps"]
        assert len(df) == emb.shape[0]
        # generate a DataFrame with SMILES and Embedding in each row
        df["emb_grover_base_both"] = [flatten(x) for x in np.split(emb, emb.shape[0])]
        df.to_parquet(str(file).replace(".csv", ".parquet"))

In [180]:
df = pd.read_parquet("data/embeddings/trapnell_cpa_subset.parquet")

In [181]:
df

Unnamed: 0,smiles,emb_grover_base_both
0,COc1cc2c(cc1O)CC[C@@H]1[C@@H]2CC[C@]2(C)[C@@H]...,"[0.1959595, -0.27820805, 0.15315697, 0.1058221..."
1,COc1cc2c(cc1OCCCN1CCCC1)N=C(N)C21CCC1,"[0.13969353, 0.11307117, 0.17590186, 0.1401735..."
2,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,"[0.21154176, -0.6662256, 0.1576269, 0.1624686,..."
3,Cc1c(NC(=O)OCC2COCCN2)cn2ncnc(Nc3ccc4c(cnn4Cc4...,"[0.10222339, -1.6007254, 0.061119717, 0.095597..."
4,CN(C)Cc1ccc(-c2nc3cccc4c3n2CCNC4=O)cc1,"[0.24495946, -0.6442437, 0.08785248, 0.1380993..."
5,N#C/C(=C\c1ccc(O)c(O)c1)C(=O)NCc1ccccc1,"[0.28735524, 0.22965477, 0.08471221, 0.0982936..."
6,NC(=O)c1ncn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c1N,"[0.19129804, -0.8480568, 0.0032743316, -0.0519..."
7,Cc1csc(-c2nnc(Nc3ccc(Oc4ncccc4-c4ccnc(N)n4)cc3...,"[0.08628379, -1.1257279, 0.01997998, 0.0382080..."
8,CC(C)[C@H](C(=O)Nc1ccc(C(=O)NO)cc1)c1ccccc1,"[0.19964081, -0.51676327, 0.11761741, 0.105078..."
9,O=C(Nc1c[nH]nc1-c1nc2cc(CN3CCOCC3)ccc2[nH]1)NC...,"[0.12986009, -0.7568938, 0.021848459, 0.076668..."
