In [1]:
from pathlib import Path

import scanpy as sc
from rdkit import Chem
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
dest_dir = Path("./data")
dest_dir.mkdir(parents=True, exist_ok=True)

Create the GROVER embeddings

In [3]:
%%bash
set -euox pipefail

# move csv of all smiles to be encoded into current workdir
cp ../../datasets/sciplex/sciplex.smiles ./data/preliminary_embeddings/sciplex_smiles.csv
file="data/preliminary_embeddings/sciplex_smiles.csv"

# First we generate the feature embedding for the SMILES, which is an extra input
# into GROVER
echo "FILE: $file"
features=$(echo $file | sed 's:.csv:.npz:')
if [[ ! -f $features ]]; then
    echo "Generating features: $features"
    python scripts/save_features.py --data_path "$file" \
                            --save_path "$features" \
                            --features_generator rdkit_2d_normalized \
                            --restart
fi;

# Second we input SMILES + Features into grover and get the fingerprint out
# 'both' means we get a concatenated fingerprint of combined atoms + bonds features
outfile=$(echo $file | sed 's:.csv:_grover_base_both.npz:')
echo "EMB: $outfile"
if [[ ! -f $outfile ]]; then
    echo "Generating embedding: $outfile"
    python main.py fingerprint --data_path "$file" \
                       --features_path "$features" \
                       --checkpoint_path data/checkpoints/grover_base.pt \
                       --fingerprint_source both \
                       --output "$outfile"
fi;


+ cp ../../datasets/sciplex/sciplex.smiles ./data/preliminary_embeddings/sciplex_smiles.csv
+ file=data/preliminary_embeddings/sciplex_smiles.csv
+ echo 'FILE: data/preliminary_embeddings/sciplex_smiles.csv'


FILE: data/preliminary_embeddings/sciplex_smiles.csv


++ sed s:.csv:.npz:
++ echo data/preliminary_embeddings/sciplex_smiles.csv
+ features=data/preliminary_embeddings/sciplex_smiles.npz
+ [[ ! -f data/preliminary_embeddings/sciplex_smiles.npz ]]
++ sed s:.csv:_grover_base_both.npz:
++ echo data/preliminary_embeddings/sciplex_smiles.csv
+ outfile=data/preliminary_embeddings/sciplex_smiles_grover_base_both.npz
+ echo 'EMB: data/preliminary_embeddings/sciplex_smiles_grover_base_both.npz'
+ [[ ! -f data/preliminary_embeddings/sciplex_smiles_grover_base_both.npz ]]


EMB: data/preliminary_embeddings/sciplex_smiles_grover_base_both.npz


Load the embeddings and save them in standard format 

In [4]:
sciplex_embeddings = np.load("data/preliminary_embeddings/sciplex_smiles_grover_base_both.npz")
print("Shape of GROVER_base embedding:", sciplex_embeddings["fps"].shape)

Shape of GROVER_base embedding: (188, 3400)


Convert embeddings into data frame and save

In [5]:
def flatten(x: np.ndarray):
    assert len(x.shape) == 2 and x.shape[0] == 1
    return x[0]

embeddings_fpath = Path("data/preliminary_embeddings")
smiles_file = embeddings_fpath / "sciplex_smiles.csv"
emb_file = embeddings_fpath / "sciplex_smiles_grover_base_both.npz"

# read list of smiles 
smiles_df = pd.read_csv(smiles_file)
# read generated embedding (.npz has only one key, 'fps')
emb = np.load(emb_file)["fps"]
assert len(smiles_df) == emb.shape[0]

# generate a DataFrame with SMILES and Embedding in each row
final_df = pd.DataFrame(emb, index=smiles_df["SMILES"].values, columns=[f"latent_{i+1}" for i in range(emb.shape[1])])
# remove duplicates indices (=SMILES) (This is probably useless)
final_df = final_df[~final_df.index.duplicated(keep="first")]

In [6]:
df = final_df.to_csv(dest_dir / "embeddings.csv")