In [2]:
# Create the ftudd_chem conda environment from the pinned requirements file.
# --yes pre-answers conda's "Proceed ([y]/n)?" prompt, which a `!` shell call
# in a notebook cannot reliably answer interactively.
!conda create --yes -n ftudd_chem --file requirements_chem.txt -c pytorch -c rdkit -c conda-forge -c rmg
# Then restart the notebook with the new ftudd_chem environment as its kernel

fatal: destination path '../grover' already exists and is not an empty directory.
Collecting boost
  Downloading boost-0.1.tar.gz (6.3 kB)
[31mERROR: Could not find a version that satisfies the requirement boost-cpp (from versions: none)[0m
[31mERROR: No matching distribution found for boost-cpp[0m
Note: you may need to restart the kernel to use updated packages.


### Grover model

Code and pretrained weights are available from here: https://github.com/tencent-ailab/grover
It is the reference implementation of Rong et al., Self-Supervised Graph Transformer on Large-Scale Molecular Data, NeurIPS 2020.

Grover is an instance of a graph neural network. It is trained in a self-supervised way, i.e. from unlabeled training data, and creates an embedding of a molecule. It can be fine-tuned for downstream tasks.

In [10]:
# Fetch the Grover code and the pretrained "large" checkpoint, then make the
# repository importable. Written so the cell can be re-run on an existing checkout.
import sys

# clone Grover repository (skipped when ../grover already exists, where git would abort)
!test -d ../grover || git clone https://github.com/tencent-ailab/grover.git ../grover
# -p: do not fail when the data directory is already there
!mkdir -p ../grover/data/
!wget https://ai.tencent.com/ailab/ml/ml-data/grover-models/pretrain/grover_large.tar.gz -O ../grover/data/grover_large.tar.gz
!tar -xzf ../grover/data/grover_large.tar.gz -C ../grover/data/
# Avoid stacking duplicate entries on sys.path when the cell is executed again
if '../grover' not in sys.path:
    sys.path.append('../grover')

fatal: destination path '../grover' already exists and is not an empty directory.
--2022-03-04 10:38:39--  https://ai.tencent.com/ailab/ml/ml-data/grover-models/pretrain/grover_large.tar.gz
Resolving ai.tencent.com (ai.tencent.com)... 116.128.164.87
Connecting to ai.tencent.com (ai.tencent.com)|116.128.164.87|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 399013496 (381M) [application/octet-stream]
Saving to: ‘../grover/data/grover_large.tar.gz’


2022-03-04 11:14:43 (134 KB/s) - Read error at byte 296691428/399013496 (Operation timed out). Retrying.

--2022-03-04 11:14:44--  (try: 2)  https://ai.tencent.com/ailab/ml/ml-data/grover-models/pretrain/grover_large.tar.gz
Connecting to ai.tencent.com (ai.tencent.com)|116.128.164.87|:443... connected.
HTTP request sent, awaiting response... 206 Partial Content
Length: 399013496 (381M), 102322068 (98M) remaining [application/octet-stream]
Saving to: ‘../grover/data/grover_large.tar.gz’

../grover/data/grov 100%[++++

In [11]:
# Grover has a command line interface, let us use it to generate an embedding of a sample molecule
smiles = 'CC(=O)O'
with open('test_smiles.csv', 'w') as f:
    # Write a header row before the molecule and terminate the record with a newline.
    # NOTE(review): Grover's CSV loader appears to skip the first line as a header,
    # so a header-less file would drop the only molecule - confirm against the Grover sources.
    f.write('smiles\n' + smiles + '\n')

In [23]:
%%bash
# Run Grover's `predict` sub-command on the sample CSV, reading checkpoints from
# ../grover/data/ and writing the result to data_pre.csv.
python ../grover/main.py predict \
    --data_path test_smiles.csv \
    --checkpoint_dir ../grover/data/ \
    --no_features_scaling \
    --output data_pre.csv

/opt/anaconda3/envs/ftudd_chem/bin/python


Traceback (most recent call last):
  File "../grover/main.py", line 54, in <module>
    avg_preds, test_smiles = make_predictions(args, train_args)
  File "/Users/m275696/Library/CloudStorage/OneDrive-MerckGroup/Talks/2022_FTUDD_DeepLearning/grover/task/predict.py", line 91, in make_predictions
    torch.cuda.set_device(args.gpu)
  File "/opt/anaconda3/envs/ftudd_chem/lib/python3.6/site-packages/torch/cuda/__init__.py", line 265, in set_device
    torch._C._cuda_setDevice(device)
AttributeError: module 'torch._C' has no attribute '_cuda_setDevice'


CalledProcessError: Command 'b'echo `which python`\npython ../grover/main.py predict --data_path test_smiles.csv --checkpoint_dir ../grover/data/ --no_features_scaling --output data_pre.csv --no_cuda\n'' returned non-zero exit status 1.

### ChemBERTa model

ChemBERTa is based on the BERT NLP model and treats SMILES strings as text to be modeled. Most NLP models are conveniently wrapped by the Hugging Face Transformers library, so we can use its API here. Further details on ChemBERTa can be found in the paper:

Chithrananda et al., ChemBERTa: Large-Scale Self-Supervised Pretraining for Molecular Property Prediction, arXiv 2020

or on GitHub: https://github.com/seyonechithrananda/bert-loves-chemistry

In [7]:
# Load the pretrained ChemBERTa checkpoint (weights and matching tokenizer)
# by name through the transformers Auto* classes.
from transformers import AutoModelForMaskedLM, AutoTokenizer

chemberta_model_name = 'seyonec/ChemBERTa-zinc-base-v1'
chemberta_model = AutoModelForMaskedLM.from_pretrained(chemberta_model_name)
chemberta_tokenizer = AutoTokenizer.from_pretrained(chemberta_model_name)

In [9]:
import torch
def embed_smiles(smiles, tokenizer, model, layers):
    """
    Returns the embedding of one SMILES string, or of each string in a batch.

    The hidden states of every requested layer are averaged over the token
    axis (padding positions excluded via the attention mask) and the per-layer
    averages are concatenated in the order given by ``layers``.

    Parameters
    ----------
    smiles : str or sequence of str
        A single SMILES string, or several strings to embed in one forward pass.
    tokenizer : callable
        Called as ``tokenizer(list_of_str, return_tensors='pt', padding=True)``;
        its result must be a mapping that contains ``'attention_mask'``.
    model : callable
        Called as ``model(**tokenized, output_hidden_states=True)``; its result
        must expose ``hidden_states``, indexable by the entries of ``layers``.
    layers : sequence of int
        Indices into ``hidden_states`` (e.g. ``[-1]`` for the last layer).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(layers) * hidden_size`` for a single string;
        2-D array with one such row per molecule for a sequence of strings.

    Raises
    ------
    ValueError
        If an empty sequence of SMILES strings is passed.
    """
    single_input = isinstance(smiles, str)
    batch = [smiles] if single_input else list(smiles)
    if not batch:
        raise ValueError("embed_smiles needs at least one SMILES string")

    # Get the tokenized input; padding lets strings of different length share a batch
    tokenized_input = tokenizer(batch, return_tensors='pt', padding=True)

    # Get the hidden states without tracking gradients
    with torch.no_grad():
        output = model(**tokenized_input, output_hidden_states=True)

    # (batch, seq) -> (batch, seq, 1) so the mask broadcasts over the hidden axis
    mask = tokenized_input['attention_mask'].unsqueeze(-1)
    pooled_layers = []
    for layer in layers:
        hidden = output.hidden_states[layer]
        weights = mask.to(hidden.dtype)
        # Mean over real tokens only: padded positions contribute nothing.
        # Averaging per molecule (not over the batch axis) keeps rows separate.
        pooled_layers.append((hidden * weights).sum(dim=1) / weights.sum(dim=1))

    # Concatenate the per-layer means along the feature axis
    embeddings = torch.cat(pooled_layers, dim=-1).numpy()
    return embeddings[0] if single_input else embeddings

# Smoke test: embed one sample SMILES from the last hidden layer, then show the
# embedding's shape and its first ten components.
test_embedding = embed_smiles('CC(=O)O', chemberta_tokenizer, chemberta_model, layers=[-1])
print(test_embedding.shape, test_embedding[:10], sep='\n')

(768,)
[ 0.30163023  0.50087255 -0.67029685 -1.5062698   0.09748616 -0.6335993
 -0.02147115  0.14238326 -1.3668206   0.44067818]


In [None]:
# Read the AqSolDB table, preview its first rows, and pull the SMILES and
# Solubility columns out as arrays. Note: `smiles` is rebound here and no longer
# holds the single sample string assigned earlier in the notebook.
import pandas as pd
df_aqsol = pd.read_csv('curated-solubility-dataset.csv')
print(df_aqsol.head(4))
smiles, targets = (df_aqsol[column].values for column in ('SMILES', 'Solubility'))

### Your tasks
1. Create embeddings for the molecules in the AqSolDB dataset using both the pretrained ChemBERTa model and the pretrained Grover model
2. Train a suitable scikit-learn model on top of these embeddings to predict the solubility
3. Experiment with this setting and summarize your findings

### The advanced stuff
4. Fine-tune Grover and ChemBERTa on the AqSolDB solubility prediction task
5. Experiment and summarize your findings