# Hands On Tutorial 3: Multitask Deep Learning for Drug Discovery

Implementation of the model described in https://arxiv.org/pdf/1502.02072.pdf

### Imports and parameters

In [1]:
import keras
import keras.backend as K
from keras.models import Model
from keras.layers import Input, Dense, Conv1D, LSTM, Dropout
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.optimizers import SGD
from keras.losses import MSE

import pandas as pd
import numpy as np
import zipfile
import requests
import shutil
import os

from helpers.extra_metrics import explained_variance_score, r2_score

Using TensorFlow backend.


In [None]:
# Hyperparameters shared by the data-preparation and training cells below.
MAX_LEN = 120          # sequence length of the model input (SMILES are zero-padded to this)
VAL_SIZE = 20000       # number of trailing dataset rows held out for validation
LEARNING_RATE = 1e-3   # learning rate handed to the SGD optimizer
EPOCHS = 50            # upper bound on training epochs (early stopping may end sooner)
BATCH_SIZE = 64        # batch size passed to model.fit

### Dataset preparation

We will use the [QM9 dataset](http://quantum-machine.org/datasets/):
* L. Ruddigkeit, R. van Deursen, L. C. Blum, J.-L. Reymond, Enumeration of 166 billion organic small molecules in the chemical universe database GDB-17, J. Chem. Inf. Model. 52, 2864–2875, 2012.
* R. Ramakrishnan, P. O. Dral, M. Rupp, O. A. von Lilienfeld, Quantum chemistry structures and properties of 134 kilo molecules, Scientific Data 1, 140022, 2014. [bibtex]

The QM9 dataset contains 133885 organic molecules, represented as SMILES strings, with 15 properties each. We will train a regression model to predict all 15 properties in a multitask setting.

In [None]:
# Where the zipped QM9 pickle is saved locally, and the remote location it is fetched from.
file_path = 'datasets/qm9.pkl.zip'
url = 'https://s3.us-east-2.amazonaws.com/weizmann-dl-workshop/data_qm9.pkl.zip'

In [None]:
# Create the target directory; exist_ok=True lets this cell be re-run without raising.
os.makedirs('datasets/', exist_ok=True)

# NOTE(review): the auth tuple looks like a placeholder and verify=False turns off TLS
# certificate verification -- both are kept from the original; confirm they are needed.
with requests.get(url, auth=('usrname', 'password'), verify=False, stream=True) as r:
    # Fail loudly on an HTTP error instead of saving the error body as the archive.
    r.raise_for_status()
    # Have urllib3 undo any Content-Encoding so the raw stream yields the file bytes.
    r.raw.decode_content = True
    with open(file_path, 'wb') as f:
        shutil.copyfileobj(r.raw, f)

In [None]:
""" ONLY IF THE PREVIOUS FAILED TO DOWNLOAD """
# Shell fallback for the download: make sure the datasets directory exists, then fetch
# $url into $file_path with wget, skipping certificate checks (--no-check-certificate).
!mkdir -p datasets
!wget --no-check-certificate $url -O $file_path

In [None]:
# Unpack every member of the downloaded archive into the datasets directory.
with zipfile.ZipFile(file_path) as archive:
    archive.extractall(path='datasets/')

In [None]:
# Load the extracted pickle and arrange it as one row per molecule.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
data = pd.read_pickle('datasets/data_qm9.pkl')
labels = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']

# Column `col` of data['properties'] holds the values of the property named labels[col].
properties = {name: data['properties'][:, col] for col, name in enumerate(labels)}
properties['smiles'] = data['smiles_optimized']

# Build the frame with the SMILES column first, followed by the properties in label order.
df = pd.DataFrame(properties, columns=['smiles'] + labels)
df.head()

#### Task 1: Convert the SMILES strings into a one-hot representation and add it to the dataframe in a column named 'one_hot'
  
hint: use zero padding to make the length equal to MAX_LEN

In [None]:
"""
Convert SMILES to one-hot here
"""

In [None]:
# split to train/validation sets
x_train = df.one_hot.values[:-VAL_SIZE]
x_val = df.one_hot.values[-VAL_SIZE:]

y_train = df[labels].values[:-VAL_SIZE]
y_val = df[labels].values[-VAL_SIZE:]

### Building Model Architecture

#### Task 2: define model architecture

In [None]:
# Model input: sequences of MAX_LEN steps; the per-step feature size is left as None.
# NOTE(review): presumably the feature size should be the one-hot alphabet size produced
# in Task 1, since layers that build weights from the last dimension need a concrete
# value -- confirm and replace None accordingly.
inp = Input(shape=(MAX_LEN, None))

In [None]:
"""
Build your model here.
Eventually the variable 'outputs' should be a list of all the model outputs, one for each task.
"""

In [None]:
# Wrap the input tensor and the Task 2 `outputs` into a Keras Model.
model = Model(inputs=inp, outputs=outputs)

### Compile model and set loss function and metrics


#### Task 3: define loss function

In [None]:
"""
Define your loss function here
"""
# TODO(Task 3): replace the Ellipsis placeholder; this value is passed to model.compile(loss=...).
loss = ...

In [None]:
# define metrics
metrics = [MSE, explained_variance_score, r2_score]

# Nesterov acceleration is a correction applied to the momentum term, so nesterov=True
# has no effect with SGD's default momentum of 0. Use a non-zero momentum so the
# requested Nesterov update actually takes place.
MOMENTUM = 0.9

# compile model
model.compile(loss=loss,
              metrics=metrics,
              optimizer=SGD(lr=LEARNING_RATE, momentum=MOMENTUM, nesterov=True))

### Model Training


In [None]:
# Training callbacks: scale the learning rate by 0.2 after 4 epochs without improvement
# of its monitored quantity, and stop training after 10 epochs without val_loss improving.
lr_schedule = ReduceLROnPlateau(factor=.2, patience=4, verbose=1)
early_stop = EarlyStopping(monitor='val_loss', patience=10)
callbacks = [lr_schedule, early_stop]

In [None]:
def split_targets(y, n_outputs):
    """Lay out a target matrix the way Keras expects for a model with several outputs.

    Args:
        y: 2-D array of targets with one column per task.
        n_outputs: number of output tensors of the model being trained.

    Returns:
        ``y`` unchanged when the model has a single output; otherwise a list of
        ``n_outputs`` arrays obtained by splitting ``y`` column-wise into equal parts.

    Raises:
        ValueError: if the columns of ``y`` cannot be divided evenly among the outputs.
    """
    if n_outputs == 1:
        return y
    return np.split(y, n_outputs, axis=1)


# train model
# A model built with a list of outputs needs one target array per output, so the
# (n_samples, n_tasks) matrices are split to match the number of output heads.
n_outputs = len(model.outputs)
# Keep the History object so the per-epoch losses and metrics can be inspected later.
history = model.fit(x=x_train,
                    y=split_targets(y_train, n_outputs),
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    callbacks=callbacks,
                    validation_data=(x_val, split_targets(y_val, n_outputs)))

### Model Evaluation


In [None]:
# Run the trained model on the validation inputs.
y_pred = model.predict(x=x_val)

In [None]:
def evaluate_metric(metric_fn, y_true, y_pred):
    """Evaluate a Keras-style metric on NumPy arrays and return a Python float.

    Args:
        metric_fn: callable accepting ``y_true`` and ``y_pred`` keyword arguments and
            returning a backend tensor (the same contract as the functions passed to
            ``model.compile(metrics=...)``).
        y_true: array of reference values.
        y_pred: array of predictions with the same shape as ``y_true``.

    Returns:
        The mean of the metric tensor as a float.
    """
    # K.constant converts both arrays to the backend float type, so inputs of
    # different float precision can be combined inside the metric.
    value = metric_fn(y_true=K.constant(y_true), y_pred=K.constant(y_pred))
    # A metric may yield one value per sample (MSE does); average it and evaluate
    # the tensor so it can be formatted with %f.
    return float(K.eval(K.mean(value)))


# A model with several outputs predicts a list of arrays (one per output); place them
# side by side so that the columns line up with those of y_val.
if isinstance(y_pred, list):
    y_pred_matrix = np.concatenate([np.reshape(p, (len(p), -1)) for p in y_pred], axis=1)
else:
    y_pred_matrix = np.asarray(y_pred)

print('##### Model Evaluation #####')
print('Mean Squared Error: %f' % evaluate_metric(MSE, y_val, y_pred_matrix))
print('Explained Variance: %f' % evaluate_metric(explained_variance_score, y_val, y_pred_matrix))
print('R2 Score: %f' % evaluate_metric(r2_score, y_val, y_pred_matrix))