<a href="https://colab.research.google.com/github/yala/deeplearning_bootcamp/blob/master/lab4/final_property_prediction_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLPs, RNNs, and CNNs for Property Prediction

In this exercise, you'll implement a multi-layer perceptron (MLP), a recurrent neural network (RNN), and a convolutional neural network (CNN) to predict log p from SMILES strings.

Let's get started!

# Preliminaries

The next few sections will set up the necessary components of the exercise, including:


1.   Installing PyTorch
2.   Importing dependencies
3.   Downloading and processing data
4.   Defining training and evaluation procedures



## Download PyTorch

In [0]:
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag

platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
accelerator = 'cu100' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

print(f'Platform = {platform}, Accelerator = {accelerator}')

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-1.1.0-{platform}-linux_x86_64.whl
!pip install -q torchvision

import torch
print(f'Torch version = {torch.__version__}')
print(f'Cuda available = {torch.cuda.is_available()}')
print(f'Cuda version = {torch.version.cuda}')
print(f'Cuda device = {torch.cuda.get_device_name(0)}')

## Imports

In [0]:
import argparse
from collections import Counter
import csv
import pickle
import re

import math
import numpy as np
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

## Download and Process Data

In [0]:
# Install wget, then download the delaney_{train,val,test}.csv files that
# get_data() opens by relative path.
!apt-get install wget
!wget https://raw.githubusercontent.com/yala/introML_chem/master/lab1/data/chem/delaney_train.csv
!wget https://raw.githubusercontent.com/yala/introML_chem/master/lab1/data/chem/delaney_val.csv
!wget https://raw.githubusercontent.com/yala/introML_chem/master/lab1/data/chem/delaney_test.csv

def get_data(split):
    """Load one split of the dataset from ``delaney_<split>.csv``.

    The first row of the file is treated as a header and skipped. For every
    remaining non-blank row, column 0 is taken as the SMILES string and
    column 1 is parsed as the float target.

    Args:
        split: Name of the split, used to build the file name (e.g. 'train').

    Returns:
        A ``(smiles, Y)`` tuple of parallel lists: SMILES strings and float
        targets.

    Raises:
        FileNotFoundError: If the CSV file for ``split`` does not exist.
        ValueError: If a target cell cannot be parsed as a float.
    """
    data_path = 'delaney_{}.csv'.format(split)
    smiles, Y = [], []

    # newline='' lets the csv module handle line endings itself.
    with open(data_path, newline='') as f:
        data = csv.reader(f)

        # Skip header (the default avoids StopIteration on an empty file)
        next(data, None)

        # Get smiles and targets
        for row in data:
            # Blank lines are returned as empty rows; skip them instead of
            # failing on row[0].
            if not row:
                continue
            smiles.append(row[0])
            Y.append(float(row[1]))

    return smiles, Y

# Load the three splits from their CSV files
trainSmiles, trainY = get_data('train')
devSmiles, devY = get_data('val')
testSmiles, testY = get_data('test')

# Pool all SMILES strings from the three splits into one list
allSmiles = [*trainSmiles, *devSmiles, *testSmiles]

# Report the size of each split and show one example
num_train, num_dev, num_test = len(trainSmiles), len(devSmiles), len(testSmiles)
print(f'Num Train = {num_train:,}')
print(f'Num Dev   = {num_dev:,}')
print(f'Num Test  = {num_test:,}')
print()
first_smiles, first_logp = trainSmiles[0], trainY[0]
print(f'Example data point: smiles = {first_smiles}, logp = {first_logp}')

## Dataset Class

In [0]:
class PropertyPredictionDataset(Dataset):
    """Dataset pairing each input ``X[i]`` with its target ``Y[i]``.

    ``X`` and ``Y`` must have the same length. Item ``i`` is returned as
    ``(np.array(X[i]), Y[i])``.
    """

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        assert len(X) == len(Y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        features = np.array(self.X[i])
        label = self.Y[i]
        return features, label

## Model and Training Settings




After building your MLP, RNN, and CNN below, return to this section and experiment with different values to see how they affect training and model performance.

In [0]:
# Settings shared by the MLP, RNN and CNN cells below.
batch_size = 64  # batch size given to every DataLoader
epochs = 10  # number of train/dev passes in each training loop
lr = 1e-3  # learning rate given to Adam
weight_decay = 1e-4  # weight_decay given to Adam
max_len = 100  # index sequences are truncated/padded to this length
embedding_size = 300  # passed to each model constructor
hidden_size = 300  # passed to each model constructor
output_size = 1  # do not modify
dropout = 0.6  # passed to each model constructor
use_cuda = True  # move the model to the GPU when CUDA is available

## Utility Functions

In [0]:
def param_count(model):
    """Return the total element count of the model's trainable parameters."""
    total = 0
    for tensor in model.parameters():
        # Parameters with requires_grad=False are not counted
        if tensor.requires_grad:
            total += tensor.numel()
    return total
  
def rmse(targets, preds):
    """Return the square root of ``mean_squared_error(targets, preds)``."""
    mse = mean_squared_error(targets, preds)
    return math.sqrt(mse)

## Training Procedure

In [0]:
def train_epoch(model, train_loader, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader`` and print the metrics.

    The printed loss is the mean squared error over all samples seen in the
    epoch and the printed RMSE is its square root. Each batch is scored before
    the optimizer step for that batch is applied.

    Args:
        model: nn.Module called as ``model(data)``; its output is compared to
            the target reshaped to (batch_size, 1).
        train_loader: Iterable of ``(data, target)`` batches with 1-D targets.
        optimizer: Optimizer that updates the parameters of ``model``.
        epoch: Epoch number, used only in the printed summary.
    """
    model.train()  # Set the nn.Module to train mode.

    # Parameters do not change device during the epoch, so check once.
    on_cuda = next(model.parameters()).is_cuda

    total_sq_error = 0.0
    num_seen = 0
    for data, target in train_loader:  # 1) get batch
        # Adjust dimensions of target and cast to float
        target = target.unsqueeze(1).float()

        # Move to cuda
        if on_cuda:
            data, target = data.cuda(), target.cuda()

        # Reset gradient data to 0
        optimizer.zero_grad()

        # Get prediction for batch
        output = model(data)

        # 2) Compute loss
        loss = F.mse_loss(output, target)

        # 3) Do backprop
        loss.backward()

        # 4) Update model
        optimizer.step()

        # mse_loss is a per-batch mean. Weight it by the batch size so that a
        # smaller final batch does not skew the epoch average.
        batch_n = target.size(0)
        total_sq_error += loss.item() * batch_n
        num_seen += batch_n

    # An empty loader reports nan instead of dividing by zero.
    avg_loss = total_sq_error / num_seen if num_seen else float('nan')

    print(f'Train Epoch: {epoch} '
          f'Loss: {avg_loss:.4f}, '
          f'RMSE: {math.sqrt(avg_loss):.4f}')

## Evaluation Procedure

In [0]:
def eval_epoch(model, test_loader, name):
    """Evaluate ``model`` on ``test_loader`` and print the metrics.

    The printed average loss is the mean squared error over all samples in the
    loader and the printed RMSE is its square root.

    Args:
        model: nn.Module called as ``model(data)``; its output is compared to
            the target reshaped to (batch_size, 1).
        test_loader: Iterable of ``(data, target)`` batches with 1-D targets.
        name: Label for the evaluated split, used only in the printed summary.
    """
    model.eval()

    # Parameters do not change device during evaluation, so check once.
    on_cuda = next(model.parameters()).is_cuda

    total_sq_error = 0.0
    num_seen = 0
    # No gradients are needed for evaluation; skip building autograd graphs.
    with torch.no_grad():
        for data, target in test_loader:
            target = target.unsqueeze(1).float()

            # Move to cuda
            if on_cuda:
                data, target = data.cuda(), target.cuda()

            output = model(data)

            # mse_loss is a per-batch mean. Weight it by the batch size so that
            # a smaller final batch does not skew the average.
            batch_n = target.size(0)
            total_sq_error += F.mse_loss(output, target).item() * batch_n
            num_seen += batch_n

    # An empty loader reports nan instead of dividing by zero.
    test_loss = total_sq_error / num_seen if num_seen else float('nan')
    test_rmse = math.sqrt(test_loss)
    print(f'\n{name} set: '
          f'Average loss: {test_loss:.4f}, '
          f'RMSE: {test_rmse:.4f}')

# Character Embeddings for SMILES

In the tutorial, we used word embeddings to encode each word in a sentence. Since a molecule's SMILES string is just a single "word" (i.e. a single sequence of characters), we'll instead use character embeddings, one for each character in the SMILES string. The embeddings will operate in essentially the same way as word embeddings.

## Define Vocab and Character-to-Index Mapping

In [0]:
# Define vocab
# Define vocab: every distinct character found in any SMILES string
vocab = {char for smiles in allSmiles for char in smiles}

print(f'Vocab = {vocab}')

# Create character to index mapping. Index 0 is reserved for padding, so
# characters are numbered from 1. The vocab is sorted before numbering because
# set iteration order can differ between interpreter runs, which would make
# the mapping non-reproducible.
padding_idx = 0
char_to_index = {char: index for index, char in enumerate(sorted(vocab), start=1)}
vocab_size = len(char_to_index) + 1  # +1 accounts for the padding index

print(f'Vocab size = {vocab_size:,}')

## Map Characters to Indices

In [0]:
def _encode(smiles_list):
    """Replace each character of each SMILES string with its index in char_to_index."""
    encoded = []
    for smiles in smiles_list:
        encoded.append([char_to_index[char] for char in smiles])
    return encoded


trainX = _encode(trainSmiles)
devX = _encode(devSmiles)
testX = _encode(testSmiles)

print(f'Indices of first train SMILES = {trainX[0]}')
print(f'Last five indices = {trainX[0][-5:]}')

## Add Padding

Note: Since some SMILES are long, we've hard-coded a maximum sequence length `max_len` in the Model and Training Settings section above. Longer sequences are truncated to `max_len`, and shorter ones are padded up to it.

In [0]:
def _fit_to_max_len(seq):
    """Truncate ``seq`` to ``max_len`` entries, or right-pad it with ``padding_idx``."""
    clipped = seq[:max_len]
    # A non-positive count yields an empty list, so long sequences get no padding
    filler = [padding_idx] * (max_len - len(seq))
    return clipped + filler


trainX = [_fit_to_max_len(seq) for seq in trainX]
devX = [_fit_to_max_len(seq) for seq in devX]
testX = [_fit_to_max_len(seq) for seq in testX]

print(f'Indices of first train SMILES = {trainX[0]}')
print(f'Last five indices = {trainX[0][-5:]}')

## Build Dataset/DataLoader

In [0]:
# Build Dataset
# Build Dataset
train = PropertyPredictionDataset(trainX, trainY)
dev = PropertyPredictionDataset(devX, devY)
test = PropertyPredictionDataset(testX, testY)

# Build DataLoader
# Only the training data is shuffled. The dev and test loaders keep a fixed
# order so that repeated evaluations see identical batches.
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)

# Multi-Layer Perceptron (MLP)

Your first task is to build a multi-layer perceptron (MLP) to predict log p using a sum-of-embeddings approach. Replace all `raise NotImplementedError` lines below with your implementation. When you're ready, build the MLP and then train and test it.

## Define MLP

In [0]:
class MLP(nn.Module):
    """Exercise skeleton for a sum-of-embeddings MLP.

    Every ``raise NotImplementedError`` is a placeholder to be replaced with an
    implementation. ``forward`` currently ends in ``return None``; the training
    and evaluation procedures pass its result to ``F.mse_loss``, so it has to
    return the prediction once implemented.
    """

    def __init__(self, vocab_size, padding_idx, embedding_size, hidden_size, output_size, dropout):
        """Create the layers listed in the placeholder comments below."""
        super(MLP, self).__init__()

        # Embedding layer
        raise NotImplementedError

        # Fully connected layers
        raise NotImplementedError

        # Dropout (regularization)
        raise NotImplementedError

    def forward(self, x):  # batch_size x seq_length
        """Map a batch of index sequences to predictions (shapes as annotated per step)."""
        # Embed
        raise NotImplementedError  # batch_size x seq_length x embedding_size

        # Sum embeddings
        raise NotImplementedError  # batch_size x embedding_size

        # MLP
        raise NotImplementedError  # batch_size x output_size

        # Placeholder: replace with the batch_size x output_size prediction
        return None

## Build MLP

In [0]:
# Instantiate the MLP from the shared settings
model = MLP(
    vocab_size=vocab_size,
    padding_idx=padding_idx,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    dropout=dropout,
)

print(model)
print(f'Number of parameters = {param_count(model):,}')

# Move to cuda when requested and available
if use_cuda and torch.cuda.is_available():
    model = model.cuda()

# Create the optimizer after the model is on its final device
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

## Train MLP

In [0]:
# Run one training pass followed by a dev-set evaluation for each epoch (1-based).
for epoch in range(1, epochs + 1):
    train_epoch(model, train_loader, optimizer, epoch)
    eval_epoch(model,  dev_loader, "Dev")
    print("---")

## Test MLP

In [0]:
# Evaluate the trained MLP on the test loader and print its metrics.
eval_epoch(model,  test_loader, "Test")

# Recurrent Neural Network (RNN)

Your next task is to build a recurrent neural network (RNN) to predict log p using a sequence of SMILES character embeddings. Replace all `raise NotImplementedError` lines below with your implementation. When you're ready, build the RNN and then train and test it.

## Define RNN

In [0]:
class RNN(nn.Module):
    """Exercise skeleton for an RNN over character embeddings.

    Every ``raise NotImplementedError`` is a placeholder to be replaced with an
    implementation. ``forward`` currently ends in ``return None``; the training
    and evaluation procedures pass its result to ``F.mse_loss``, so it has to
    return the prediction once implemented.
    """

    def __init__(self, vocab_size, padding_idx, embedding_size, hidden_size, output_size, dropout):
        """Create the layers listed in the placeholder comments below."""
        super(RNN, self).__init__()

        # Embedding layer
        raise NotImplementedError

        # RNN
        raise NotImplementedError

        # Fully connected layer
        raise NotImplementedError

        # Dropout (regularization)
        raise NotImplementedError

    def forward(self, x):  # batch_size x seq_length
        """Map a batch of index sequences to predictions (shapes as annotated per step)."""
        # Embed
        raise NotImplementedError  # batch_size x seq_length x embedding_size

        # Run RNN
        raise NotImplementedError  # batch_size x seq_length x hidden_size

        # Dropout
        raise NotImplementedError  # batch_size x seq_length x hidden_size

        # Max pooling across sequence
        raise NotImplementedError  # batch_size x hidden_size

        # Output layer
        raise NotImplementedError  # batch_size x output_size

        # Placeholder: replace with the batch_size x output_size prediction
        return None

## Build RNN

In [0]:
# Instantiate the RNN from the shared settings
model = RNN(
    vocab_size=vocab_size,
    padding_idx=padding_idx,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    dropout=dropout,
)

print(model)
print(f'Number of parameters = {param_count(model):,}')

# Move to cuda when requested and available
if use_cuda and torch.cuda.is_available():
    model = model.cuda()

# Create the optimizer after the model is on its final device
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

## Train RNN

In [0]:
# Run one training pass followed by a dev-set evaluation for each epoch (1-based).
for epoch in range(1, epochs + 1):
    train_epoch(model, train_loader, optimizer, epoch)
    eval_epoch(model,  dev_loader, "Dev")
    print("---")

## Test RNN

In [0]:
# Evaluate the trained RNN on the test loader and print its metrics.
eval_epoch(model,  test_loader, "Test")

# Convolutional Neural Network (CNN)

Your next task is to build a convolutional neural network (CNN) to predict log p using a sequence of SMILES character embeddings. Replace all `raise NotImplementedError` lines below with your implementation. When you're ready, build the CNN and then train and test it.

## Define CNN

In [0]:
class CNN(nn.Module):
    """Exercise skeleton for a CNN over character embeddings.

    Every ``raise NotImplementedError`` is a placeholder to be replaced with an
    implementation. ``forward`` currently ends in ``return None``; the training
    and evaluation procedures pass its result to ``F.mse_loss``, so it has to
    return the prediction once implemented.
    """

    def __init__(self, vocab_size, padding_idx, embedding_size, hidden_size, output_size, dropout):
        """Create the layers listed in the placeholder comments below."""
        super(CNN, self).__init__()

        # Embedding layer
        raise NotImplementedError

        # Convolutional layers
        raise NotImplementedError

        # Fully connected layer
        raise NotImplementedError

        # Dropout (regularization)
        raise NotImplementedError

    def forward(self, x):  # batch_size x seq_length
        """Map a batch of index sequences to predictions (shapes as annotated per step)."""
        # Embed
        raise NotImplementedError  # batch_size x seq_length x embedding_size

        # Permute dimensions
        raise NotImplementedError  # batch_size x embedding_size x seq_length

        # Convolutional layers
        raise NotImplementedError  # batch_size x hidden_size x new_seq_length

        # Sum
        # NOTE(review): the annotated shape is unchanged by this step, so this is
        # presumably an element-wise sum of the conv layers' outputs - confirm.
        raise NotImplementedError  # batch_size x hidden_size x new_seq_length

        # Max pooling across sequence
        raise NotImplementedError  # batch_size x hidden_size

        # Output
        raise NotImplementedError  # batch_size x output_size

        # Placeholder: replace with the batch_size x output_size prediction
        return None

## Build CNN

In [0]:
# Instantiate the CNN from the shared settings
model = CNN(
    vocab_size=vocab_size,
    padding_idx=padding_idx,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    dropout=dropout,
)

print(model)
print(f'Number of parameters = {param_count(model):,}')

# Move to cuda when requested and available
if use_cuda and torch.cuda.is_available():
    model = model.cuda()

# Create the optimizer after the model is on its final device
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

## Train CNN

In [0]:
# Run one training pass followed by a dev-set evaluation for each epoch (1-based).
for epoch in range(1, epochs + 1):
    train_epoch(model, train_loader, optimizer, epoch)
    eval_epoch(model,  dev_loader, "Dev")
    print("---")

## Test CNN

In [0]:
# Evaluate the trained CNN on the test loader and print its metrics.
eval_epoch(model,  test_loader, "Test")

# Improving Performance

Now that you've built an MLP, RNN, and CNN, try your hand at maximizing the performance of each model. Experiment with different network architectures (e.g. different numbers of layers) and different model and training settings (see the Model and Training Settings section near the beginning). Which model performs best? How well does it do?