<a href="https://colab.research.google.com/github/yala/introML_chem/blob/master/lab2/property_prediction_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Property Prediction Exercise!
In this exercise, you'll build on the tutorial from lab1 and implement a neural network that predicts log p from Morgan fingerprints.

Let's get started!

In [0]:
# Colab environment setup: install PyTorch (see http://pytorch.org/), then RDKit and scikit-learn.
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Build the wheel tag (implementation, version, ABI) of the running interpreter;
# it is interpolated into the wheel file name below.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# Use the 'cu80' (GPU) wheel directory when nvidia-smi exists at this path, else the 'cpu' one.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

# Install the pinned torch 0.4.0 wheel for this interpreter/accelerator, plus torchvision 0.2.0.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.0-{platform}-linux_x86_64.whl torchvision==0.2.0
import torch
# Report the installed PyTorch version and whether PyTorch can see a CUDA device.
print(torch.__version__)
print(torch.cuda.is_available())

# Download and run the Miniconda installer with /usr/local as its prefix, then use conda
# to install RDKit and scikit-learn from the conda-forge channel into that same prefix.
# NOTE(review): `rdkit` is listed twice on the conda command line; the repetition looks accidental.
!wget -c https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!conda install -q -y --prefix /usr/local -c conda-forge rdkit rdkit scikit-learn


In [0]:
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')
import math
import os
import random
from typing import Union, List, Dict
import argparse
from sklearn.feature_extraction.text import CountVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pickle
import re
print( sys.version)
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error

# Download the Delaney train/val/test CSV files into the working directory,
# where `get_data` looks for them as 'delaney_<split>.csv'.
!wget https://raw.githubusercontent.com/yala/introML_chem/master/lab1/data/chem/delaney_train.csv
!wget https://raw.githubusercontent.com/yala/introML_chem/master/lab1/data/chem/delaney_val.csv
!wget https://raw.githubusercontent.com/yala/introML_chem/master/lab1/data/chem/delaney_test.csv

In [0]:
def morgan_fingerprint(smiles: str, radius: int = 3, num_bits: int = 2048) -> np.ndarray:
  """Compute the Morgan fingerprint bit vector of a molecule.

  Args:
    smiles: SMILES string describing the molecule.
    radius: Radius passed to RDKit's Morgan fingerprint routine.
    num_bits: Number of bits requested for the fingerprint.

  Returns:
    A numpy array filled from the RDKit bit vector.

  Raises:
    ValueError: If RDKit cannot parse `smiles` into a molecule.
  """
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # MolFromSmiles reports a parse failure by returning None; fail here with a
    # readable message instead of an opaque error inside the fingerprint call.
    raise ValueError('Could not parse SMILES string: {!r}'.format(smiles))

  morgan_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
  # Placeholder destination array; ConvertToNumpyArray fills it from the bit vector.
  morgan_fp = np.zeros((1,))
  DataStructs.ConvertToNumpyArray(morgan_vect, morgan_fp)

  return morgan_fp

class MoleculeDatapoint:
  """One molecule: its SMILES string, its target value and its Morgan fingerprint."""

  def __init__(self, smiles: str, target: float):
    # The fingerprint is computed once, at construction time, with the default settings.
    fingerprint = morgan_fingerprint(smiles)
    self.smiles, self.target = smiles, target
    self.morgan = fingerprint
    
class MoleculeDataset(torch.utils.data.Dataset):
  """A torch Dataset backed by a list of MoleculeDatapoint objects."""

  def __init__(self, data: List[MoleculeDatapoint]):
    self.data = data

  def smiles(self) -> List[str]:
    """Return the SMILES string of every datapoint, in dataset order."""
    collected = []
    for datapoint in self.data:
      collected.append(datapoint.smiles)
    return collected

  def targets(self) -> List[float]:
    """Return the target value of every datapoint, in dataset order."""
    collected = []
    for datapoint in self.data:
      collected.append(datapoint.target)
    return collected

  def morgans(self) -> List[np.ndarray]:
    """Return the Morgan fingerprint of every datapoint, in dataset order."""
    collected = []
    for datapoint in self.data:
      collected.append(datapoint.morgan)
    return collected

  def __len__(self):
    """Number of datapoints in the dataset."""
    return len(self.data)

  def __getitem__(self, i):
    """Return the (fingerprint, target) pair stored at index `i`."""
    datapoint = self.data[i]
    return datapoint.morgan, datapoint.target

def get_data(split: str) -> MoleculeDataset:
  """Load one split of the Delaney data from 'delaney_<split>.csv'.

  The file is opened relative to the current working directory. Its first line
  is skipped as a header; every other non-blank line must have the form
  `smiles,target`.

  Args:
    split: Split name inserted into the file name (the notebook uses 'train',
      'val' and 'test').

  Returns:
    A MoleculeDataset holding one MoleculeDatapoint per data row.

  Raises:
    ValueError: If a row does not have exactly two comma-separated fields or
      its target is not a valid float.
  """
  data_path = 'delaney_{}.csv'.format(split)
  data = []
  with open(data_path) as f:
    f.readline()  # Skip the header row.
    for line in f:
      line = line.strip()
      if not line:
        # Skip blank lines (e.g. an extra empty line at the end of the file)
        # rather than failing on the two-value unpack below.
        continue
      smiles, target = line.split(',')
      data.append(MoleculeDatapoint(smiles, float(target)))

  return MoleculeDataset(data)


def rmse(targets: List[float], preds: List[float]) -> float:
    """Return the root-mean-squared error between `targets` and `preds`."""
    mse = mean_squared_error(targets, preds)
    return math.sqrt(mse)

## Prepare your dataset

In [0]:
# Load the three splits in order; the 'val' split is referred to as `dev` from here on.
train, dev, test = (get_data(split) for split in ('train', 'val', 'test'))

## Define your Model and Hyper-parameters

In [0]:
# Training settings
# Exercise: replace every `pass` placeholder in this cell with your own choice.
# `batch_size` is passed to the DataLoaders and `epochs` bounds the training loop;
# `lr` and `momentum` are meant for the optimizer you create below.
batch_size = pass
epochs = pass
lr = pass
momentum = pass

class Model(nn.Module):
    """Network mapping a batch of Morgan fingerprints to one prediction per molecule (to be implemented)."""

    def __init__(self):
        super(Model, self).__init__()
        # TODO: define the layers here. With the default fingerprint settings used
        # in this notebook each input vector has 2048 entries.
        pass

    def forward(self, x):
        # TODO: run `x` through the layers and return the prediction. The training
        # loop calls `.squeeze(1)` on the result, so return shape (batch, 1).
        pass


model = Model()
# TODO: create an optimizer over `model.parameters()` (e.g. from `torch.optim`, imported as `optim`).
optimizer = pass


# Shuffle only the training data. The dev/test loaders iterate in a fixed order:
# shuffling gives no benefit at evaluation time and only makes runs less reproducible.
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
dev_loader = torch.utils.data.DataLoader(dev, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

In [0]:

# Sanity check: print the shapes of the two tensors in the first training batch.
batch = next(iter(train_loader), None)
if batch is not None:
  print(batch[0].shape)
  print(batch[1].shape)


## Define your training procedure




To train our model:

1) we'll randomly sample batches from our train loader

2) compute our loss (mean squared error, since this is a regression task)

3) compute our gradients (by calling `backward()` on our loss)

4) update our neural network with an `optimizer.step()`, and go back to 1)

I've added some extra code here to log the average MSE for the epoch.


In [0]:
def train_epoch( model, train_loader, optimizer, epoch):
    model.train() # Set the nn.Module to train mode. 
    total_loss = 0
    num_samples = len(train_loader.dataset)
    for batch_idx, (x, target) in enumerate(train_loader): #1) get batch
        x = x.float()
        target = target.float()
        # Reset gradient data to 0
        pass
        # Get prediction for batch
        output = model(x).squeeze(1)
        # 2) Compute loss (hint: MSE!)
        loss = pass
        #3) Do backprop
        pass
        #4) Update model
        pass

        total_loss += loss.detach() # Don't keep computation graph 

    print('Train Epoch: {} \tMSE: {:.4f})\n'.format(
            epoch, total_loss / num_samples))


## Define our evaluation loop
Similar to above, we'll also loop through our dev or test set and compute our loss (MSE).
This lets us see how well our model is generalizing. 

In [0]:
def eval_epoch(model, test_loader, name):
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data = data.float()
        target = target.float()
        output = model(data).squeeze(-1)
        test_loss += pass

    test_loss /= len(test_loader.dataset)

    print('\n{} set: Average MSE: {:.4f}\n'.format(
        name,
        test_loss))


## Train your model

In [0]:

# Alternate one epoch of training with one evaluation pass on the dev set.
for epoch in range(1, epochs + 1):
    train_epoch(model=model, train_loader=train_loader, optimizer=optimizer, epoch=epoch)
    eval_epoch(model=model, test_loader=dev_loader, name="Dev")
    print("---")

In [0]:
# Final evaluation of the trained model on the test split.
eval_epoch(model=model, test_loader=test_loader, name="Test")