<a href="https://colab.research.google.com/github/swansonk14/chemprop-intro/blob/master/lab2/feed_forward_neural_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feed-Forward Neural Network on Morgan Fingerprint

In [1]:
# Environment setup: download the Miniconda installer (-c resumes a partial
# download), make it executable and install it into /usr/local.
!wget -c https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# Install RDKit and PyTorch into that same prefix from the `rdkit` conda channel.
!conda install -q -y --prefix /usr/local -c rdkit rdkit pytorch

# Make the conda-installed packages importable from the running kernel.
# NOTE(review): the installer log recorded for this cell reports Python 3.7.0,
# while the path below names python3.6 -- confirm it matches the directory
# conda actually installed rdkit/pytorch into.
import sys
sys.path.append('/usr/local/lib/python3.6/site-packages/')

# Fetch the Delaney train/test CSV splits into the working directory.
!wget https://raw.githubusercontent.com/swansonk14/chemprop-intro/master/data/delaney_train.csv
!wget https://raw.githubusercontent.com/swansonk14/chemprop-intro/master/data/delaney_test.csv

--2018-12-27 08:02:20--  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.17.111.77, 104.17.108.77, 104.17.107.77, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.17.111.77|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

PREFIX=/usr/local
reinstalling: python-3.7.0-hc3d631a_0 ...
Python 3.7.0
reinstalling: ca-certificates-2018.03.07-0 ...
reinstalling: conda-env-2.6.0-1 ...
reinstalling: libgcc-ng-8.2.0-hdf63c60_1 ...
reinstalling: libstdcxx-ng-8.2.0-hdf63c60_1 ...
reinstalling: libffi-3.2.1-hd88cf55_4 ...
reinstalling: ncurses-6.1-hf484d3e_0 ...
reinstalling: openssl-1.0.2p-h14c3975_0 ...
reinstalling: xz-5.2.4-h14c3975_4 ...
reinstalling: yaml-0.1.7-had09818_2 ...
reinstalling: zlib-1.2.11-ha838bed_2 ...
reinstalling: libedit-3.1.20170329-h6b74fdf_2 ...
reinstalling: readline-7.0-h7b6447c_5 ...

In [0]:
import math
import os
import random
from typing import Union, List, Dict

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

In [0]:
def morgan_fingerprint(smiles: str, radius: int = 3, num_bits: int = 2048) -> np.ndarray:
  """Compute the Morgan fingerprint bit vector of a molecule as a numpy array.

  :param smiles: SMILES string describing the molecule.
  :param radius: Radius passed to RDKit's Morgan fingerprint routine.
  :param num_bits: Number of bits in the fingerprint (passed as ``nBits``).
  :return: The array that RDKit's ``ConvertToNumpyArray`` filled from the bit vector.
  :raises ValueError: If RDKit cannot parse ``smiles``.
  """
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # MolFromSmiles signals a parse failure by returning None; fail here with
    # the offending string instead of an opaque error from the fingerprint call.
    raise ValueError('Could not parse SMILES string: {!r}'.format(smiles))

  morgan_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
  # Placeholder destination array; ConvertToNumpyArray writes the bits into it.
  morgan_fp = np.zeros((1,))
  DataStructs.ConvertToNumpyArray(morgan_vect, morgan_fp)

  return morgan_fp

In [0]:
class MoleculeDatapoint:
  """One molecule: its SMILES string, its target values and its Morgan fingerprint."""

  def __init__(self, smiles: str, targets: List[float]):
    """Keep ``smiles`` and ``targets`` and eagerly compute the fingerprint.

    :param smiles: SMILES string of the molecule.
    :param targets: Target values associated with the molecule.
    """
    fingerprint = morgan_fingerprint(smiles)  # default radius / bit count
    self.smiles, self.targets = smiles, targets
    self.morgan = fingerprint
    
class MoleculeDataset:
  """A thin list-like container of datapoints with per-field accessors."""

  def __init__(self, data: List['MoleculeDatapoint']):
    """Wrap ``data``; the list is stored by reference, not copied."""
    self.data = data

  def smiles(self) -> List[str]:
    """Return the ``smiles`` attribute of every datapoint, in order."""
    return [d.smiles for d in self.data]

  def targets(self) -> List[List[float]]:
    """Return the ``targets`` attribute (itself a list) of every datapoint, in order."""
    return [d.targets for d in self.data]

  def morgans(self) -> List[np.ndarray]:
    """Return the ``morgan`` attribute of every datapoint, in order."""
    return [d.morgan for d in self.data]

  def shuffle(self, seed: Union[int, None] = None):
    """Shuffle the underlying list in place.

    :param seed: If given, the shuffle is deterministic for that seed. A
                 private generator is used so the state of the global
                 ``random`` module is left untouched; the resulting order is
                 the same as seeding the global generator with ``seed`` and
                 shuffling. If ``None``, the global generator is used.
    """
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(self.data)

  def __len__(self) -> int:
    """Number of datapoints."""
    return len(self.data)

  def __getitem__(self, item) -> Union['MoleculeDatapoint', List['MoleculeDatapoint']]:
    """Index the underlying list: an int yields one datapoint, a slice yields a plain list."""
    return self.data[item]

In [0]:
def get_data(split: str) -> MoleculeDataset:
  """Load one split of the Delaney dataset from ``delaney_<split>.csv``.

  The file is read from the current working directory. Its first line is
  treated as a header and skipped; on every other line the first
  comma-separated field is the SMILES string and the remaining fields are
  parsed as float targets.

  :param split: Name substituted into the file name, e.g. ``'train'``.
  :return: A dataset with one datapoint per non-blank data line.
  """
  data_path = 'delaney_{}.csv'.format(split)
  data = []
  with open(data_path) as f:
    f.readline()  # skip the header row
    for line in f:
      line = line.strip()
      if not line:
        # A blank line (e.g. trailing newline) would otherwise become a
        # datapoint with an empty SMILES string and no targets.
        continue
      fields = line.split(',')
      smiles = fields[0]
      targets = [float(target) for target in fields[1:]]
      data.append(MoleculeDatapoint(smiles, targets))

  return MoleculeDataset(data)

In [0]:
# Load both Delaney splits (delaney_train.csv / delaney_test.csv in the working
# directory); fingerprints are computed while each datapoint is constructed.
train_data, test_data = get_data('train'), get_data('test')

In [0]:
# Training hyperparameters.
num_epochs = 30  # number of passes over the training data
batch_size = 50  # datapoints per batch (also used for evaluation)
lr = .01  # learning rate handed to the SGD optimizer

In [0]:
class FFN(nn.Module):
  """Two-layer feed-forward network mapping a fingerprint to regression outputs.

  NOTE(review): no non-linearity is applied between ``fc1`` and ``fc2``, so the
  network as written is equivalent to a single linear map. This is kept as-is
  to preserve behaviour -- confirm whether an activation was intended.
  """

  def __init__(self, input_dim: int = 2048, hidden_dim: int = 512, output_dim: int = 1):
    """Create the two linear layers.

    :param input_dim: Size of the input vector (defaults to the 2048-bit fingerprint).
    :param hidden_dim: Width of the hidden layer.
    :param output_dim: Number of values predicted per input.
    """
    super().__init__()
    self.fc1 = nn.Linear(input_dim, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, output_dim)

  def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
    """Apply ``fc1`` then ``fc2`` to ``x`` and return the result."""
    x = self.fc1(x)
    x = self.fc2(x)

    return x

In [0]:
# Instantiate the network and a plain SGD optimizer over all of its parameters,
# using the learning rate `lr` set above.
model = FFN()
optimizer = optim.SGD(model.parameters(), lr=lr)

In [0]:
def param_count(model: nn.Module) -> int:
    """Count the scalar parameters of ``model`` that require gradients.

    :param model: Module whose parameters are inspected.
    :return: Total number of elements across parameters with ``requires_grad`` set.
    """
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total

In [24]:
# Show the layer structure and the number of trainable parameters.
print(model)
print('Number of parameters = {:,}'.format(param_count(model)))

FFN(
  (fc1): Linear(in_features=2048, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=1, bias=True)
)
Number of parameters = 1,049,601


In [0]:
def train_epoch(model: nn.Module,
                optimizer: optim.Optimizer,
                data: MoleculeDataset,
                batch_size: int,
                epoch: int) -> float:
  """Train ``model`` for one pass over ``data`` and report the average loss.

  The dataset is shuffled in place (seeded with ``epoch``), split into
  consecutive batches of exactly ``batch_size`` datapoints (a final,
  incomplete batch is dropped), and one optimizer step is taken per batch
  using the MSE between predictions and targets.

  :param model: Network to train; it is switched to training mode.
  :param optimizer: Optimizer that updates the parameters of ``model``.
  :param data: Training dataset; its order is changed by the shuffle.
  :param batch_size: Number of datapoints per batch.
  :param epoch: Epoch index, used as the shuffle seed.
  :return: Mean over batches of the square root of each batch's MSE loss.
  :raises ValueError: If ``batch_size`` is not positive or ``data`` holds
                      fewer than ``batch_size`` datapoints (no full batch).
  """
  if batch_size <= 0:
    raise ValueError('batch_size must be positive, got {}'.format(batch_size))

  num_batches = len(data) // batch_size  # the final, incomplete batch is dropped
  if num_batches == 0:
    # Checked before touching the model or data; previously this surfaced as
    # a ZeroDivisionError when averaging over zero batches.
    raise ValueError('Dataset of size {} has no full batch of size {}'.format(len(data), batch_size))

  model.train()
  data.shuffle(seed=epoch)

  total_loss = 0.0
  for i in range(0, num_batches * batch_size, batch_size):
    batch = MoleculeDataset(data[i:i + batch_size])

    # Stack the fingerprints into one ndarray first: converting a Python list
    # of arrays element by element is needlessly slow.
    morgans = torch.tensor(np.array(batch.morgans()), dtype=torch.float)
    targets = torch.tensor(batch.targets(), dtype=torch.float)

    optimizer.zero_grad()
    preds = model(morgans)
    loss = F.mse_loss(preds, targets)
    loss.backward()
    optimizer.step()

    # Accumulate the per-batch RMSE rather than the raw MSE.
    total_loss += math.sqrt(loss.item())

  return total_loss / num_batches

In [26]:
# Run the full training schedule. `num_epochs` comes from the hyperparameter
# cell; it is deliberately not reassigned here so that editing the
# configuration actually takes effect.
for epoch in range(num_epochs):
  train_loss = train_epoch(model, optimizer, train_data, batch_size, epoch)
  print('Epoch {}: Train loss = {:.4f}'.format(epoch, train_loss))

Epoch 0: Train loss = 2.6107
Epoch 1: Train loss = 1.7606
Epoch 2: Train loss = 1.6369
Epoch 3: Train loss = 1.5026
Epoch 4: Train loss = 1.4129
Epoch 5: Train loss = 1.3318
Epoch 6: Train loss = 1.2642
Epoch 7: Train loss = 1.2080
Epoch 8: Train loss = 1.1533
Epoch 9: Train loss = 1.1138
Epoch 10: Train loss = 1.0785
Epoch 11: Train loss = 1.0382
Epoch 12: Train loss = 1.0103
Epoch 13: Train loss = 0.9736
Epoch 14: Train loss = 0.9510
Epoch 15: Train loss = 0.9283
Epoch 16: Train loss = 0.9064
Epoch 17: Train loss = 0.8871
Epoch 18: Train loss = 0.8641
Epoch 19: Train loss = 0.8421
Epoch 20: Train loss = 0.8287
Epoch 21: Train loss = 0.8086
Epoch 22: Train loss = 0.7877
Epoch 23: Train loss = 0.7781
Epoch 24: Train loss = 0.7620
Epoch 25: Train loss = 0.7514
Epoch 26: Train loss = 0.7380
Epoch 27: Train loss = 0.7242
Epoch 28: Train loss = 0.7079
Epoch 29: Train loss = 0.7038


In [0]:
def rmse(targets: List[float], preds: List[float]) -> float:
    """Root-mean-squared error between ``targets`` and ``preds``.

    :param targets: Ground-truth values.
    :param preds: Predicted values, in the same order as ``targets``.
    :return: Square root of scikit-learn's ``mean_squared_error``.
    """
    mse = mean_squared_error(targets, preds)
    root = math.sqrt(mse)
    return root

In [0]:
def evaluate(model: nn.Module, data: MoleculeDataset, batch_size: int) -> float:
    """Compute the RMSE of ``model``'s predictions over all of ``data``.

    Predictions are made batch by batch with gradients disabled; unlike
    training, the final incomplete batch is included.

    :param model: Network to evaluate; it is switched to eval mode and left there.
    :param data: Dataset providing fingerprints and targets.
    :param batch_size: Number of datapoints per forward pass.
    :return: Root-mean-squared error between the targets and the predictions.
    """
    model.eval()

    all_preds = []
    with torch.no_grad():
        for i in range(0, len(data), batch_size):
            batch = MoleculeDataset(data[i:i + batch_size])
            morgans = torch.tensor(np.array(batch.morgans()), dtype=torch.float)

            preds = model(morgans)
            # Collect plain Python floats (one list per datapoint, mirroring
            # data.targets()) instead of 1-element tensors, so the metric
            # does not rely on implicit tensor-to-array coercion.
            all_preds.extend(preds.tolist())

    return rmse(data.targets(), all_preds)

In [29]:
# Score the trained model on the held-out test split and report its RMSE.
test_rmse = evaluate(model, test_data, batch_size)
print('Test rmse = {:.4f}'.format(test_rmse))

Test rmse = 1.1838
