<a href="https://colab.research.google.com/github/swansonk14/chemprop-intro/blob/master/lab1/random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Random Forest on Morgan Fingerprints

In [1]:
# Environment setup: install Miniconda into /usr/local, then use conda to add
# RDKit and scikit-learn to that prefix.
# `wget -c` continues/skips the download when the installer is already on disk.
!wget -c https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
# Non-interactive install into the /usr/local prefix.
!bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!conda install -q -y --prefix /usr/local -c rdkit rdkit scikit-learn

# Add the conda site-packages directory to the import path of this kernel.
# NOTE(review): the recorded installer output reports Python 3.7.0, while this
# path names python3.6 -- confirm the directory matches the Python version that
# conda actually installs the packages for.
import sys
sys.path.append('/usr/local/lib/python3.6/site-packages/')

# Download the Delaney train/test CSV files into the working directory.
!wget https://raw.githubusercontent.com/swansonk14/chemprop-intro/master/data/delaney_train.csv
!wget https://raw.githubusercontent.com/swansonk14/chemprop-intro/master/data/delaney_test.csv

--2018-12-27 08:01:29--  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.17.111.77, 104.17.109.77, 104.17.110.77, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.17.111.77|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

PREFIX=/usr/local
reinstalling: python-3.7.0-hc3d631a_0 ...
Python 3.7.0
reinstalling: ca-certificates-2018.03.07-0 ...
reinstalling: conda-env-2.6.0-1 ...
reinstalling: libgcc-ng-8.2.0-hdf63c60_1 ...
reinstalling: libstdcxx-ng-8.2.0-hdf63c60_1 ...
reinstalling: libffi-3.2.1-hd88cf55_4 ...
reinstalling: ncurses-6.1-hf484d3e_0 ...
reinstalling: openssl-1.0.2p-h14c3975_0 ...
reinstalling: xz-5.2.4-h14c3975_4 ...
reinstalling: yaml-0.1.7-had09818_2 ...
reinstalling: zlib-1.2.11-ha838bed_2 ...
reinstalling: libedit-3.1.20170329-h6b74fdf_2 ...
reinstalling: readline-7.0-h7b6447c_5 ...

In [0]:
import csv
import math
import os
import random
from typing import Union, List, Dict

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [0]:
def morgan_fingerprint(smiles: str, radius: int = 3, num_bits: int = 2048) -> np.ndarray:
  """Compute the Morgan fingerprint bit vector of a molecule as a numpy array.

  :param smiles: SMILES string describing the molecule.
  :param radius: Radius passed to the Morgan fingerprint computation.
  :param num_bits: Number of bits requested for the fingerprint vector.
  :return: Numpy array filled from the fingerprint bit vector.
  :raises ValueError: If RDKit cannot parse `smiles` into a molecule.
  """
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # MolFromSmiles reports a parse failure by returning None; fail here with a
    # message naming the offending input instead of letting None reach the
    # fingerprint call and surface as an opaque RDKit argument error.
    raise ValueError('Could not parse SMILES string: {!r}'.format(smiles))

  morgan_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
  # Placeholder array; ConvertToNumpyArray resizes it in place to hold the bits.
  morgan_fp = np.zeros((1,))
  DataStructs.ConvertToNumpyArray(morgan_vect, morgan_fp)

  return morgan_fp

In [0]:
class MoleculeDatapoint:
  """One molecule: its SMILES string, its target value and its Morgan fingerprint."""

  def __init__(self, smiles: str, target: float):
    """Record the inputs and featurize the molecule right away.

    :param smiles: SMILES string of the molecule.
    :param target: Target value associated with the molecule.
    """
    fingerprint = morgan_fingerprint(smiles)
    self.smiles, self.target, self.morgan = smiles, target, fingerprint
    
class MoleculeDataset:
  """A thin wrapper around a list of MoleculeDatapoint objects with column-style accessors."""

  def __init__(self, data: List[MoleculeDatapoint]):
    """Keep a reference to the given list of datapoints.

    :param data: The datapoints making up the dataset.
    """
    self.data = data

  def _column(self, field):
    """Collect attribute `field` from every datapoint, in dataset order."""
    return [getattr(point, field) for point in self.data]

  def smiles(self) -> List[str]:
    """Return the SMILES string of every datapoint."""
    return self._column('smiles')

  def targets(self) -> List[float]:
    """Return the target value of every datapoint."""
    return self._column('target')

  def morgans(self) -> List[np.ndarray]:
    """Return the Morgan fingerprint of every datapoint."""
    return self._column('morgan')

In [0]:
def get_data(split: str) -> MoleculeDataset:
  """Load one dataset split from `delaney_<split>.csv` in the working directory.

  The first row of the file is treated as a header and skipped; every other
  non-blank row must have the form `smiles,target`.

  :param split: Name of the split, inserted into the file name (e.g. 'train').
  :return: A MoleculeDataset with one MoleculeDatapoint per data row.
  :raises ValueError: If a non-blank row does not have exactly two columns or
    its target cannot be converted to a float.
  """
  data_path = 'delaney_{}.csv'.format(split)
  data = []
  # newline='' leaves newline handling to the csv module, as its docs recommend.
  with open(data_path, newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip the header row; tolerate a completely empty file.
    for row in reader:
      # Blank or whitespace-only lines (e.g. a trailing newline at the end of
      # the file) carry no data; skip them instead of failing on the unpack.
      if not row or (len(row) == 1 and not row[0].strip()):
        continue
      smiles, target = row
      data.append(MoleculeDatapoint(smiles.strip(), float(target)))

  return MoleculeDataset(data)

In [0]:
# Load both splits; the generator is consumed left to right, so 'train' is read before 'test'.
train_data, test_data = (get_data(split_name) for split_name in ('train', 'test'))

In [7]:
# Random forest with a fixed seed, trained on the Morgan fingerprints of the training split.
# NOTE(review): the recorded repr shows n_estimators=10, which appears to be the
# library default at the time; newer scikit-learn releases may default to a
# different number of trees -- confirm, and pin n_estimators if the recorded
# RMSE must be reproduced.
model = RandomForestRegressor(random_state=42)
train_features = train_data.morgans()
train_targets = train_data.targets()
# `fit` stays the last expression of the cell so the fitted estimator is displayed.
model.fit(train_features, train_targets)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [8]:
def rmse(targets: List[float], preds: List[float]) -> float:
    """Return the root-mean-squared error between targets and predictions.

    :param targets: Ground-truth values.
    :param preds: Predicted values, in the same order as `targets`.
    :return: Square root of the mean squared error.
    """
    mse = mean_squared_error(targets, preds)
    return math.sqrt(mse)

# Predict on the test split and report the root-mean-squared error.
preds = model.predict(test_data.morgans())
test_rmse = rmse(test_data.targets(), preds)
print('rmse = {:.4f}'.format(test_rmse))

rmse = 1.2125
