<a href="https://colab.research.google.com/github/yala/introML_chem/blob/master/lab1/sample_property_prediction_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Build a regression model on Morgan fingerprints

In [0]:
# Install Miniconda into /usr/local, then use conda to install RDKit and scikit-learn.
!wget -c https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
# -b: batch (non-interactive) mode; -f: no error if the prefix already exists; -p: install prefix
!bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!conda install -q -y --prefix /usr/local -c conda-forge rdkit scikit-learn

import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')
import math
import os
import random
from typing import Union, List, Dict
import numpy as np

In [0]:
# Show the version string of the running Python interpreter.
print(sys.version)
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error

# Download the Delaney train/val/test CSV splits from the course repository.
!wget https://raw.githubusercontent.com/yala/introML_chem/master/lab1/data/chem/delaney_train.csv
!wget https://raw.githubusercontent.com/yala/introML_chem/master/lab1/data/chem/delaney_val.csv
!wget https://raw.githubusercontent.com/yala/introML_chem/master/lab1/data/chem/delaney_test.csv

In [0]:
def morgan_fingerprint(smiles: str, radius: int = 3, num_bits: int = 2048) -> np.ndarray:
  """Compute a Morgan fingerprint bit vector for a molecule.

  Args:
    smiles: SMILES string describing the molecule.
    radius: Radius passed to RDKit's Morgan fingerprint routine.
    num_bits: Number of bits requested for the fingerprint (RDKit ``nBits``).

  Returns:
    A numpy array filled in by ``DataStructs.ConvertToNumpyArray`` from the
    RDKit bit vector.

  Raises:
    ValueError: If RDKit cannot parse ``smiles`` into a molecule.
  """
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # MolFromSmiles reports a parse failure by returning None. Fail here with
    # the offending string rather than with an opaque error further down.
    raise ValueError('Could not parse SMILES string: {!r}'.format(smiles))

  morgan_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
  # Placeholder array: ConvertToNumpyArray resizes it to the bit-vector length.
  morgan_fp = np.zeros((1,))
  DataStructs.ConvertToNumpyArray(morgan_vect, morgan_fp)

  return morgan_fp

In [0]:
class MoleculeDatapoint:
  """One molecule: its SMILES string, its target value and its Morgan fingerprint."""

  def __init__(self, smiles: str, target: float):
    """Store the inputs and compute the fingerprint with the default settings.

    Args:
      smiles: SMILES string of the molecule.
      target: Target value associated with the molecule.
    """
    self.smiles, self.target = smiles, target
    # The fingerprint is computed once, eagerly, when the datapoint is built.
    fingerprint = morgan_fingerprint(smiles)
    self.morgan = fingerprint
    
class MoleculeDataset:
  """Thin wrapper around a list of datapoints with per-field list accessors."""

  def __init__(self, data: List[MoleculeDatapoint]):
    """Keep a reference to ``data``; the list is not copied."""
    self.data = data

  def _field(self, name: str) -> list:
    """Collect attribute ``name`` from every datapoint, in dataset order."""
    return [getattr(point, name) for point in self.data]

  def smiles(self) -> List[str]:
    """Return the SMILES string of each datapoint."""
    return self._field('smiles')

  def targets(self) -> List[float]:
    """Return the target value of each datapoint."""
    return self._field('target')

  def morgans(self) -> List[np.ndarray]:
    """Return the Morgan fingerprint of each datapoint."""
    return self._field('morgan')

def get_data(split: str) -> MoleculeDataset:
  """Load one dataset split from ``delaney_<split>.csv``.

  The path is relative, so the file is looked up in the current working
  directory. The first row is treated as a header and skipped. Every other
  non-blank row must hold exactly two comma-separated fields: a SMILES
  string followed by a numeric target.

  Args:
    split: Name substituted into the file name, e.g. ``'train'``.

  Returns:
    A MoleculeDataset holding one MoleculeDatapoint per data row, in file
    order.

  Raises:
    OSError: If the CSV file cannot be opened.
    ValueError: If a row does not have exactly two fields, or its target
      cannot be converted to ``float``.
  """
  import csv  # local import so this definition stays self-contained

  data_path = 'delaney_{}.csv'.format(split)
  data = []
  # newline='' is the csv module's documented way to open files for reading.
  with open(data_path, newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; harmless on an empty file
    for row in reader:
      # Blank lines (for example a trailing newline at end of file) would
      # otherwise break the two-field unpacking below.
      if not any(field.strip() for field in row):
        continue
      smiles, target = row
      data.append(MoleculeDatapoint(smiles.strip(), float(target)))

  return MoleculeDataset(data)


def rmse(targets: List[float], preds: List[float]) -> float:
    """Return the root-mean-squared error between ``targets`` and ``preds``."""
    mse = mean_squared_error(targets, preds)
    return math.sqrt(mse)

In [0]:
# Read the three dataset splits (train, validation, test) via get_data
train_data = get_data(split='train')
val_data = get_data(split='val')
test_data = get_data(split='test')

In [0]:
# Build your model and experiment to minimize the RMSE on the development (validation) set

In [0]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with a fixed random_state so repeated runs use the same seed.
model = RandomForestRegressor(random_state=42)
# Fit on the training split: Morgan fingerprints as features, targets as labels.
model.fit(X=train_data.morgans(), y=train_data.targets())

In [0]:


# Get predictions of your best model (here: on the test split)
preds = model.predict(X=test_data.morgans())
print('rmse = {:.4f}'.format(rmse(targets=test_data.targets(), preds=preds)))