# Run fine-tuning on MoleculeNet

In [7]:
%cd ..

/home/adam/Projects/hybrid-transformer


In [8]:
import os
import argparse
import torch
import wandb

from hybrid_transformer.configs.task import TaskConfig
from hybrid_transformer.configs.model import ModelConfig
from hybrid_transformer.configs.trainer import TrainerConfig
from hybrid_transformer.configs.logger import LoggerConfig

from hybrid_transformer.utils.datasets.auto import AutoDataset
from hybrid_transformer.utils.tokenizers.auto import AutoTokenizer
from hybrid_transformer.models.auto import AutoModel
from hybrid_transformer.utils.loggers.wandb import WandbLogger

from hybrid_transformer.trainers.trainer import Trainer

from hybrid_transformer.utils.objectives.guacamol.objective import GUACAMOL_TASKS
from hybrid_transformer.utils.objectives.molecule_net.objective import MOLECULENET_REGRESSION_TASKS
from hybrid_transformer.models.prediction import PREDICTION_MODEL_CONFIGS

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:

task_config_path = lambda: f'./configs/tasks/molecule_net/{task}/config.json'
TASKS = MOLECULENET_REGRESSION_TASKS


for task in TASKS:
    print(task)
    task_config = TaskConfig.from_pretrained(task_config_path())
    train_dataset = AutoDataset.from_config(task_config, split='train')
    eval_dataset = AutoDataset.from_config(task_config, split='val')
    test_dataset = AutoDataset.from_config(task_config, split='test')
    tokenizer = AutoTokenizer.from_config(task_config)
    if task == 'lipo':
        break

esol
Downloading esol MoleculeNet task...
Random seed set to 0
Downloaded into ./data/molecule_net/esol
freesolv
Downloading freesolv MoleculeNet task...
Random seed set to 0
Downloaded into ./data/molecule_net/freesolv
lipo
Downloading lipo MoleculeNet task...
Random seed set to 0
Downloaded into ./data/molecule_net/lipo


In [46]:
smiles_truncated = [smiles for smiles in train_dataset.data if len(smiles) < 126]


In [48]:
len(smiles_truncated)

3354

In [15]:
train_dataset.target.mean()

tensor(-1.8590e-09)

In [16]:
test_dataset.target.max()

tensor(1.6039)

In [17]:
test_dataset.target.min()

tensor(-1.5694)

In [18]:
test_dataset.target.mean()

tensor(0.0384)

In [20]:
import pandas as pd

data = pd.read_csv('SAMPL.csv')

In [22]:
target = data['calc']

In [24]:
target.min()

-21.762

In [25]:
import math

math.sqrt(0.02)

0.1414213562373095

In [32]:
from urllib.request import urlretrieve

# https://moleculenet.org/datasets-1





('./data/molecule_net/esol/raw.csv',
 <http.client.HTTPMessage at 0x7f1de78bc550>)

In [39]:
import pandas as pd



In [40]:
data = df['smiles']

In [41]:
target = df['measured log solubility in mols per litre']

In [97]:
MOLECULENET_REGRESSION_TASKS = {
    'esol': 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv',
    'freesolv': 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/SAMPL.csv',
    'lipo': 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv'
}

                         
def load_freesolv_dataset():
    data_path = './data/molecule_net/'
    target_label = 'freesolv'
    data_path = os.path.join(data_path, target_label)
    filename = os.path.join(data_path, 'raw.csv')
    os.makedirs(data_path, exist_ok=True)
    urlretrieve(MOLECULENET_REGRESSION_TASKS[target_label], filename)
    df = pd.read_csv(filename)
    df.rename(columns={"smiles": "X", "calc": "y"}, inplace=True)
    #df.drop(df.columns.difference(['X','y']), 1, inplace=True)
    return df.loc[:, ['X', 'y']]

def load_esol_dataset():
    data_path = './data/molecule_net/'
    target_label = 'esol'
    data_path = os.path.join(data_path, target_label)
    filename = os.path.join(data_path, 'raw.csv')
    os.makedirs(data_path, exist_ok=True)
    urlretrieve(MOLECULENET_REGRESSION_TASKS[target_label], filename)
    df = pd.read_csv(filename)
    df.rename(columns={"smiles": "X", "measured log solubility in mols per litre": "y"}, inplace=True)
    return df.loc[:, ['X', 'y']]

def load_lipo_dataset():
    data_path = './data/molecule_net/'
    target_label = 'lipo'
    data_path = os.path.join(data_path, target_label)
    filename = os.path.join(data_path, 'raw.csv')
    os.makedirs(data_path, exist_ok=True)
    urlretrieve(MOLECULENET_REGRESSION_TASKS[target_label], filename)
    df = pd.read_csv(filename)
    return df

In [98]:
df = load_lipo_dataset()

In [99]:
df

Unnamed: 0,CMPD_CHEMBLID,exp,smiles
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4,CHEMBL565079,3.10,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...
...,...,...,...
4195,CHEMBL496929,3.85,OCCc1ccc(NC(=O)c2cc3cc(Cl)ccc3[nH]2)cc1
4196,CHEMBL199147,3.21,CCN(C1CCN(CCC(c2ccc(F)cc2)c3ccc(F)cc3)CC1)C(=O...
4197,CHEMBL15932,2.10,COc1cccc2[nH]ncc12
4198,CHEMBL558748,2.65,Clc1ccc2ncccc2c1C(=O)NCC3CCCCC3


In [64]:
from sklearn.model_selection import train_test_split

df, test = train_test_split(df, test_size=0.1, random_state=0)
train, val = train_test_split(df, test_size=0.1 / 0.9, random_state=0) # correct for df size change

In [66]:
len(train)

512

In [67]:
len(val)

65

In [68]:
len(test)

65

In [102]:
import deepchem as dc
from deepchem.feat.molecule_featurizers.raw_featurizer import RawFeaturizer

featurizer = RawFeaturizer(smiles=True)
splitter = 'random'
task, datasets, transform = dc.molnet.load_delaney(featurizer=featurizer, splitter=splitter)

In [107]:
transform[0].mean()

AttributeError: 'NormalizationTransformer' object has no attribute 'mean'

In [108]:
len(None)

TypeError: object of type 'NoneType' has no len()