In [4]:
# Read data
import pandas as pd
raw_df = pd.read_csv(".cache/gdb13.1M.freq.ll.smi", header=None, sep='\t')
raw_df.head()

Unnamed: 0,0,1,2
0,C1=Cc2cc1nnc1snc(o2)-o-1,0,68.182535
1,N1C2C3C4C5NC6C7C6C5(C13)C2N47,0,67.352869
2,c1c2c[nH]c(nn3cnc(c#1)c3)-s-2,0,65.054106
3,N=c1-c2cnn-1cnccc(=O)c2,0,62.522982
4,C=Nn1-c2cccconc-1[nH]c2,0,59.586299


In [5]:
# Generate canonical smiles and molecular weights
from rdkit import Chem
from rdkit.Chem import Descriptors
# 标准化
raw_df['SMILES'] = raw_df[0].apply(Chem.CanonSmiles)
# 计算分子量
GetMW = lambda x: Descriptors.MolWt(Chem.MolFromSmiles(x))
raw_df['MW'] = raw_df['SMILES'].apply(GetMW)
raw_df.head()

Unnamed: 0,0,1,2,SMILES,MW
0,C1=Cc2cc1nnc1snc(o2)-o-1,0,68.182535,C1=CC2=NN=C3OC(=NS3)OC1=C2,193.187
1,N1C2C3C4C5NC6C7C6C5(C13)C2N47,0,67.352869,N1C2C3C4C5NC6C7C6C5(C13)C2N47,173.219
2,c1c2c[nH]c(nn3cnc(c#1)c3)-s-2,0,65.054106,C1#Cc2cn(cn2)N=C2NC=C1S2,188.215
3,N=c1-c2cnn-1cnccc(=O)c2,0,62.522982,N=C1C2=CC(=O)C=CN=CN1N=C2,174.163
4,C=Nn1-c2cccconc-1[nH]c2,0,59.586299,C=NN1C2=CNC1=NOC=CC=C2,176.179


In [6]:
# save intermediate results
raw_df.to_pickle('.cache/raw_df.pickle')
# reload intermediate results
raw_df = pd.read_pickle('.cache/raw_df.pickle')

# 分数据集

In [8]:
import numpy as np
np.random.seed(42)

shuffled_index = np.random.permutation(len(raw_df))
train_idx = shuffled_index[:int(len(raw_df)*0.8)]
val_idx = shuffled_index[int(len(raw_df)*0.8):int(len(raw_df)*0.9)]
test_idx = shuffled_index[int(len(raw_df)*0.9):]

Note that T5Chem for regression was initially designed for predicting reaction yield. Therefore, here I applied min-max scaling to the whole dataset based on train samples:

In [9]:
class Scaler(object):
    def __init__(self, data, span=(0,100)):
        self.min = np.min(data)
        self.max = np.max(data)
        self.span = span
        self.scale = (span[1]-span[0])/(self.max-self.min)
        self.min_ = span[0] - self.min * self.scale

    def encode(self, x):
        if x > self.max:
            return self.span[1]
        elif x < self.min:
            return self.span[0]
        x *= self.scale
        x += self.min_
        return x

    def decode(self,x):
        x -= self.min_
        x /= self.scale
        return x

In [10]:
# We obtained the scaler with ONLY training samples!
scaler = Scaler(raw_df['MW'][train_idx])
# Then apply it to whole dataset
raw_df['MW_converted'] = raw_df['MW'].apply(scaler.encode)

In [13]:
import os
import tqdm
os.makedirs("data/MW/", exist_ok=True)
train_source = open("data/MW/train.source", "w")
train_target = open("data/MW/train.target", "w")
val_source = open("data/MW/val.source", "w")
val_target = open("data/MW/val.target", "w")
test_source = open("data/MW/test.source", "w")
test_target = open("data/MW/test.target", "w")
with tqdm.tqdm(raw_df.iterrows(), total=len(raw_df)) as pbar:
    for i,item in pbar:
      if i in train_idx:
        print(item['SMILES'], file=train_source)
        print(item['MW_converted'], file=train_target)
      elif i in val_idx:
        print(item['SMILES'], file=val_source)
        print(item['MW_converted'], file=val_target)
      else:
        print(item['SMILES'], file=test_source)
        print(item['MW_converted'], file=test_target)
train_source.close()
train_target.close()
val_source.close()
val_target.close()
test_source.close()
test_target.close()

100%|██████████| 1000000/1000000 [07:31<00:00, 2213.76it/s]


# Download a pretrained model
- <https://yzhang.hpc.nyu.edu/T5Chem/models/simple_pretrain.tar.bz2>

```tar -xjvf simple_pretrain.tar.bz2```

# Train a model
在 GPU 上需要大约 20 mins

```shell
t5chem train --data_dir data/MW/ --output_dir model/ --pretrain models/pretrain/simple/ --task_type regression --num_epoch 1
```

# Make predictions
Even though we could have had a better model if we trained for longer time, due to time concern, let's just use the final checkpoint to make predictions.

In [None]:
!t5chem predict --data_dir t5chem/data/MW/ --model_dir model/ --prediction model/raw_predictions.csv
!tensorboard --logdir model/runs/

prediction: 100%|███████████████████████████| 1563/1563 [00:29<00:00, 53.11it/s]
MAE: 0.11072250106811524    RMSE: 0.20774499148766476    r2: 0.9987780558180899    r:0.9993888411514759


In [None]:
scaled_predictions = pd.read_csv("model/raw_predictions.csv")
scaled_predictions.head()

Unnamed: 0,target,prediction
0,50.52261,50.450325
1,50.547855,50.582577
2,53.039715,53.022133
3,51.767277,50.822006
4,50.494839,50.515278


Let's convert it back to its original scale:

In [None]:
scaled_predictions['True_target'] = scaled_predictions['target'].apply(scaler.decode)
scaled_predictions['True_prediction'] = scaled_predictions['prediction'].apply(scaler.decode)
scaled_predictions.head()

Unnamed: 0,target,prediction,True_target,True_prediction
0,50.52261,50.450325,176.179002,176.064477
1,50.547855,50.582577,176.219,176.274011
2,53.039715,53.022133,180.167003,180.139146
3,51.767277,50.822006,178.151003,176.653354
4,50.494839,50.515278,176.135003,176.167386


And take a look at its MAE/RMSE

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

MAE = mean_absolute_error(scaled_predictions['True_target'], scaled_predictions['True_prediction'])
MSE = mean_squared_error(scaled_predictions['True_target'], scaled_predictions['True_prediction'])
print("MAE: {}    RMSE: {}".format(MAE, MSE**0.5))

MAE: 0.17542430179227897    RMSE: 0.3291428547133963


Well, not bad! 

In this notebook, we successfully trained a T5Chem model on molecular weight prediciton. Now this T5Chem can predict molecular weight from SMILES! 


**Bugs**
- If you encounter any bugs, please report the issue to https://github.com/HelloJocelynLu/t5chem/issues
