In [1]:
# inner module import
import sys
sys.path.append("/storage/homefs/yc24j783/datacat4ml/datacat4ml")
from const import FETCH_DATA_DIR, FETCH_FIG_DIR, FEATURIZE_DATA_DIR, FEATURIZE_FIG_DIR

from simpletransformers.classification import ClassificationModel, ClassificationArgs

import logging

import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Data

In [23]:
import glob

# Find all files starting with 'ki_target_CHEMBL233' in the FEATURIZE_DATA_DIR
file_pattern = os.path.join(FEATURIZE_DATA_DIR, 'ki_maxcur', 'ki_target_CHEMBL233*.pkl')
file_list = glob.glob(file_pattern)

# for file 'ki_target_CHEMBL233_1_fp.pkl', add a column 'dataset' with value 'ki_mor_1'; In a similar fashion, for all other files, add a column 'dataset' with value 'ki_mor_2' for file 'ki_target_CHEMBL233_2_fp.pkl'. Finally, concatenate all the dataframes into a single dataframe ki_mor_df.
ki_mor_df = pd.DataFrame()
for file in file_list:
    df = pd.read_pickle(file)
    dataset = os.path.basename(file).split('_')[3]
    print(dataset)
    df['dataset'] = dataset

    ki_mor_df = pd.concat([ki_mor_df, df], axis=0)

1
1022
38
639
308
36
379
1632
2942


In [25]:
ki_mor_1_df = ki_mor_df[ki_mor_df['dataset'] == '1']

## ki_mor_1_df

In [None]:
def split_data(df):
    # Split the data into training, validation and test sets
    import random

    # split the data using random sampling
    df['set'] = ''
    smis = list(df['canonical_smiles'].values)
    random_20per = random.sample(smis, int(0.2*len(smis)))
    random_10per_1 = random_20per[0:int(0.1*len(smis))]
    random_10per_2 = random_20per[int(0.1*len(smis)):len(random_20per)]

    # assaign the set values
    df.loc[df['canonical_smiles'].isin(random_10per_1), 'set'] = 'VAL'
    df.loc[df['canonical_smiles'].isin(random_10per_2), 'set'] = 'TEST'
    df.loc[df['set'] == '', 'set'] = 'TRAIN'

    # check the distribution of the set
    print(f'The lengthe of the df is {len(ki_mor)} )
    df['set'].value_counts().plot(kind='bar')



In [17]:
import random

# split the data using random sampling
ki_mor_1_df['set'] = ''
smis = list(ki_mor_1_df['canonical_smiles'].values)
random_20per = random.sample(smis, int(0.2*len(smis)))
random_10per_1 = random_20per[0:int(0.1*len(smis))]
random_10per_2 = random_20per[int(0.1*len(smis)):len(random_20per)]

# Assign the set
ki_mor_1_df.loc[ki_mor_1_df['canonical_smiles'].isin(random_10per_1), 'set'] = 'VAL'
ki_mor_1_df.loc[ki_mor_1_df['canonical_smiles'].isin(random_10per_2), 'set'] = 'TEST'
ki_mor_1_df.loc[ki_mor_1_df['set'] == '', 'set'] = 'TRAIN'

# check the distribution of the set
print(f'The length of the ki_mor_1_df is {len(ki_mor_1_df)}')
ki_mor_1_df['set'].value_counts()

The length of the ki_mor_1_df is 298


set
TRAIN    226
TEST      42
VAL       30
Name: count, dtype: int64

In [5]:
train_y = list(ki_mor_1_df[ki_mor_1_df['set'] == 'TRAIN']['pchembl_value'].values)
eval_y = list(ki_mor_1_df[ki_mor_1_df['set'] == 'VAL']['pchembl_value'].values)

# x: canonical_smiles
train_smi_x = list(ki_mor_1_df[ki_mor_1_df['set'] == 'TRAIN']['canonical_smiles'].values)
train_smi_df = pd.DataFrame({'text': train_smi_x, 'labels': train_y})

eval_smi_x = list(ki_mor_1_df[ki_mor_1_df['set'] == 'VAL']['canonical_smiles'].values)
eval_smi_df = pd.DataFrame({'text': eval_smi_x, 'labels': eval_y})

# x: canonical_smiles + assay_id (concatenate the two columns by a space)
# convert the dtype of assay_id to str
ki_mor_1_df['assay_id'] = ki_mor_1_df['assay_id'].astype(str)
train_smi_aid_x = ki_mor_1_df[ki_mor_1_df['set'] == 'TRAIN'][['canonical_smiles', 'assay_id']].apply(lambda x: ' '.join(x), axis=1).values
train_smi_aid_df = pd.DataFrame({'text': train_smi_aid_x, 'labels': train_y})

eval_smi_aid_x = ki_mor_1_df[ki_mor_1_df['set'] == 'VAL'][['canonical_smiles', 'assay_id']].apply(lambda x: ' '.join(x), axis=1).values
eval_smi_aid_df = pd.DataFrame({'text': eval_smi_aid_x, 'labels': eval_y})

# x: canonical_smiles + assay_desc (concatenate the two columns by a space)
train_smi_ad_x = ki_mor_1_df[ki_mor_1_df['set'] == 'TRAIN'][['canonical_smiles', 'assay_desc']].apply(lambda x: ' '.join(x), axis=1).values
train_smi_ad_df = pd.DataFrame({'text': train_smi_ad_x, 'labels': train_y})

eval_smi_ad_x = ki_mor_1_df[ki_mor_1_df['set'] == 'VAL'][['canonical_smiles', 'assay_desc']].apply(lambda x: ' '.join(x), axis=1).values
eval_smi_ad_df = pd.DataFrame({'text': eval_smi_ad_x, 'labels': eval_y})

# x: canonical_smiles + vectorized assay-fields, including columns: assay_type, assay_organism, assay_category, assay_tax_id, assay_strain, assay_tissue, assay_cell_type, assay_subcellular_fraction, bao_format.
# load the vectorized assay-fields
smi_afs = ['canonical_smiles', 'assay_type', 'assay_category', 'assay_organism', 'assay_tax_id', 'assay_strain', 'assay_tissue', 'assay_cell_type',
           'assay_subcellular_fraction', 'bao_format', 'assay_test_type']
train_smi_afs_x = ki_mor_1_df[ki_mor_1_df['set'] == 'TRAIN'][smi_afs].apply(lambda x: ' '.join(x), axis=1).values
train_smi_afs_df = pd.DataFrame({'text': train_smi_afs_x, 'labels': train_y})

eval_smi_afs_x = ki_mor_1_df[ki_mor_1_df['set'] == 'VAL'][smi_afs].apply(lambda x: ' '.join(x), axis=1).values
eval_smi_afs_df = pd.DataFrame({'text': eval_smi_afs_x, 'labels': eval_y})


## ki_mor_df

In [18]:
import random

# split the data using random sampling
ki_mor_df['set'] = ''
smis = list(ki_mor_df['canonical_smiles'].values)
random_20per = random.sample(smis, int(0.2*len(smis)))
random_10per_1 = random_20per[0:int(0.1*len(smis))]
random_10per_2 = random_20per[int(0.1*len(smis)):len(random_20per)]

# Assign the set
ki_mor_df.loc[ki_mor_df['canonical_smiles'].isin(random_10per_1), 'set'] = 'VAL'
ki_mor_df.loc[ki_mor_df['canonical_smiles'].isin(random_10per_2), 'set'] = 'TEST'
ki_mor_df.loc[ki_mor_df['set'] == '', 'set'] = 'TRAIN'

# check the distribution of the set
print(f'The length of the ki_mor_df is {len(ki_mor_df)}')
ki_mor_df['set'].value_counts()

The length of the ki_mor_df is 3821


set
TRAIN    2790
TEST      577
VAL       454
Name: count, dtype: int64

In [8]:
train_y = list(ki_mor_df[ki_mor_df['set'] == 'TRAIN']['pchembl_value'].values)
eval_y = list(ki_mor_df[ki_mor_df['set'] == 'VAL']['pchembl_value'].values)

# x: canonical_smiles
train_smi_x = list(ki_mor_df[ki_mor_df['set'] == 'TRAIN']['canonical_smiles'].values)
train_smi_df = pd.DataFrame({'text': train_smi_x, 'labels': train_y})

eval_smi_x = list(ki_mor_df[ki_mor_df['set'] == 'VAL']['canonical_smiles'].values)
eval_smi_df = pd.DataFrame({'text': eval_smi_x, 'labels': eval_y})

# x: canonical_smiles + assay_id (concatenate the two columns by a space)
# convert the dtype of assay_id to str
ki_mor_df['assay_id'] = ki_mor_df['assay_id'].astype(str)
train_smi_aid_x = ki_mor_df[ki_mor_df['set'] == 'TRAIN'][['canonical_smiles', 'assay_id']].apply(lambda x: ' '.join(x), axis=1).values
train_smi_aid_df = pd.DataFrame({'text': train_smi_aid_x, 'labels': train_y})

eval_smi_aid_x = ki_mor_df[ki_mor_df['set'] == 'VAL'][['canonical_smiles', 'assay_id']].apply(lambda x: ' '.join(x), axis=1).values
eval_smi_aid_df = pd.DataFrame({'text': eval_smi_aid_x, 'labels': eval_y})

# x: canonical_smiles + assay_desc (concatenate the two columns by a space)
train_smi_ad_x = ki_mor_df[ki_mor_df['set'] == 'TRAIN'][['canonical_smiles', 'assay_desc']].apply(lambda x: ' '.join(x), axis=1).values
train_smi_ad_df = pd.DataFrame({'text': train_smi_ad_x, 'labels': train_y})

eval_smi_ad_x = ki_mor_df[ki_mor_df['set'] == 'VAL'][['canonical_smiles', 'assay_desc']].apply(lambda x: ' '.join(x), axis=1).values
eval_smi_ad_df = pd.DataFrame({'text': eval_smi_ad_x, 'labels': eval_y})


In [13]:
# x: canonical_smiles + vectorized assay-fields, including columns: assay_type, assay_organism, assay_category, assay_tax_id, assay_strain, assay_tissue, assay_cell_type, assay_subcellular_fraction, bao_format.
# load the vectorized assay-fields
smi_afs = ['canonical_smiles', 'assay_type', 'assay_category', 'assay_organism', 'assay_tax_id', 'assay_strain', 'assay_tissue', 'assay_cell_type',
           'assay_subcellular_fraction', 'bao_format', 'assay_test_type']
# change the dtype of all columns in smi_afs to str
for col in smi_afs:
    ki_mor_df[col] = ki_mor_df[col].astype(str)

train_smi_afs_x = ki_mor_df[ki_mor_df['set'] == 'TRAIN'][smi_afs].apply(lambda x: ' '.join(x), axis=1).values
train_smi_afs_df = pd.DataFrame({'text': train_smi_afs_x, 'labels': train_y})

eval_smi_afs_x = ki_mor_df[ki_mor_df['set'] == 'VAL'][smi_afs].apply(lambda x: ' '.join(x), axis=1).values
eval_smi_afs_df = pd.DataFrame({'text': eval_smi_afs_x, 'labels': eval_y})

# Model

In [10]:
# Enabling regression
# Setting optional model configuration
model_args = ClassificationArgs()
model_args.num_train_epochs = 1
model_args.regression = True

# create a ClassificationModel
model = ClassificationModel(
    "roberta", # 'bert' in rxn_yields. model_type in rxnfp, that is the type of model (bert, xlnet, roberta, distilbert)
    "roberta-base", # model_path in rxn_yields. model_name in rxnfp, that is the exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
    num_labels=1,
    use_cuda=False,
    args=model_args
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## ki_mor_1_df

In [21]:
# Train the model
model.train_model(train_smi_df, output_dir='outputs/ki_mor_smi_x')
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval_smi_df)
print(f'result: \n{result}')
print(f'model_outputs: \n{model_outputs}')
print(f'wrong_predictions: \n{wrong_predictions}')
# calculate the RMSE
from sklearn.metrics import mean_squared_error
import numpy as np
rmse = np.sqrt(mean_squared_error(eval_smi_df['labels'].values, model_outputs))
print(f'RMSE: {rmse}')

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
6it [00:00,  7.28it/s]                       
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_128_1_2
Epochs 1/1. Running Loss:    4.3050: 100%|██████████| 349/349 [04:34<00:00,  1.27it/s]
Epoch 1 of 1: 100%|██████████| 1/1 [04:35<00:00, 275.70s/it]
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/ki_mor_smi_x.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
1it [00:00,  3.41it/s]
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_1_2
Running Evaluation: 100%|██████████| 5/5 [00:09<00:00,  1.94s/it]
INFO:simpletransformers.classification.classification_model:{'eval_loss': 2.040379047393799}


result: 
{'eval_loss': 2.040379047393799}
model_outputs: 
[7.70826721 7.70834446 7.70834398 7.70826244 7.70826244 7.7086668
 7.70841742 7.70826817 7.70853758 7.7088294  7.70866156 7.7086668
 7.70876884 7.7088151  7.70879984 7.70850754 7.70795202 7.70882082
 7.70870447 7.70819807 7.70839214 7.70789289 7.70844841 7.70894957
 7.70816422 7.70765924 7.70763636 7.70763636 7.7080369  7.70781994
 7.70861673 7.70815945 7.70835352 7.70850754 7.70876884 7.70882082
 7.70870447 7.70882797 7.70867729 7.70822001 7.70855379 7.70854616
 7.70842791 7.70853949 7.70824337 7.708673   7.70842791 7.70870447
 7.70879984 7.70850754 7.70870447 7.70779753 7.70772505 7.7077136
 7.70801449 7.70779848 7.70790815 7.70789957 7.70775747 7.70777178
 7.70774317 7.70792627 7.70791388 7.70842981 7.70792627 7.70842981
 7.70792627 7.70790339 7.70790052 7.70777082 7.70791101 7.7078557
 7.70779467 7.70843983 7.70869112 7.70862198 7.70785093 7.70774412
 7.70771742 7.70765543 7.70771313 7.70878553 7.7087779  7.70850754
 7.70873

In [11]:
# Train the model
model.train_model(train_smi_aid_df, output_dir='outputs/ki_mor_smi_aid_x')
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval_smi_aid_df)
print(f'result: \n{result}')
print(f'model_outputs: \n{model_outputs}')
print(f'wrong_predictions: \n{wrong_predictions}')
# calculate the RMSE
from sklearn.metrics import mean_squared_error
import numpy as np
rmse = np.sqrt(mean_squared_error(eval_smi_aid_df['labels'].values, model_outputs))
print(f'RMSE: {rmse}')

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
6it [00:00,  7.66it/s]                       
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_128_1_2
Epochs 1/1. Running Loss:    1.0648: 100%|██████████| 349/349 [04:36<00:00,  1.26it/s]
Epoch 1 of 1: 100%|██████████| 1/1 [04:37<00:00, 277.70s/it]
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/ki_mor_smi_aid_x.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
1it [00:00,  2.71it/s]
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_1_2
Running Evaluation: 100%|██████████| 5/5 [00:09<00:00,  1.89s/it]
INFO:simpletransformers.classification.classification_model:{'eval_loss': 1.5672023296356201}


result: 
{'eval_loss': 1.5672023296356201}
model_outputs: 
[7.48551941 8.14619923 8.26920891 8.34817886 8.33243561 7.08587646
 7.12619781 7.17888021 8.342309   8.26287651 8.37784863 8.33140087
 8.24748135 8.38541412 8.0808506  8.26342773 6.99802876 8.41786385
 8.09758186 8.04785538 8.06368637 8.35424232 8.35785389 8.38245964
 8.38469982 8.4062252  8.40786266 7.0135994  8.37469101 8.39299202
 8.39057064 8.19141388 8.21610832 7.06190109 7.05266047 7.05932045
 7.05932045 7.45863628 7.11658525 8.26284885 8.10824394 8.28118324
 8.37160397 8.08831501 8.39303589 8.39827251 8.30471134 7.74361944
 8.10598183 8.39433861 8.29758453 7.95125437 8.05582619 8.04395771
 8.09529114 8.03678703 8.01906395 7.94683313 8.26097679 8.23239231
 8.25300789 8.2905674  7.10239315 8.09194851 7.08905458 7.01616096
 7.01849079 6.99045992 8.33907318 7.80759335 7.01523018 7.01696682
 7.01887131 7.0171442  6.99228191 7.01235104 8.25656509 6.98409891
 6.98426914 7.07482719 7.05627346 7.05068302 7.04822826 7.05542946
 7.

In [12]:
# Train the model
model.train_model(train_smi_ad_df, output_dir='outputs/ki_mor_smi_ad_x')
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval_smi_ad_df)
print(f'result: \n{result}')
print(f'model_outputs: \n{model_outputs}')
print(f'wrong_predictions: \n{wrong_predictions}')
# calculate the RMSE
from sklearn.metrics import mean_squared_error
import numpy as np
rmse = np.sqrt(mean_squared_error(eval_smi_ad_df['labels'].values, model_outputs))
print(f'RMSE: {rmse}')

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
6it [00:00,  7.50it/s]                       
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_128_1_2
Epochs 1/1. Running Loss:    1.2685: 100%|██████████| 349/349 [04:27<00:00,  1.30it/s]
Epoch 1 of 1: 100%|██████████| 1/1 [04:28<00:00, 268.55s/it]
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/ki_mor_smi_ad_x.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
1it [00:00,  1.96it/s]
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_1_2
Running Evaluation: 100%|██████████| 5/5 [00:09<00:00,  1.84s/it]
INFO:simpletransformers.classification.classification_model:{'eval_loss': 1.1669396042823792}


result: 
{'eval_loss': 1.1669396042823792}
model_outputs: 
[6.37689495 7.79378748 7.91503763 8.82103062 8.81110954 7.3471241
 7.46457434 7.79846048 8.81555843 8.57633781 8.79423046 8.75148201
 8.49980736 8.75492382 8.72309494 8.58260441 7.12696791 8.82726383
 8.37564278 8.34118557 8.7642765  8.77909374 8.78821468 8.81554317
 8.81979084 8.81051254 8.82398796 7.06418514 8.79404736 8.82190132
 8.82196617 8.44206333 8.53530407 7.06912565 6.89646292 7.04895544
 7.04895544 7.65059233 7.06499243 8.64051723 8.70685387 8.7142477
 8.81810856 8.78200531 8.80018044 8.8006382  8.66011143 8.49785042
 8.76144886 8.79952526 8.62983227 8.29719257 8.28411961 8.24314404
 7.79825878 8.19754696 7.8092618  8.29719257 8.63471699 8.79545021
 8.79761887 8.78919888 8.22507286 8.78116512 7.90908384 6.18106413
 6.11310148 6.48041105 8.20433998 8.42690086 6.40110826 6.38848782
 6.56353617 6.53579569 6.18955183 6.20285654 8.33470154 6.66324377
 6.60930109 7.18931675 6.89047575 6.86070204 6.49888945 7.02067804
 6.67

In [14]:
# Train the model
model.train_model(train_smi_afs_df, output_dir='outputs/ki_mor_smi_afs_x')
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval_smi_afs_df)
print(f'result: \n{result}')
print(f'model_outputs: \n{model_outputs}')
print(f'wrong_predictions: \n{wrong_predictions}')
# calculate the RMSE
from sklearn.metrics import mean_squared_error
import numpy as np
rmse = np.sqrt(mean_squared_error(eval_smi_afs_df['labels'].values, model_outputs))
print(f'RMSE: {rmse}')

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
6it [00:00,  6.62it/s]                       
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_128_1_2
Epochs 1/1. Running Loss:    1.0923: 100%|██████████| 349/349 [04:37<00:00,  1.26it/s]
Epoch 1 of 1: 100%|██████████| 1/1 [04:38<00:00, 278.30s/it]
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/ki_mor_smi_afs_x.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
1it [00:00,  3.45it/s]
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_1_2
Running Evaluation: 100%|██████████| 5/5 [00:10<00:00,  2.01s/it]
INFO:simpletransformers.classification.classification_model:{'eval_loss': 1.1557834029197693}


result: 
{'eval_loss': 1.1557834029197693}
model_outputs: 
[6.83245516 8.40316772 7.72252989 8.72615528 8.35665607 7.05392933
 7.04999876 7.11484766 8.43216324 8.21472645 9.04643059 9.07679749
 8.34632206 9.0495882  9.02111721 8.21472645 6.73025322 9.07475376
 8.87486458 8.30542469 8.30074406 8.31183624 8.54761982 9.03816605
 9.1000948  8.51847172 8.51847172 6.87887287 9.06630611 9.07322121
 9.06899738 8.63253403 9.06204033 7.06935406 7.06451702 7.21083975
 7.21083975 7.58873892 7.0893259  8.20062542 9.06758595 8.84051323
 9.04205704 9.05147648 9.02997589 9.04839325 8.8228569  8.77469349
 8.86613178 9.04839325 9.07914639 8.24199581 8.49408436 8.52519417
 7.99969673 8.54445744 7.74535179 8.24199581 8.20062542 8.18353081
 8.34386253 8.86629581 7.93690634 9.05147648 7.65126228 6.75280857
 6.75234652 6.9076066  6.96726799 8.2215519  6.85886431 6.85784435
 6.92342377 6.8932991  6.74633694 6.78868151 7.91699982 6.89037275
 6.89037275 7.34697819 7.33468962 7.29852724 6.91043568 7.13277149
 7.

## ki_mor_df