In [1]:
# Imports
# Incorporate this https://github.com/seyonechithrananda/bert-loves-chemistry/blob/master/chemberta/utils/roberta_regression.py#L138


import os
import torch

from jointformer.configs.dataset import DatasetConfig
from jointformer.configs.tokenizer import TokenizerConfig
from jointformer.configs.model import ModelConfig
from jointformer.configs.trainer import TrainerConfig

from jointformer.utils.datasets.auto import AutoDataset
from jointformer.utils.tokenizers.auto import AutoTokenizer
from jointformer.models.auto import AutoModel
from jointformer.trainers.trainer import Trainer

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm
2024-07-10 13:48:35.271505: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-10 13:48:35.445266: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 13:48:35.445329: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-10 13:48:35.445358: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-10 13:48:3

In [2]:
if not torch.cuda.is_available():
    raise RuntimeError("GPU not detected.")

In [3]:
# Configs

REPOSITORY_DIR = '/home/adamizdebski/projects/jointformer'
DATA_DIR = '/home/adamizdebski/files/data'
OUTPUT_DIR = '/home/adamizdebski/files/jointformer/results/chemberta2/moleculenet'

PATH_TO_DATASET_CONFIG   = '/home/adamizdebski/projects/jointformer/configs/datasets/molecule_net/lipo'
PATH_TO_TOKENIZER_CONFIG = '/home/adamizdebski/projects/jointformer/configs/tokenizers/chemberta'
PATH_TO_CHEMBERTA_CONFIG = '/home/adamizdebski/projects/jointformer/configs/models/chemberta'
PATH_TO_MODEL_CONFIG = '/home/adamizdebski/projects/jointformer/configs/models/jointformer_test'
PATH_TO_TRAINER_CONFIG = '/home/adamizdebski/projects/jointformer/configs/trainers/finetune'

In [4]:
os.chdir(REPOSITORY_DIR)

In [5]:
dataset_config = DatasetConfig.from_config_file(PATH_TO_DATASET_CONFIG)
tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)

train_dataset = AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split='train')
val_dataset = AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split='val')
test_dataset = AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split='test')

tokenizer = AutoTokenizer.from_config(tokenizer_config)

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/adamizdebski/miniconda3/envs/jointformer-experiments/lib/python3.9/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [29]:
model_config = ModelConfig.from_config_file(PATH_TO_CHEMBERTA_CONFIG)
model = AutoModel.from_config(model_config)

Some weights of RobertaForRegression were not initialized from the model checkpoint at DeepChem/ChemBERTa-5M-MLM and are newly initialized: ['regression.dense.bias', 'regression.dense.weight', 'regression.out_proj.bias', 'regression.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
trainer_config = TrainerConfig.from_config_file(PATH_TO_TRAINER_CONFIG)

trainer = Trainer(
    config=trainer_config,
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    tokenizer=tokenizer
    )
trainer._init_data_loaders()

In [37]:
inputs = trainer.get_training_batch()

with torch.no_grad():
    outputs = model.get_loss(**inputs, return_dict=True)

In [38]:
outputs

{'loss': tensor(0.8823, device='cuda:0'),
 'Y_pred': tensor([[-0.0263],
         [-0.0886],
         [ 0.1256],
         [-0.0776],
         [-0.0566],
         [-0.0208],
         [ 0.0161],
         [ 0.0104],
         [-0.0599],
         [-0.1337],
         [-0.1310],
         [-0.1062],
         [ 0.0072],
         [-0.0688],
         [ 0.0245],
         [-0.0207]], device='cuda:0')}

In [40]:
trainer.test()

1.1602163160049317

In [14]:
# Create output directory

ckpt_dir = os.path.join(OUTPUT_DIR, 'checkpoints')
eval_dir = os.path.join(OUTPUT_DIR, 'eval')

if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)

if not os.path.exists(eval_dir):
    os.makedirs(eval_dir)

# set the parameters
EPOCHS = 2
BATCH_SIZE = 2
patience = 15
learning_rate = 0.00001
manual_seed = 112

wandb_kwargs = {'name' : 'chemberta2-moleculenet-esol'}

model_args = {
    'evaluate_each_epoch': True,
    'evaluate_during_training_verbose': True,
    'evaluate_during_training' : True,
    'best_model_dir' : ckpt_dir,
    'no_save': False,
    'save_eval_checkpoints': False,
    'save_model_every_epoch': False,
    'save_best_model' : True,
    'save_steps': -1,
    'num_train_epochs': EPOCHS,
    'use_early_stopping': True,
    'early_stopping_patience': patience,
    'early_stopping_delta': 0.001,
    'early_stopping_metrics': 'eval_loss',
    'early_stopping_metrics_minimize': True,
    'early_stopping_consider_epochs' : True,
    'fp16' : False,
    'optimizer' : "AdamW",
    'adam_betas' : (0.95, 0.999),
    'learning_rate' : learning_rate,
    'manual_seed': manual_seed,
    'train_batch_size' : BATCH_SIZE,
    'eval_batch_size' : BATCH_SIZE,
    'logging_steps' : 2,
    'auto_weights': True, # change to true
    'wandb_project': 'chemberta',
    'wandb_kwargs': wandb_kwargs
    }

In [15]:


dataset_config = DatasetConfig.from_config_file(PATH_TO_DATASET_CONFIG)

train_dataset = AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split='train', seed=0, num_samples=100)
train_df = train_dataset.get_data_frame()

val_dataset = AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split='train', seed=0, num_samples=100)
val_df = val_dataset.get_data_frame()


In [7]:
train_dataset[0]

('CCc1c(C)[nH]c2c1C(=O)C(CN1CCOCC1)CC2', tensor([1]))

In [8]:
model = ClassificationModel('roberta', 'DeepChem/ChemBERTa-5M-MLM', args=model_args)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-5M-MLM and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
results = model.train_model(train_df, eval_df=val_df, output_dir=ckpt_dir)

0it [27:03, ?it/s]


KeyboardInterrupt: 

In [17]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-5M-MLM")
model = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-5M-MLM")



In [18]:
tokenizer(train_df)

KeyError: 0

In [10]:
train_dataset[0]

('CCc1c(C)[nH]c2c1C(=O)C(CN1CCOCC1)CC2', tensor([1.], dtype=torch.float64))

In [3]:
import numpy as np
import torch





In [4]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
dataset["train"][100]

tensor([23, 21,  1, 32,  3,  6, 24, 29,  4,  3, 15, 33, 21, 13, 23,  4, 14, 28,
        15,  2, 33,  1,  6, 19, 22, 11,  8, 27, 10,  2, 30, 16])

In [16]:
import numpy as np

# Create a random numpy array of shape (3, 4)
random_array = np.random.randn(10000, 200).astype(np.float32)
random_array.tofile('random_array.bin')



In [17]:
ix = torch.randint(100 - 64, (32,))
data = np.memmap('random_array.bin', dtype=np.float32, mode='r')

In [19]:
data[ix]

array([-0.37802866, -0.36042753,  0.3643469 ,  0.3643469 , -1.0327613 ,
        0.8884671 , -1.2017971 ,  2.5359983 ,  0.3449528 , -0.88895214,
        0.40260556,  1.2032892 , -0.36042753, -0.3175705 ,  1.4858595 ,
        1.4858595 ,  2.5359983 , -0.89773315, -0.88895214,  0.13865697,
       -0.89773315, -0.3175705 ,  0.3643469 ,  0.14209104, -0.26975426,
        0.3643469 ,  1.4858595 ,  0.13865697, -0.89773315, -0.36042753,
        0.5191463 , -1.0327613 ], dtype=float32)

In [8]:
from jointformer.models.chemberta2 import RobertaForRegression
from transformers import RobertaConfig, RobertaTokenizerFast, Trainer, TrainingArguments

%load_ext autoreload
%autoreload 2

2024-07-10 11:25:42.070521: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-10 11:25:42.101455: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 11:25:42.101491: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-10 11:25:42.101497: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-10 11:25:42.107749: I tensorflow/core/platform/cpu_feature_g

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
PRETRAINED_MODEL_NAME_OR_PATH = "DeepChem/ChemBERTa-5M-MLM"

In [10]:
config = RobertaConfig.from_pretrained(
    PRETRAINED_MODEL_NAME_OR_PATH
)
config.num_labels = 1



In [11]:
model = RobertaForRegression.from_pretrained("DeepChem/ChemBERTa-5M-MLM", config=config)

Some weights of RobertaForRegression were not initialized from the model checkpoint at DeepChem/ChemBERTa-5M-MLM and are newly initialized: ['regression.dense.bias', 'regression.dense.weight', 'regression.out_proj.bias', 'regression.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
torch.manual_seed(FLAGS.seed)

tokenizer = RobertaTokenizerFast.from_pretrained(
    FLAGS.tokenizer_path, max_len=FLAGS.max_tokenizer_len, use_auth_token=True
)

finetune_datasets = get_finetune_datasets(dataset_name, tokenizer, is_molnet)

if FLAGS.pretrained_model_name_or_path:
    config = RobertaConfig.from_pretrained(
        FLAGS.pretrained_model_name_or_path, use_auth_token=True
    )
else:
    config = RobertaConfig(
        vocab_size=FLAGS.vocab_size,
        max_position_embeddings=FLAGS.max_position_embeddings,
        num_attention_heads=FLAGS.num_attention_heads,
        num_hidden_layers=FLAGS.num_hidden_layers,
        type_vocab_size=FLAGS.type_vocab_size,
        is_gpu=torch.cuda.is_available(),
    )

if dataset_type == "classification":
    model_class = RobertaForSequenceClassification
    config.num_labels = finetune_datasets.num_labels

elif dataset_type == "regression":
    model_class = RobertaForRegression
    config.num_labels = 1
    config.norm_mean = finetune_datasets.norm_mean
    config.norm_std = finetune_datasets.norm_std

state_dict = prune_state_dict(FLAGS.pretrained_model_name_or_path)

In [None]:
"""Script for finetuning and evaluating pre-trained ChemBERTa models on MoleculeNet tasks.

[classification]
python finetune.py --datasets=bbbp --pretrained_model_name_or_path=DeepChem/ChemBERTa-SM-015

[regression]
python finetune.py --datasets=delaney --pretrained_model_name_or_path=DeepChem/ChemBERTa-SM-015

[csv]
python finetune.py --datasets=$HOME/finetune_datasets/logd/ \
                --dataset_types=regression \
                --pretrained_model_name_or_path=DeepChem/ChemBERTa-SM-015 \
                --is_molnet=False

[multiple]
python finetune.py \
--datasets=bace_classification,bace_regression,bbbp,clearance,clintox,delaney,lipo,tox21 \
--pretrained_model_name_or_path=DeepChem/ChemBERTa-SM-015 \
--n_trials=20 \
--output_dir=finetuning_experiments \
--run_name=sm_015

[from scratch (no pretraining)]
python finetune.py --datasets=bbbp

"""


In [5]:

import json
import os
import shutil
from collections import OrderedDict
from dataclasses import dataclass
from glob import glob
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from absl import app, flags

# from chemberta.utils.molnet_dataloader import get_dataset_info, load_molnet_dataset

from jointformer.models.chemberta2 import RobertaForRegression

from scipy.special import softmax
from scipy.stats import pearsonr
from sklearn.metrics import (
    average_precision_score,
    matthews_corrcoef,
    mean_squared_error,
    roc_auc_score,
)
from transformers import RobertaConfig, RobertaTokenizerFast, Trainer, TrainingArguments
from transformers.trainer_callback import EarlyStoppingCallback

FLAGS = flags.FLAGS

# Settings
flags.DEFINE_string(name="output_dir", default="default_dir", help="")
flags.DEFINE_boolean(name="overwrite_output_dir", default=True, help="")
flags.DEFINE_string(name="run_name", default="default_run", help="")
flags.DEFINE_integer(name="seed", default=0, help="Global random seed.")

# Model params
flags.DEFINE_string(
    name="pretrained_model_name_or_path",
    default=None,
    help="Arg to HuggingFace model.from_pretrained(). Can be either a path to a local model or a model ID on HuggingFace Model Hub. If not given, trains a fresh model from scratch (non-pretrained).",
)
flags.DEFINE_boolean(
    name="freeze_base_model",
    default=False,
    help="If True, freezes the parameters of the base model during training. Only the classification/regression head parameters will be trained. (Only used when `pretrained_model_name_or_path` is given.)",
)
flags.DEFINE_boolean(
    name="is_molnet",
    default=True,
    help="If true, assumes all dataset are MolNet datasets.",
)

# RobertaConfig params (only for non-pretrained models)
flags.DEFINE_integer(name="vocab_size", default=600, help="")
flags.DEFINE_integer(name="max_position_embeddings", default=515, help="")
flags.DEFINE_integer(name="num_attention_heads", default=6, help="")
flags.DEFINE_integer(name="num_hidden_layers", default=6, help="")
flags.DEFINE_integer(name="type_vocab_size", default=1, help="")

# Train params
flags.DEFINE_integer(name="logging_steps", default=10, help="")
flags.DEFINE_integer(name="early_stopping_patience", default=3, help="")
flags.DEFINE_integer(name="num_train_epochs_max", default=10, help="")
flags.DEFINE_integer(name="per_device_train_batch_size", default=64, help="")
flags.DEFINE_integer(name="per_device_eval_batch_size", default=64, help="")
flags.DEFINE_integer(
    name="n_trials",
    default=5,
    help="Number of different hyperparameter combinations to try. Each combination will result in a different finetuned model.",
)
flags.DEFINE_integer(
    name="n_seeds",
    default=5,
    help="Number of unique random seeds to try. This only applies to the final best model selected after hyperparameter tuning.",
)

# Dataset params
flags.DEFINE_list(
    name="datasets",
    default=None,
    help="Comma-separated list of MoleculeNet dataset names.",
)
flags.DEFINE_string(
    name="split", default="scaffold", help="DeepChem data loader split_type."
)
flags.DEFINE_list(
    name="dataset_types",
    default=None,
    help="List of dataset types (ex: classification,regression). Include 1 per dataset, not necessary for MoleculeNet datasets.",
)

# Tokenizer params
flags.DEFINE_string(
    name="tokenizer_path",
    default="seyonec/SMILES_tokenized_PubChem_shard00_160k",
    help="",
)
flags.DEFINE_integer(name="max_tokenizer_len", default=512, help="")

flags.mark_flag_as_required("datasets")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"



2024-07-09 18:19:51.096423: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-09 18:19:51.552885: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-09 18:19:51.552991: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-09 18:19:51.553010: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-09 18:19:51.744987: I tensorflow/core/platform/cpu_feature_g

In [None]:

def main(argv):
    if FLAGS.pretrained_model_name_or_path is None:
        print(
            "`WARNING: pretrained_model_name_or_path` is None - training a model from scratch."
        )
    else:
        print(
            f"Instantiating pretrained model from: {FLAGS.pretrained_model_name_or_path}"
        )

    is_molnet = FLAGS.is_molnet

    # Check that CSV dataset has the proper flags
    if not is_molnet:
        print("Assuming each dataset is a folder containing CSVs...")
        assert (
            len(FLAGS.dataset_types) > 0
        ), "Please specify dataset types for csv datasets"
        for dataset_folder in FLAGS.datasets:
            assert os.path.exists(os.path.join(dataset_folder, "train.csv"))
            assert os.path.exists(os.path.join(dataset_folder, "valid.csv"))
            assert os.path.exists(os.path.join(dataset_folder, "test.csv"))

    for i in range(len(FLAGS.datasets)):
        dataset_name_or_path = FLAGS.datasets[i]
        dataset_name = get_dataset_name(dataset_name_or_path)
        dataset_type = (
            get_dataset_info(dataset_name)["dataset_type"]
            if is_molnet
            else FLAGS.dataset_types[i]
        )

        run_dir = os.path.join(FLAGS.output_dir, FLAGS.run_name, dataset_name)

        if os.path.exists(run_dir) and not FLAGS.overwrite_output_dir:
            print(f"Run dir already exists for dataset: {dataset_name}")
        else:
            print(f"Finetuning on {dataset_name}")
            finetune_single_dataset(
                dataset_name_or_path, dataset_type, run_dir, is_molnet
            )



In [None]:

def prune_state_dict(model_dir):
    """Remove problematic keys from state dictionary"""
    if not (model_dir and os.path.exists(os.path.join(model_dir, "pytorch_model.bin"))):
        return None

    state_dict_path = os.path.join(model_dir, "pytorch_model.bin")
    assert os.path.exists(
        state_dict_path
    ), f"No `pytorch_model.bin` file found in {model_dir}"
    loaded_state_dict = torch.load(state_dict_path)
    state_keys = loaded_state_dict.keys()
    keys_to_remove = [
        k for k in state_keys if k.startswith("regression") or k.startswith("norm")
    ]

    new_state_dict = OrderedDict({**loaded_state_dict})
    for k in keys_to_remove:
        del new_state_dict[k]
    return new_state_dict



In [1]:
TOKENIZER_PATH = "DeepChem/SmilesTokenizer_PubChem_1M"
PRETRAINED_MODEL_NAME_OR_PATH = "DeepChem/ChemBERTa-5M-MLM"
MAX_TOKENIZER_LEN = 512


tokenizer = RobertaTokenizerFast.from_pretrained(
        TOKENIZER_PATH
    )

NameError: name 'RobertaTokenizerFast' is not defined

In [None]:
# finetune_datasets = get_finetune_datasets(dataset_name, tokenizer, is_molnet) TODO: check compatibility with new datasets

In [17]:
config = RobertaConfig.from_pretrained(
    PRETRAINED_MODEL_NAME_OR_PATH
)
config.num_labels = 1
config.norm_mean = finetune_datasets.norm_mean
config.norm_std = finetune_datasets.norm_std

In [None]:

if dataset_type == "classification":
    model_class = RobertaForSequenceClassification
elif dataset_type == "regression":
    model_class = RobertaForRegression

        if FLAGS.pretrained_model_name_or_path:
            model = model_class.from_pretrained(
                FLAGS.pretrained_model_name_or_path,
                config=config,
                state_dict=state_dict,
                use_auth_token=True,
            )
            if FLAGS.freeze_base_model:
                for name, param in model.base_model.named_parameters():
                    param.requires_grad = False
        else:
            model = model_class(config=config)

        return model

In [None]:

def finetune_single_dataset(dataset_name, dataset_type, run_dir, is_molnet):
    torch.manual_seed(FLAGS.seed)

    if dataset_type == "classification":
        model_class = RobertaForSequenceClassification
        config.num_labels = finetune_datasets.num_labels

    elif dataset_type == "regression":
        model_class = RobertaForRegression
        config.num_labels = 1
        config.norm_mean = finetune_datasets.norm_mean
        config.norm_std = finetune_datasets.norm_std

    state_dict = prune_state_dict(FLAGS.pretrained_model_name_or_path)

    def model_init():
        if dataset_type == "classification":
            model_class = RobertaForSequenceClassification
        elif dataset_type == "regression":
            model_class = RobertaForRegression

        if FLAGS.pretrained_model_name_or_path:
            model = model_class.from_pretrained(
                FLAGS.pretrained_model_name_or_path,
                config=config,
                state_dict=state_dict,
                use_auth_token=True,
            )
            if FLAGS.freeze_base_model:
                for name, param in model.base_model.named_parameters():
                    param.requires_grad = False
        else:
            model = model_class(config=config)

        return model

    training_args = TrainingArguments(
        evaluation_strategy="epoch",
        output_dir=run_dir,
        overwrite_output_dir=FLAGS.overwrite_output_dir,
        per_device_eval_batch_size=FLAGS.per_device_eval_batch_size,
        logging_steps=FLAGS.logging_steps,
        load_best_model_at_end=True,
        report_to=None,
    )

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=finetune_datasets.train_dataset,
        eval_dataset=finetune_datasets.valid_dataset,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=FLAGS.early_stopping_patience)
        ],
    )

    def custom_hp_space_optuna(trial):
        return {
            "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
            "num_train_epochs": trial.suggest_int(
                "num_train_epochs", 1, FLAGS.num_train_epochs_max
            ),
            "seed": trial.suggest_int("seed", 1, 40),
            "per_device_train_batch_size": trial.suggest_categorical(
                "per_device_train_batch_size", [FLAGS.per_device_train_batch_size]
            ),
        }

    best_trial = trainer.hyperparameter_search(
        backend="optuna",
        direction="minimize",
        hp_space=custom_hp_space_optuna,
        n_trials=FLAGS.n_trials,
    )

    # Set parameters to the best ones from the hp search
    for n, v in best_trial.hyperparameters.items():
        setattr(trainer.args, n, v)

    dir_valid = os.path.join(run_dir, "results", "valid")
    dir_test = os.path.join(run_dir, "results", "test")
    os.makedirs(dir_valid, exist_ok=True)
    os.makedirs(dir_test, exist_ok=True)

    metrics_valid = {}
    metrics_test = {}

    # Run with several seeds so we can see std
    for random_seed in range(FLAGS.n_seeds):
        setattr(trainer.args, "seed", random_seed)
        trainer.train()
        metrics_valid[f"seed_{random_seed}"] = eval_model(
            trainer,
            finetune_datasets.valid_dataset_unlabeled,
            dataset_name,
            dataset_type,
            dir_valid,
            random_seed,
        )
        metrics_test[f"seed_{random_seed}"] = eval_model(
            trainer,
            finetune_datasets.test_dataset,
            dataset_name,
            dataset_type,
            dir_test,
            random_seed,
        )

    with open(os.path.join(dir_valid, "metrics.json"), "w") as f:
        json.dump(metrics_valid, f)
    with open(os.path.join(dir_test, "metrics.json"), "w") as f:
        json.dump(metrics_test, f)

    # Delete checkpoints from hyperparameter search since they use a lot of disk
    for d in glob(os.path.join(run_dir, "run-*")):
        shutil.rmtree(d, ignore_errors=True)


def eval_model(trainer, dataset, dataset_name, dataset_type, output_dir, random_seed):
    labels = dataset.labels
    predictions = trainer.predict(dataset)
    fig = plt.figure(dpi=144)

    if dataset_type == "classification":
        if len(np.unique(labels)) <= 2:
            y_pred = softmax(predictions.predictions, axis=1)[:, 1]
            metrics = {
                "roc_auc_score": roc_auc_score(y_true=labels, y_score=y_pred),
                "average_precision_score": average_precision_score(
                    y_true=labels, y_score=y_pred
                ),
            }
            sns.histplot(x=y_pred, hue=labels)
        else:
            y_pred = np.argmax(predictions.predictions, axis=-1)
            metrics = {"mcc": matthews_corrcoef(labels, y_pred)}

    elif dataset_type == "regression":
        y_pred = predictions.predictions.flatten()
        metrics = {
            "pearsonr": pearsonr(y_pred, labels),
            "rmse": mean_squared_error(y_true=labels, y_pred=y_pred, squared=False),
        }
        sns.regplot(x=y_pred, y=labels)
        plt.xlabel("ChemBERTa predictions")
        plt.ylabel("Ground truth")
    else:
        raise ValueError(dataset_type)

    plt.title(f"{dataset_name} {dataset_type} results")
    plt.savefig(os.path.join(output_dir, f"results_seed_{random_seed}.png"))

    return metrics


def get_finetune_datasets(dataset_name, tokenizer, is_molnet):
    if is_molnet:
        tasks, (train_df, valid_df, test_df), _ = load_molnet_dataset(
            dataset_name, split=FLAGS.split, df_format="chemprop"
        )
        assert len(tasks) == 1
    else:
        train_df = pd.read_csv(os.path.join(dataset_name, "train.csv"))
        valid_df = pd.read_csv(os.path.join(dataset_name, "valid.csv"))
        test_df = pd.read_csv(os.path.join(dataset_name, "test.csv"))

    train_dataset = FinetuneDataset(train_df, tokenizer)
    valid_dataset = FinetuneDataset(valid_df, tokenizer)
    valid_dataset_unlabeled = FinetuneDataset(valid_df, tokenizer, include_labels=False)
    test_dataset = FinetuneDataset(test_df, tokenizer, include_labels=False)

    num_labels = len(np.unique(train_dataset.labels))
    norm_mean = [np.mean(np.array(train_dataset.labels), axis=0)]
    norm_std = [np.std(np.array(train_dataset.labels), axis=0)]

    return FinetuneDatasets(
        train_dataset,
        valid_dataset,
        valid_dataset_unlabeled,
        test_dataset,
        num_labels,
        norm_mean,
        norm_std,
    )


def get_dataset_name(dataset_name_or_path):
    return os.path.splitext(os.path.basename(dataset_name_or_path))[0]


@dataclass
class FinetuneDatasets:
    train_dataset: str
    valid_dataset: torch.utils.data.Dataset
    valid_dataset_unlabeled: torch.utils.data.Dataset
    test_dataset: torch.utils.data.Dataset
    num_labels: int
    norm_mean: List[float]
    norm_std: List[float]


class FinetuneDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, include_labels=True):

        self.encodings = tokenizer(df["smiles"].tolist(), truncation=True, padding=True)
        self.labels = df.iloc[:, 1].values
        self.include_labels = include_labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.include_labels and self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])



In [None]:
# Source: https://github.com/seyonechithrananda/bert-loves-chemistry/blob/master/chemberta/finetune/finetune.py 