In [None]:
# conda env: datacat(Python 3.8.20)
#--> from clamp.dataset import InMemoryClamp
#--> from clamp import utils
#--> from clamp.utils import set_device

import os
import pandas as pd
from loguru import logger
import random
import torch
import numpy as np

import mlflow
import argparse
import wandb
from time import time

from pathlib import Path

In [2]:
# `clamp/clamp/utils.py`
def seed_everything(seed=70135): 
    """does what it says ;) - from https://gist.github.com/KirillVladimirov/005ec7f762293d2321385580d3dbe335"""
    import numpy as np
    import random
    import os
    import torch

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed) # set the seed for hash-based operations
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed) # if using GPU
    torch.backends.cudnn.deterministic = True # if using GPU

def set_device(gpu=0):
    "Set device to gpu or cpu."
    if gpu=='any':
        gpu = 0 # The GPU numbre on device is ususally starting from 0
    if torch.cuda.is_available():
        device = torch.device(f'cuda:{gpu}')
    else:
        device = torch.device('cpu')
    return device

# `prep_fsmol.py`

# `def parse_args_override`

In [2]:
def parse_args(): 
    parser = argparse.ArgumentParser('Train and test a single run of clip-gpcr model. Overrides arguments from hyperparam-file')
    parser.add_argument('-f', type=str) # ? what does 'f' mean?
    parser.add_argument('--dataset', type=str, default='./data/fsmol/', help='Path to a prepared dataset directory.') # <--
    parser.add_argument('--assay_mode', type=str, default='lsa', help='Type of assay features ("clip", "biobert", or "lsa").') # -->
    parser.add_argument('--compound_mode', type=str, default='morganc+rdkc', help='Type of compound features (default:morgan+rdkc)') # <-->
    parser.add_argument('--hparams', type=str, default='./hparams/clip-gpcr.json', help='Path to a hyperparameter to use in training clip-gpcr (json, yml)') # <-->

    parser.add_argument('--checkpoint', help='Path to a checkpoint file to load model weights from.', metavar='') 
    parser.add_argument('--experiment', type=str, default='debug', help='Name of MLflow experiment where to assign this run.', metavar='') 
    parser.add_argument('--random', action='store_true', help='Forget about the specified model and run a random baseline.') #?

    parser.add_argument('--gpu', type=str, default="0", help='GPU number to use. Default: 0', metavar='')
    parser.add_argument('--seed', type=int, default=None, help='seed everything with provided seed, default no seed')
    
    parser.add_argument('--split', type=str, default='time_a_c', help='split-type default: time_a_c for time based assay and compound split, other options: time_a, time_c, random_{seed}m or column of activity.parquet triplet') # <-->
    parser.add_argument('--support_set_size', type=int, default='0', help='per task how many to add from test- as well as valid- to the train-set default=0 = zero-shot') #?
    parser.add_argument('--train_only_actives', action='store_true', help='train only with active molecules')
    parser.add_argument('--drop_cidx_path', type=str, default=None, help='Path to a file containing a np of cidx (NOT CIDs) to drop from the dataset.')

    parser.add_argument('--verbose', '-v', type=int, default=0, help='verbosity level default=0')
    parser.add_argument('--wandb','-w', action='store_true', help='Log to wandb')
    parser.add_argument('--bf16', action='store_true', help='use bfloat16 for training')

    args = parser.parse_args()

    return args

# `def setup_dataset`

In [16]:
dataset = './data/pubchem23'

In [18]:
dataset = Path(dataset)
dataset

PosixPath('data/pubchem23')

# `def main`

In [3]:
class Args:
    def __init__(self):
        self.experiment = 'debug'
        self.seed = None
        self.wandb = False
        self.split = 'time_a_c' # <--
        self.assay_mode = 'lsa' # <--
        self.gpu = '0'
        self.verbose = 0
        self.checkpoint = None
        
args = Args()

In [4]:
hparams = args.__dict__
hparams

{'experiment': 'debug',
 'seed': None,
 'wandb': False,
 'split': 'time_a_c',
 'assay_mode': 'lsa',
 'gpu': '0',
 'verbose': 0,
 'checkpoint': None}

In [5]:
mlflow.set_experiment(args.experiment)

if args.seed:
    seed_everything(args.seed)
    logger.info(f'Seeded everything with {args.seed}')

clamp_dl, train_idx, valid_idx, test_idx = setup_dataset(**args.__dict__)
# ensure that there is no overlap between the splits.
assert set(train_idx).intersection(set(valid_idx)) == set()
assert set(train_idx).intersection(set(test_idx)) == set()

if args.wandb:
    runname = args.experiment + args.split[-1]+ args.assay_mode[-1]
    
    runname += ''.join([chr(random.randrange(97, 97 + 26)) for _ in range(3)]) # random 3 letter suffix
    wandb.init(project='clip-gpcr', entity='yu', name=runname, config=args.__dict__)

device = set_device(gpu=args.gpu)

metrics_df = pd.DataFrame()

NameError: name 'setup_dataset' is not defined

In [6]:
try:
    # start a new MLflow run
    with mlflow.start_run():
        # retrieve the run info
        mlflowi = mlflow.active_run().info

    if args.checkpoint is not None:
        # set a tag in the MLflow run to indicate that we are resuming training
        mlflow.set_tag(
            'mlflow.note.content',
            f'Resumed training from {args.checkpoint}.'
        )

    if 'assay_mode' in hparams:
        if hparams['assay_mode'] != args.assay_mode:
            # 2 reasons to check the above:
            # a previous hyperparameter set could be already saved
            # the command-line argument could be merged with a experiment parameters from a tool like MLflow

            logger.warning(f'Assay features are {args.assay_mode} in command line but \"{hparams["assay_mode"]}\" in hyperparameter file.')
            logger.warning('Command line {args.assay_mode} is the prevailing option.')
            hparams['assay_mode'] = args.assay_mode
    else: # if not in hparams, log it using mlflow
        mlflow.log_param('assay_mode', args.assay_mode) 
    mlflow.log_params(hparams) # log all hyperparameters to mlflow.

    metrics_df = utils.train_and_test(
        clamp_dl,
        train_idx=train_idx,
        valid_idx=valid_idx,
        test_idx=test_idx,
        hparams=hparams,
        run_info=mlflowi,
        checkpoint_file=args.checkpoint,
        device=device,
        bf16=args.bf16,
        verbose=args.verbose,
    )

except KeyboardInterrupt:
    logger.error('Training manually interrupted. Trying to test with last checkpoint.')
    metrics_df = utils.test(
        clamp_dl,
        train_idx=train_idx,
        test_idx=test_idx,
        hparams=hparams,
        run_info=mlflowi,
        device=device,
        verbose=args.verbose
    )

NameError: name 'utils' is not defined

# `if __name__ == '__main__:`

In [15]:
# args = parse_args_override()
run_id = str(time()).split('.')[0]
fn_postfix = f'{args.experiment}_{run_id}' 

if args.verbose>=1:
    logger.info('Run args: ', os.getcwd()+__file__, args.__dict__)

# main(args)