# Run Joint Learning Benchmark

In [1]:
# Move one directory up so the relative ./configs, ./data and ./results paths used below resolve.
# NOTE(review): not idempotent - running this cell a second time moves up one more directory.
%cd ..

/home/adam/Projects/hybrid-transformer


In [5]:
import os
import torch
import wandb

from hybrid_transformer.configs.task import TaskConfig
from hybrid_transformer.configs.model import ModelConfig
from hybrid_transformer.configs.trainer import TrainerConfig
from hybrid_transformer.configs.logger import LoggerConfig

from hybrid_transformer.utils.datasets.auto import AutoDataset
from hybrid_transformer.utils.tokenizers.auto import AutoTokenizer
from hybrid_transformer.models.auto import AutoModel
from hybrid_transformer.utils.loggers.wandb import WandbLogger

from hybrid_transformer.trainers.trainer import Trainer

from scripts.pretrain.train import DEFAULT_CONFIG_FILES

from hybrid_transformer.utils.objectives.guacamol.objective import GUACAMOL_TASKS
from hybrid_transformer.models.prediction import PREDICTION_MODEL_CONFIGS
# Enable IPython's autoreload so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Load configs and run the test pass of every prediction model on each task.

def task_config_path(task_name=None):
    """Build the path to a MoleculeNet task config file.

    Args:
        task_name: Name of the task directory under
            ./configs/tasks/molecule_net. When omitted, the current value of
            the global loop variable `guacamol_task` is used, which keeps the
            original zero-argument call working.

    Returns:
        The relative path of the task's config.json, as a string.
    """
    if task_name is None:
        task_name = guacamol_task
    return f'./configs/tasks/molecule_net/{task_name}/config.json'


# Output of trainer.test for every run, keyed by (task name, model name).
# `results` is still rebound on each iteration, so after the loops it only
# holds the output of the last run; use this dict to compare models.
results_by_run = {}

# The loop variable keeps its original name `guacamol_task`, although the task
# configs are read from the molecule_net directory.
for guacamol_task in ['lipo']:

    task_config = TaskConfig.from_pretrained(task_config_path(guacamol_task))
    task_config.validate = False
    task_config.split = 'test'
    task_config.num_samples = 100
    dataset = AutoDataset.from_config(task_config)
    print(f"Loaded {task_config.target_label} data.")

    for model_name, path_to_model_config in PREDICTION_MODEL_CONFIGS.items():

        model_config = ModelConfig.from_pretrained(path_to_model_config)
        trainer_config = TrainerConfig.from_pretrained('./configs/trainers/debug/')
        logger_config = LoggerConfig.from_pretrained(DEFAULT_CONFIG_FILES['logger'])

        # Trainer overrides: per-run output directory, no checkpoints.
        out_dir = f'./results/regression_task/guacamol/{model_name}/{guacamol_task}'
        trainer_config.out_dir = out_dir
        trainer_config.enable_save_checkpoint = False

        # Logger overrides: one W&B run per (model, task) in the 'debug' project.
        logger_config.name = model_name + '_' + guacamol_task
        logger_config.project = 'debug'
        logger_config.wandb_log = True

        # NOTE(review): the dataset is reloaded for every model, as in the
        # original cell. The task-level load above looks sufficient if loading
        # is deterministic and the trainer does not mutate the dataset -
        # confirm before hoisting.
        task_config.validate = False
        dataset = AutoDataset.from_config(task_config)
        tokenizer = AutoTokenizer.from_config(task_config)
        model = AutoModel.from_config(model_config)
        logger = WandbLogger(logger_config, [task_config, model_config, trainer_config])
        trainer = Trainer(
            config=trainer_config,
            model=model,
            train_dataset=dataset,
            eval_dataset=dataset,
            tokenizer=tokenizer,
            logger=logger,
        )

        results = trainer.test(dataset)
        results_by_run[(guacamol_task, model_name)] = results
        

        

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead
Downloading lipo MoleculeNet task...
Random seed set to 0
Downloaded into ./data/molecule_net/lipo
Loaded lipo data.
number of parameters: 38.06M
tokens per iteration will be: 512
Using cuda device
Random seed set to 1337
num decayed parameter tensors: 63, with 38,115,840 parameters
num non-decayed parameter tensors: 25, with 12,800 parameters
using fused AdamW: True
Compiling model..




number of parameters: 38.06M
tokens per iteration will be: 512
Using cuda device
Random seed set to 1337
num decayed parameter tensors: 63, with 38,115,840 parameters
num non-decayed parameter tensors: 25, with 12,800 parameters
using fused AdamW: True
Compiling model..




KeyboardInterrupt: 

In [11]:
# First element of the trainer.test output from the last run;
# presumably the model predictions - confirm against Trainer.test.
predictions = results[0]

In [12]:
# Second element of the trainer.test output from the last run;
# presumably the ground-truth targets - confirm against Trainer.test.
targets = results[1]

In [16]:
# Sanity check: inspect the shape of the predictions.
torch.Tensor(predictions).shape

torch.Size([3354])

In [17]:
# Sanity check: inspect the shape of the targets (should match the predictions).
torch.Tensor(targets).shape

torch.Size([3354])

In [20]:
# Inspect the target transforms attached to the loaded dataset.
dataset.target_transforms

In [25]:
# Inspect the target transforms attached to the loaded dataset.
dataset.target_transforms

In [27]:
import deepchem as dc
from deepchem.feat.molecule_featurizers.raw_featurizer import RawFeaturizer

In [30]:
# Load the lipo dataset directly through DeepChem to see what the loader returns.
featurizer = RawFeaturizer(smiles=True)
splitter = 'random'
target_transforms = None  # placeholder; assigned from `c` in a later cell
# `c` is a list holding a NormalizationTransformer (see the output further down).
# Presumably `a` holds the task names and `b` the dataset splits - confirm
# against the DeepChem documentation.
a, b, c = dc.molnet.load_lipo(featurizer=featurizer, splitter=splitter)

In [33]:
# Check that the third value returned by load_lipo is not None.
c is not None

True

In [34]:
# Keep whatever load_lipo returned as its third value. The original
# `c if c is not None else None` evaluated to `c` in both branches, so the
# conditional was redundant.
target_transforms = c

In [35]:
# Display the value now stored in target_transforms.
target_transforms

[<deepchem.trans.transformers.NormalizationTransformer at 0x7f8e91352750>]

In [9]:
import torch

# Arithmetic mean of three hand-entered values, built with the tensor factory
# function instead of the legacy constructor.
torch.mean(torch.tensor([0.571, 0.914, 0.573]))

tensor(0.6860)

In [10]:
from hybrid_transformer.utils.datasets.utils import load_txt_into_list

# Read the GuacaMol training SMILES file into `data`.
# NOTE(review): this path is relative to the working directory and starts with
# `../`, while the first cell runs `%cd ..` and the MoleculeNet data was
# downloaded into ./data/... - confirm the path still resolves after a
# restart-and-run-all.
data = load_txt_into_list('../data/guacamol/train/smiles.txt')

In [11]:
# Number of entries loaded from the SMILES file.
len(data)

1273104