In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
# Absolute path obtained by resolving '../../' against the notebook's current
# working directory; used below as the repository root.
repo_dir = os.path.abspath('../../')
repo_dir

'/mnt/c/Users/nicol/Documents/GitHub/wsl_experimental_pridict2/experimental_pridict2'

In [4]:
import sys
# Put `repo_dir` at the front of the module search path so that the
# `import pridict` below searches the repository directory first.
sys.path.insert(0,repo_dir)
import pridict

In [5]:
# Path of the `dataset` folder directly under `repo_dir`.
# NOTE(review): `data_pth` is not referenced again in the visible cells; later
# cells rebuild dataset paths from `repo_dir` directly.
data_pth = os.path.join(repo_dir, 'dataset')
data_pth

'/mnt/c/Users/nicol/Documents/GitHub/wsl_experimental_pridict2/experimental_pridict2/dataset'

In [6]:
from pridict.pridictv2.utilities import *
from pridict.pridictv2.dataset import *
from pridict.pridictv2.run_workflow import *

In [7]:
# Print a summary of the CUDA devices available to this kernel.
# NOTE(review): the helper is not imported by name; presumably it comes from
# one of the star imports — confirm.
report_available_cuda_devices()

number of GPUs available: 1
cuda:0, name:NVIDIA GeForce GTX 1650
total memory available: 3.999755859375 GB
total memory allocated on device: 0.0 GB
max memory allocated on device: 0.0 GB
total memory cached on device: 0.0 GB
max memory cached  on device: 0.0 GB





### Generate datatensor partitions

In [8]:
# --- dataset selection --------------------------------------------------------
wsize = 20
outcome_suffix = 'clamped'
include_MFE = False
# Suffix of the alignment folder to read from, chosen by the MFE switch.
fsuffix = 'withMFE' if include_MFE else 'withoutMFE'

# use these to tune on 23K library HEK, K562
tfolder_name = 'proc_v2'
dsetnames_lst = ['HEK', 'K562']

# NOTE(review): `create_directory` is a project helper; its return value is
# used as the folder path below — confirm it returns the path it was given.
data_dir = create_directory(os.path.join(repo_dir, 'dataset', tfolder_name, f'align_{fsuffix}'))

# For every dataset, read its stored partitions and data tensor and combine
# them into partition data tensors. Results are kept in `dsetnames_lst` order.
# `data_partitions` and `dtensor_partitions` stay bound to the last dataset
# after the loop; later cells read them.
dtensor_partitions_lst = []
for outcome_name in dsetnames_lst:
    fname = f'dpartitions_{outcome_name}_{outcome_suffix}_wsize{wsize}.pkl'
    data_partitions = ReaderWriter.read_data(os.path.join(data_dir, fname))
    fname = f'dtensor_{outcome_name}_{outcome_suffix}_wsize{wsize}.pkl'
    dtensor = ReaderWriter.read_data(os.path.join(data_dir, fname))
    dtensor_partitions = generate_partition_datatensor(dtensor, data_partitions)
    dtensor_partitions_lst.append(dtensor_partitions)

# Regroup by run: run number -> list with one partition entry per dataset.
dtensor_partitions_multidata = {}
for run_num in range(5):
    dtensor_partitions_multidata[run_num] = [per_dset[run_num] for per_dset in dtensor_partitions_lst]

In [9]:
# Display the run -> per-dataset partition mapping for a quick visual check.
dtensor_partitions_multidata

{0: [{'train': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe02164d750>,
   'validation': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe02164d690>,
   'test': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe02164d5d0>},
  {'train': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe137142800>,
   'validation': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe0f165fdc0>,
   'test': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe02164fa90>}],
 1: [{'train': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe02164d510>,
   'validation': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe02164d450>,
   'test': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe02164d390>},
  {'train': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe02164eb00>,
   'validation': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe02164cf40>,
   'test': <pridict.pridictv2.dataset.PartitionDataTensor at 0x7fe02164e1a0>}],
 2: [{'train': <pridict.

### Define model and experiment configuration

In [10]:
# Map every run index to device index 0. The number of runs is taken from
# `data_partitions`, the variable left bound by the partition-loading loop.
run_gpu_map = dict.fromkeys(range(len(data_partitions)), 0)
run_gpu_map

{0: 0, 1: 0, 2: 0, 3: 0, 4: 0}

### Specify which layers to finetune 

In [11]:
# Layer-name stems that get one dataset-specific copy per entry of
# `dsetnames_lst` (names are built as '<stem>_<dataset>').
layername_stems = ('seqlevel_featembeder',
                   'decoder',
                   'global_featemb_init_attn',
                   'global_featemb_mut_attn',
                   'local_featemb_init_attn',
                   'local_featemb_mut_attn')

# Datasets form the outer loop so that all layers of one dataset are listed
# before those of the next.
trainable_layernames = [f'{lname}_{dsetname}'
                        for dsetname in dsetnames_lst
                        for lname in layername_stems]

trainable_layernames

['seqlevel_featembeder_HEK',
 'decoder_HEK',
 'global_featemb_init_attn_HEK',
 'global_featemb_mut_attn_HEK',
 'local_featemb_init_attn_HEK',
 'local_featemb_mut_attn_HEK',
 'seqlevel_featembeder_K562',
 'decoder_K562',
 'global_featemb_init_attn_K562',
 'global_featemb_mut_attn_K562',
 'local_featemb_init_attn_K562',
 'local_featemb_mut_attn_K562']

### Finetuning base models on 23K (HEK and K562) Library

#### Use a base model pre-trained on Library 1 data (Mathis et al.) - see Figure 1 n in the paper

In [12]:
# --- model / training hyperparameters ------------------------------------------
embed_dim = 128
z_dim = 72
num_hidden_layers = 2
bidirection=True
p_dropout = 0.15
# NOTE(review): `nn` and `torch` are never imported by name in this notebook;
# presumably they are re-exported by the star imports — confirm.
rnn_class = nn.GRU
nonlin_func = nn.ReLU()
l2_reg = 1e-5
# Batch size actually used for fine-tuning (the run log reports
# 'batch_size': 100). The commented-out 390k configuration uses 750.
batch_size = 100
num_epochs = 150
# loss_func = 'KLDloss'
loss_func = 'CEloss'
# loss_func = 'Huberloss'
# Positional hyperparameter list consumed by `build_config_map`; the order of
# the entries is kept exactly as before.
trf_tup = [embed_dim, z_dim,
           num_hidden_layers,
           bidirection, 
           p_dropout,
           rnn_class, nonlin_func,
           l2_reg, batch_size, num_epochs]
# `dtensor_partitions` is the variable left bound by the partition-loading
# loops (the entry of the last dataset); run 0's train split is used to count
# the sequence-level feature columns.
seqlevel_featdim = len(dtensor_partitions[0]['train'].pe_datatensor.seqlevel_feat_colnames)
num_t_outcomes = 3
default_outcomes = ['averageedited', 'averageunedited', 'averageindel']

experiment_options = {'experiment_desc':'pe_rnn_distribution_multidata',
                      'model_name':'PE_RNN_distribution_multidata',
                      'annot_embed':8,
                      'assemb_opt':'stack',
                      'loader_mode':'cycle',
                      'run_num':0,
                      'fdtype':torch.float32,
                      'wsize':wsize,
                      'datasets_name':dsetnames_lst,
                      'target_names': default_outcomes[:num_t_outcomes],
                      'base_model_suffix':None,
                      'separate_attention_layers':True,
                      'separate_seqlevel_embedder':True,
                      'seqlevel_featdim': seqlevel_featdim,
                      'trainable_layernames': trainable_layernames,
                      'num_outcomes':num_t_outcomes}
mconfig, options = build_config_map(trf_tup, experiment_options, loss_func=loss_func)

# provide the base model that will be used to fine-tune on the data
# we will use base_90k (pretrained on Library 1) to finetune by specifying the folder name where the trained base model is found
mfolder = 'exp_2023-06-02_09-49-21' # base_90k
model_type = 'base_90k'
trun = 1 # given that we have 5-fold training of base model we can specify which run to use

#### Use a base model pre-trained on Library 1 and Library-ClinVar data (Mathis et al. and Yu et al.) - see Figure 1 n in the paper

In [13]:
# #######
# ## uncomment this cell to use this configuration to finetune 390k base model
# #######

# ## 390k model retuning
# embed_dim = 128
# z_dim = 72
# num_hidden_layers = 2
# bidirection=True
# p_dropout = 0.15
# rnn_class = nn.GRU
# nonlin_func = nn.ReLU()
# l2_reg = 1e-5
# batch_size = 750
# num_epochs = 150
# # loss_func = 'KLDloss'
# loss_func = 'CEloss'
# # loss_func = 'Huberloss'
# trf_tup = [embed_dim, z_dim,
#            num_hidden_layers,
#            bidirection, 
#            p_dropout,
#            rnn_class, nonlin_func,
#            l2_reg, batch_size, num_epochs]
# seqlevel_featdim = len(dtensor_partitions[0]['train'].pe_datatensor.seqlevel_feat_colnames)
# default_outcomes = ['averageedited', 'averageunedited', 'averageindel']
# num_t_outcomes = 3
# experiment_options = {'experiment_desc':'pe_rnn_distribution_multidata',
#                       'model_name':'PE_RNN_distribution_multidata',
#                       'annot_embed':8,
#                       'assemb_opt':'stack',
#                       'loader_mode':'cycle',
#                       'run_num':0,
#                       'fdtype':torch.float32,
#                       'wsize':wsize,
#                       'datasets_name':dsetnames_lst,
#                       'target_names': default_outcomes[:num_t_outcomes],
#                       'base_model_suffix':'HEKschwank',
#                       'separate_attention_layers':True,
#                       'separate_seqlevel_embedder':True,
#                       'seqlevel_featdim': seqlevel_featdim,
#                       'trainable_layernames': trainable_layernames,
#                       'num_outcomes':num_t_outcomes}
# mconfig, options = build_config_map(trf_tup, experiment_options, loss_func=loss_func)
# ## the base model 390k to finetune
# mfolder = 'exp_2023-08-26_20-58-14' # folder name where pretrained model is found
# model_type = 'base_390k'
# trun = 1


In [14]:
import datetime
# Parent of the notebook's working directory, passed through `create_directory`.
# NOTE(review): `repo_path` is not referenced again in the visible cells.
repo_path = create_directory(os.path.abspath('../'))
# Every run of this experiment is stored under
# <repo_dir>/experiments/<experiment_desc>.
experiment_desc = experiment_options['experiment_desc']
exp_root = os.path.join(repo_dir, 'experiments', experiment_desc)
exp_dir = create_directory(exp_root)
exp_dir

'/mnt/c/Users/nicol/Documents/GitHub/wsl_experimental_pridict2/experimental_pridict2/experiments/pe_rnn_distribution_multidata'

### Run training/fine-tuning on the 5-folds


In [16]:
# mfolder = 'exp_2023-06-02_09-49-21' # base_90k **
# model_type = 'base_90k'
# trun = 1 

# mfolder = 'exp_2023-08-26_20-58-14' # base_390k **
# model_type = 'base_390k'
# trun = 1


# Number of epochs passed to `tune_trainval_run`. It lives in its own variable
# so that the notebook-level `num_epochs` set with the hyperparameters is not
# overwritten by running this cell. 2 is a `dry test` (i.e. fast check) value;
# raise it for a real fine-tuning run.
tune_num_epochs = 2

# Folder holding the per-run state dicts of the pre-trained base model.
trained_basemodel_dir = os.path.join(repo_dir,
                                     'trained_models',
                                     model_type,
                                     mfolder,
                                     'train_val')
config_map = (mconfig, options)
# Collects the output directory of every fine-tuning experiment started here;
# the evaluation cells iterate over this list.
trmodels_dir_lst = []
for base_model_run in [trun]:
    # One time-stamped output folder per base-model run.
    time_stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    tr_val_dir = create_directory(f'exp_{time_stamp}', exp_dir)
    state_dict_dir = os.path.join(trained_basemodel_dir,
                                  f'run_{base_model_run}',
                                  'model_statedict')
    trmodels_dir_lst.append(tr_val_dir)
    print('basemodel_run:', base_model_run)
    print('state_dict_dir:', state_dict_dir)
    print('tr_val_dir:', tr_val_dir)
    print()
    tune_trainval_run(dtensor_partitions_multidata,
                      config_map,
                      tr_val_dir,
                      state_dict_dir,
                      run_gpu_map,
                      num_epochs=tune_num_epochs)

basemodel_run: 1
state_dict_dir: /mnt/c/Users/nicol/Documents/GitHub/wsl_experimental_pridict2/experimental_pridict2/trained_models/base_90k/exp_2023-06-02_09-49-21/train_val/run_1/model_statedict
tr_val_dir: /mnt/c/Users/nicol/Documents/GitHub/wsl_experimental_pridict2/experimental_pridict2/experiments/pe_rnn_distribution_multidata/exp_2024-07-11_14-45-14

config_exp_2024-07-11_14-45-14
{'batch_size': 100, 'num_workers': 0, 'loader_mode': 'cycle', 'datasets_name': ['HEK', 'K562']}
loader_mode: cycle
datasets_name: ['HEK', 'K562']
loss_type: CEloss
datasets_name_lst: ['HEK', 'K562']
using separate attention layers!!
[[(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_HEK')], [(FeatureEmbAttention(
  (softmax

100%|██████████| 163/163 [01:33<00:00,  1.73it/s]


__perfmetric_report_multidata_cont__
# of rows: 32600
updated # of rows: 32500
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 0 | epoch: 0 | dsettype: validation | pid: 11698


100%|██████████| 19/19 [00:04<00:00,  3.81it/s]


__perfmetric_report_multidata_cont__
# of rows: 3674
updated # of rows: 3674
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 0 | epoch: 1 | dsettype: train | pid: 11698


100%|██████████| 163/163 [02:00<00:00,  1.35it/s]


__perfmetric_report_multidata_cont__
# of rows: 32600
updated # of rows: 32500
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 0 | epoch: 1 | dsettype: validation | pid: 11698


100%|██████████| 19/19 [00:03<00:00,  5.45it/s]


__perfmetric_report_multidata_cont__
# of rows: 3674
updated # of rows: 3674
config_exp_2024-07-11_14-45-14
{'batch_size': 100, 'num_workers': 0, 'loader_mode': 'cycle', 'datasets_name': ['HEK', 'K562']}
loader_mode: cycle
datasets_name: ['HEK', 'K562']
loss_type: CEloss
datasets_name_lst: ['HEK', 'K562']
using separate attention layers!!
[[(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_HEK')], [(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'glo

100%|██████████| 163/163 [01:41<00:00,  1.60it/s]


__perfmetric_report_multidata_cont__
# of rows: 32600
updated # of rows: 32600
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 1 | epoch: 0 | dsettype: validation | pid: 11698


100%|██████████| 19/19 [00:04<00:00,  3.93it/s]


__perfmetric_report_multidata_cont__
# of rows: 3699
updated # of rows: 3599
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 1 | epoch: 1 | dsettype: train | pid: 11698


100%|██████████| 163/163 [01:43<00:00,  1.57it/s]


__perfmetric_report_multidata_cont__
# of rows: 32600
updated # of rows: 32600
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 1 | epoch: 1 | dsettype: validation | pid: 11698


100%|██████████| 19/19 [00:06<00:00,  2.77it/s]


__perfmetric_report_multidata_cont__
# of rows: 3699
updated # of rows: 3599
config_exp_2024-07-11_14-45-14
{'batch_size': 100, 'num_workers': 0, 'loader_mode': 'cycle', 'datasets_name': ['HEK', 'K562']}
loader_mode: cycle
datasets_name: ['HEK', 'K562']
loss_type: CEloss
datasets_name_lst: ['HEK', 'K562']
using separate attention layers!!
[[(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_HEK')], [(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'glo

100%|██████████| 163/163 [01:37<00:00,  1.67it/s]


__perfmetric_report_multidata_cont__
# of rows: 32600
updated # of rows: 32500
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 2 | epoch: 0 | dsettype: validation | pid: 11698


100%|██████████| 19/19 [00:03<00:00,  5.34it/s]


__perfmetric_report_multidata_cont__
# of rows: 3642
updated # of rows: 3642
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 2 | epoch: 1 | dsettype: train | pid: 11698


100%|██████████| 163/163 [01:47<00:00,  1.52it/s]


__perfmetric_report_multidata_cont__
# of rows: 32600
updated # of rows: 32500
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 2 | epoch: 1 | dsettype: validation | pid: 11698


100%|██████████| 19/19 [00:06<00:00,  3.03it/s]


__perfmetric_report_multidata_cont__
# of rows: 3642
updated # of rows: 3642
config_exp_2024-07-11_14-45-14
{'batch_size': 100, 'num_workers': 0, 'loader_mode': 'cycle', 'datasets_name': ['HEK', 'K562']}
loader_mode: cycle
datasets_name: ['HEK', 'K562']
loss_type: CEloss
datasets_name_lst: ['HEK', 'K562']
using separate attention layers!!
[[(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_HEK')], [(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'glo

100%|██████████| 163/163 [01:54<00:00,  1.42it/s]


__perfmetric_report_multidata_cont__
# of rows: 32600
updated # of rows: 32500
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 3 | epoch: 0 | dsettype: validation | pid: 11698


100%|██████████| 19/19 [00:07<00:00,  2.43it/s]


__perfmetric_report_multidata_cont__
# of rows: 3665
updated # of rows: 3665
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 3 | epoch: 1 | dsettype: train | pid: 11698


100%|██████████| 163/163 [01:36<00:00,  1.69it/s]


__perfmetric_report_multidata_cont__
# of rows: 32600
updated # of rows: 32500
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 3 | epoch: 1 | dsettype: validation | pid: 11698


100%|██████████| 19/19 [00:04<00:00,  4.18it/s]


__perfmetric_report_multidata_cont__
# of rows: 3665
updated # of rows: 3665
config_exp_2024-07-11_14-45-14
{'batch_size': 100, 'num_workers': 0, 'loader_mode': 'cycle', 'datasets_name': ['HEK', 'K562']}
loader_mode: cycle
datasets_name: ['HEK', 'K562']
loss_type: CEloss
datasets_name_lst: ['HEK', 'K562']
using separate attention layers!!
[[(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_HEK')], [(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'glo

100%|██████████| 163/163 [02:00<00:00,  1.35it/s]


__perfmetric_report_multidata_cont__
# of rows: 32600
updated # of rows: 32500
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 4 | epoch: 0 | dsettype: validation | pid: 11698


100%|██████████| 19/19 [00:08<00:00,  2.37it/s]


__perfmetric_report_multidata_cont__
# of rows: 3632
updated # of rows: 3632
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 4 | epoch: 1 | dsettype: train | pid: 11698


100%|██████████| 163/163 [01:41<00:00,  1.61it/s]


__perfmetric_report_multidata_cont__
# of rows: 32600
updated # of rows: 32500
device: cuda:0 | experiment_desc: pe_rnn_distribution_multidata | run_num: 4 | epoch: 1 | dsettype: validation | pid: 11698


100%|██████████| 19/19 [00:05<00:00,  3.45it/s]


__perfmetric_report_multidata_cont__
# of rows: 3632
updated # of rows: 3632


### Run trained models on the test set of each fold

In [17]:
# Run `test_multidata_run` once for every experiment directory produced by the
# fine-tuning cell.
config_map = (mconfig, options)
for tr_val_dir in trmodels_dir_lst:
    print('evaluating modeldir:', tr_val_dir)
    # The experiment directory fills two consecutive positional slots.
    # NOTE(review): presumably one is the output location and the other the
    # location of the trained state — confirm against `test_multidata_run`.
    eval_args = (dtensor_partitions_multidata,
                 config_map,
                 tr_val_dir,
                 tr_val_dir,
                 run_gpu_map)
    test_multidata_run(*eval_args, num_epochs=1)

evaluating modeldir: /mnt/c/Users/nicol/Documents/GitHub/wsl_experimental_pridict2/experimental_pridict2/experiments/pe_rnn_distribution_multidata/exp_2024-07-11_14-45-14
{'batch_size': 100, 'num_workers': 0, 'loader_mode': 'cycle', 'datasets_name': ['HEK', 'K562']}
loader_mode: cycle
datasets_name: ['HEK', 'K562']
loss_type: CEloss
datasets_name_lst: ['HEK', 'K562']
using separate attention layers!!
[[(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_HEK')], [(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_K5

100%|██████████| 46/46 [00:15<00:00,  3.03it/s]


__perfmetric_report_multidata_cont__
# of rows: 9096
updated # of rows: 9096
{'batch_size': 100, 'num_workers': 0, 'loader_mode': 'cycle', 'datasets_name': ['HEK', 'K562']}
loader_mode: cycle
datasets_name: ['HEK', 'K562']
loss_type: CEloss
datasets_name_lst: ['HEK', 'K562']
using separate attention layers!!
[[(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_HEK')], [(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_K562')]]
u

100%|██████████| 46/46 [00:09<00:00,  4.92it/s]


__perfmetric_report_multidata_cont__
# of rows: 9075
updated # of rows: 9075
{'batch_size': 100, 'num_workers': 0, 'loader_mode': 'cycle', 'datasets_name': ['HEK', 'K562']}
loader_mode: cycle
datasets_name: ['HEK', 'K562']
loss_type: CEloss
datasets_name_lst: ['HEK', 'K562']
using separate attention layers!!
[[(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_HEK')], [(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_K562')]]
u

100%|██████████| 46/46 [00:07<00:00,  5.80it/s]


__perfmetric_report_multidata_cont__
# of rows: 9067
updated # of rows: 9067
{'batch_size': 100, 'num_workers': 0, 'loader_mode': 'cycle', 'datasets_name': ['HEK', 'K562']}
loader_mode: cycle
datasets_name: ['HEK', 'K562']
loss_type: CEloss
datasets_name_lst: ['HEK', 'K562']
using separate attention layers!!
[[(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_HEK')], [(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_K562')]]
u

100%|██████████| 46/46 [00:08<00:00,  5.51it/s]


__perfmetric_report_multidata_cont__
# of rows: 9079
updated # of rows: 9079
{'batch_size': 100, 'num_workers': 0, 'loader_mode': 'cycle', 'datasets_name': ['HEK', 'K562']}
loader_mode: cycle
datasets_name: ['HEK', 'K562']
loss_type: CEloss
datasets_name_lst: ['HEK', 'K562']
using separate attention layers!!
[[(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_HEK'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_HEK')], [(FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_init_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'local_featemb_mut_attn_K562'), (FeatureEmbAttention(
  (softmax): Softmax(dim=1)
), 'global_featemb_mut_attn_K562')]]
u

100%|██████████| 46/46 [00:08<00:00,  5.41it/s]


__perfmetric_report_multidata_cont__
# of rows: 9054
updated # of rows: 9054


### Models' evaluation

In [18]:
# Display the performance tables per split, outcome and dataset.
# `tr_val_dir` is the loop variable left bound by the preceding cells, i.e.
# the last experiment directory.
dsetnames = dsetnames_lst
split_names = ('train', 'validation', 'test')
outcome_names = ['averageedited', 'averageunedited', 'averageindel']
for dsettype in split_names:
    print(f'--- {dsettype} ---')
    for outcome_name in outcome_names:
        # NOTE(review): the literal 5 is presumably the number of runs to
        # aggregate (the tables show run_0..run_4) — confirm.
        out = build_performance_multidata_dfs(tr_val_dir, 5, dsettype, outcome_name, 'continuous', dsetnames)
        # One table per dataset, in `dsetnames` order.
        for i_data in range(len(dsetnames)):
            display(out[i_data])

    print('*'*15)

--- train ---


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_HEK_averageedited,0.898341,0.900508,0.896103,0.897903,0.899286,0.898428,0.898341,0.001639
pearson_corr_HEK_averageedited,0.890738,0.892372,0.891015,0.8903,0.891213,0.891128,0.891015,0.000776


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_K562_averageedited,0.75652,0.752396,0.752962,0.750411,0.755048,0.753467,0.752962,0.002375
pearson_corr_K562_averageedited,0.618464,0.608165,0.617701,0.611258,0.601728,0.611463,0.611258,0.006957


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_HEK_averageunedited,0.880484,0.881336,0.878566,0.880294,0.880787,0.880294,0.880484,0.001043
pearson_corr_HEK_averageunedited,0.885028,0.886863,0.88528,0.88509,0.886392,0.885731,0.88528,0.000841


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_K562_averageunedited,0.668958,0.667247,0.664571,0.669594,0.665234,0.667121,0.667247,0.002212
pearson_corr_K562_averageunedited,0.614942,0.604898,0.61329,0.609621,0.601294,0.608809,0.609621,0.005704


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_HEK_averageindel,0.2757,0.275885,0.277205,0.273657,0.272809,0.275051,0.2757,0.001784
pearson_corr_HEK_averageindel,0.255856,0.26614,0.278226,0.269608,0.248883,0.263742,0.26614,0.011543


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_K562_averageindel,0.231857,0.234924,0.228951,0.232507,0.230886,0.231825,0.231857,0.002191
pearson_corr_K562_averageindel,0.347141,0.371359,0.358647,0.370484,0.321159,0.353758,0.358647,0.020736


***************
--- validation ---


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_HEK_averageedited,0.900594,0.902125,0.910504,0.910543,0.904847,0.905723,0.904847,0.00464
pearson_corr_HEK_averageedited,0.894112,0.890715,0.901314,0.891432,0.890633,0.893641,0.891432,0.004517


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_K562_averageedited,0.754306,0.767472,0.780294,0.782539,0.775489,0.77202,0.775489,0.011463
pearson_corr_K562_averageedited,0.584737,0.646825,0.622521,0.612337,0.682693,0.629822,0.622521,0.037


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_HEK_averageunedited,0.874954,0.890119,0.896389,0.890129,0.883937,0.887106,0.890119,0.008095
pearson_corr_HEK_averageunedited,0.885184,0.889394,0.893056,0.88476,0.881846,0.886848,0.885184,0.004393


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_K562_averageunedited,0.658835,0.692245,0.676786,0.682347,0.693842,0.680811,0.682347,0.01416
pearson_corr_K562_averageunedited,0.578394,0.64488,0.620047,0.612094,0.674007,0.625885,0.620047,0.035904


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_HEK_averageindel,0.261509,0.304135,0.259856,0.318512,0.30408,0.289618,0.30408,0.027068
pearson_corr_HEK_averageindel,0.238272,0.243568,0.243509,0.253626,0.274238,0.250643,0.243568,0.014315


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_K562_averageindel,0.205323,0.252111,0.245431,0.296889,0.23285,0.246521,0.245431,0.033368
pearson_corr_K562_averageindel,0.425064,0.274382,0.310455,0.378761,0.35646,0.349024,0.35646,0.058696


***************
--- test ---


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_HEK_averageedited,0.905339,0.904053,0.898795,0.905389,0.90696,0.904107,0.905339,0.003143
pearson_corr_HEK_averageedited,0.897007,0.898176,0.884071,0.90099,0.902304,0.896509,0.898176,0.00727


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_K562_averageedited,0.780155,0.773526,0.774443,0.788912,0.788191,0.781046,0.780155,0.007312
pearson_corr_K562_averageedited,0.642186,0.631576,0.631518,0.643688,0.642651,0.638324,0.642186,0.00621


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_HEK_averageunedited,0.888964,0.88474,0.879959,0.883876,0.887197,0.884947,0.88474,0.003438
pearson_corr_HEK_averageunedited,0.892276,0.888572,0.880626,0.893647,0.895311,0.890087,0.892276,0.005843


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_K562_averageunedited,0.696549,0.685921,0.698279,0.683989,0.699329,0.692813,0.696549,0.007274
pearson_corr_K562_averageunedited,0.644663,0.628958,0.633357,0.637798,0.635306,0.636016,0.635306,0.005817


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_HEK_averageindel,0.30101,0.292837,0.286965,0.28906,0.291004,0.292175,0.291004,0.005402
pearson_corr_HEK_averageindel,0.273591,0.258513,0.268975,0.253349,0.351037,0.281093,0.268975,0.039921


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman_corr_K562_averageindel,0.283801,0.232526,0.275295,0.239439,0.25598,0.257408,0.25598,0.022139
pearson_corr_K562_averageindel,0.397798,0.320636,0.406359,0.326456,0.437412,0.377732,0.397798,0.051656


***************


In [19]:
# Print the stored `best_epoch.pkl` record of each of the 5 runs found under
# the last experiment directory.
tdir = tr_val_dir
for run in range(5):
    tlink = os.path.join(tdir, 'train_val', f'run_{run}', 'model_statedict', 'best_epoch.pkl')
    best_epoch_record = ReaderWriter.read_data(tlink)
    print(best_epoch_record)

{'epoch': 2}
{'epoch': 2}
{'epoch': 2}
{'epoch': 2}
{'epoch': 2}
