In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Absolute path of the directory two levels above the current working
# directory; used below as the repository directory.
repo_dir = os.path.abspath('../../')
repo_dir

In [None]:
# Put the repository directory at the front of the module search path before
# importing `pridict`, so the import can resolve against this checkout.
import sys
sys.path.insert(0,repo_dir)
import pridict

In [None]:
# Path of the `dataset` folder inside the repository directory.
data_pth = os.path.join(repo_dir, 'dataset')
data_pth

In [None]:
from pridict.pridictv2.utilities import *
from pridict.pridictv2.dataset import *
from pridict.pridictv2.run_workflow import *

In [None]:
# Presumably provided by the pridict star imports; judging by its name it
# reports the CUDA devices visible to this kernel (TODO confirm).
report_available_cuda_devices()

### Generate datatensor partitions

In [None]:
# --- which processed data to load ---
wsize = 20
outcome_suffix = 'clamped'
include_MFE = False
fsuffix = 'withMFE' if include_MFE else 'withoutMFE'

# use these to tune on 23K library HEK, K562
tfolder_name = 'proc_v2'
dsetnames_lst = ['HEK', 'K562']

data_dir = create_directory(os.path.join(repo_dir, 'dataset', tfolder_name, f'align_{fsuffix}'))

# One entry per dataset, in the order of `dsetnames_lst`; each entry maps a
# run number to that run's partition data tensors.
# NOTE: `data_partitions` and `dtensor_partitions` keep their last-iteration
# values after the loop and are read by later cells, so they stay plain
# loop-body assignments here.
dtensor_partitions_lst: list[dict[int, dict[str, PartitionDataTensor]]] = []
for outcome_name in dsetnames_lst:
    fname = f'dpartitions_{outcome_name}_{outcome_suffix}_wsize{wsize}.pkl'
    data_partitions = ReaderWriter.read_data(os.path.join(data_dir, fname))
    fname = f'dtensor_{outcome_name}_{outcome_suffix}_wsize{wsize}.pkl'
    dtensor = ReaderWriter.read_data(os.path.join(data_dir, fname))
    dtensor_partitions = generate_partition_datatensor(dtensor, data_partitions)
    dtensor_partitions_lst.append(dtensor_partitions)

# Regroup by run: run number -> list holding one partition dict per dataset.
# The notebook works with 5 runs (folds), numbered 0..4.
dtensor_partitions_multidata: dict[int, list[dict[str, PartitionDataTensor]]] = {}
for run_num in range(5):
    dtensor_partitions_multidata[run_num] = [per_dataset[run_num] for per_dataset in dtensor_partitions_lst]

In [None]:
# Inspect the regrouped structure (run number -> list of per-dataset partitions).
dtensor_partitions_multidata

### Define model and experiment configuration

In [None]:
# Map every run number to the value 0 (presumably the GPU device index that
# run should use -- TODO confirm against the training helpers).
# The keys come from `dtensor_partitions_multidata`, the structure this map is
# passed alongside to the training/testing calls, rather than from the
# `data_partitions` loop variable left over from the data-loading cell.
run_gpu_map = {run_num: 0 for run_num in dtensor_partitions_multidata}
run_gpu_map

### Specify which layers to finetune 

In [None]:
# Names of the layers to fine-tune: every layer prefix below is combined with
# every dataset name as `<prefix>_<dataset>`, grouped dataset by dataset.
trainable_layernames = [
    f'{lname}_{dsetname}'
    for dsetname in dsetnames_lst
    for lname in ('seqlevel_featembeder',
                  'decoder',
                  'global_featemb_init_attn',
                  'global_featemb_mut_attn',
                  'local_featemb_init_attn',
                  'local_featemb_mut_attn')
]

trainable_layernames

### Finetuning base models on 23K (HEK and K562) Library

#### Use a base model pre-trained on Library 1 data (Mathis et al.) - see Figure 1 n in the paper

In [None]:
# --- hyperparameters collected into `trf_tup` below ---
embed_dim = 128
z_dim = 72
num_hidden_layers = 2
bidirection = True
p_dropout = 0.15
rnn_class = nn.GRU
nonlin_func = nn.ReLU()
l2_reg = 1e-5
batch_size = 750
num_epochs = 150  # also passed explicitly to the fine-tuning call later on

# loss to optimise; alternatives kept from the original notebook:
# loss_func = 'KLDloss'
# loss_func = 'Huberloss'
loss_func = 'CEloss'

# Hyperparameters in the order handed to `build_config_map`
# (presumably consumed positionally -- do not reorder without checking).
trf_tup = [embed_dim, z_dim,
           num_hidden_layers,
           bidirection,
           p_dropout,
           rnn_class, nonlin_func,
           l2_reg, batch_size, num_epochs]

# Number of sequence-level feature columns, read from the run-0 training
# partition of the last dataset in `dtensor_partitions_lst`. This is the same
# object the leftover `dtensor_partitions` loop variable points to when the
# notebook is run top to bottom, but indexing the list explicitly avoids
# depending on a loop variable leaked from another cell.
seqlevel_featdim = len(dtensor_partitions_lst[-1][0]['train'].pe_datatensor.seqlevel_feat_colnames)
num_t_outcomes = 3
default_outcomes = ['averageedited', 'averageunedited', 'averageindel']

experiment_options = {'experiment_desc':'pe_rnn_distribution_multidata',
                      'model_name':'PE_RNN_distribution_multidata',
                      'annot_embed':8,
                      'assemb_opt':'stack',
                      'loader_mode':'cycle',
                      'run_num':0,
                      'fdtype':torch.float32,
                      'wsize':wsize,
                      'datasets_name':dsetnames_lst,
                      # first `num_t_outcomes` entries of `default_outcomes`
                      'target_names': default_outcomes[:num_t_outcomes],
                      'base_model_suffix':None,
                      'separate_attention_layers':True,
                      'separate_seqlevel_embedder':True,
                      'seqlevel_featdim': seqlevel_featdim,
                      'trainable_layernames': trainable_layernames,
                      'num_outcomes':num_t_outcomes}
mconfig, options = build_config_map(trf_tup, experiment_options, loss_func=loss_func)

# provide the base model that will be used to fine-tune on the data
# we will use base_90k (pretrained on Library 1) to finetune by specifying the folder name where the trained base model is found
mfolder = 'exp_2023-06-02_09-49-21' # base_90k
model_type = 'base_90k'
trun = 1 # given that we have 5-fold training of base model we can specify which run to use

#### Use a base model pre-trained on Library 1 and Library-ClinVar data (Mathis et al. and Yu et al.) - see Figure 1 n in the paper

In [None]:
# #######
# ## uncomment this cell to use this configuration to finetune 390k base model
# #######

# ## 390k model retuning
# embed_dim = 128
# z_dim = 72
# num_hidden_layers = 2
# bidirection=True
# p_dropout = 0.15
# rnn_class = nn.GRU
# nonlin_func = nn.ReLU()
# l2_reg = 1e-5
# batch_size = 750
# num_epochs = 150
# # loss_func = 'KLDloss'
# loss_func = 'CEloss'
# # loss_func = 'Huberloss'
# trf_tup = [embed_dim, z_dim,
#            num_hidden_layers,
#            bidirection, 
#            p_dropout,
#            rnn_class, nonlin_func,
#            l2_reg, batch_size, num_epochs]
# seqlevel_featdim = len(dtensor_partitions[0]['train'].pe_datatensor.seqlevel_feat_colnames)
# default_outcomes = ['averageedited', 'averageunedited', 'averageindel']
# num_t_outcomes = 3
# experiment_options = {'experiment_desc':'pe_rnn_distribution_multidata',
#                       'model_name':'PE_RNN_distribution_multidata',
#                       'annot_embed':8,
#                       'assemb_opt':'stack',
#                       'loader_mode':'cycle',
#                       'run_num':0,
#                       'fdtype':torch.float32,
#                       'wsize':wsize,
#                       'datasets_name':dsetnames_lst,
#                       'target_names': default_outcomes[:num_t_outcomes],
#                       'base_model_suffix':'HEKschwank',
#                       'separate_attention_layers':True,
#                       'separate_seqlevel_embedder':True,
#                       'seqlevel_featdim': seqlevel_featdim,
#                       'trainable_layernames': trainable_layernames,
#                       'num_outcomes':num_t_outcomes}
# mconfig, options = build_config_map(trf_tup, experiment_options, loss_func=loss_func)
# ## the base model 390k to finetune
# mfolder = 'exp_2023-08-26_20-58-14' # folder name where pretrained model is found
# model_type = 'base_390k'
# trun = 1


In [None]:
import datetime

# `create_directory` is called on the parent of the working directory and on
# the per-experiment folder `<repo_dir>/experiments/<experiment_desc>`.
repo_path = create_directory(os.path.abspath('../'))
experiment_desc = experiment_options['experiment_desc']
exp_dir = create_directory(os.path.join(repo_dir, 'experiments', experiment_desc))
exp_dir

### Run training/fine-tuning on the 5 folds


In [None]:
# Base-model choices (`mfolder`, `model_type`, `trun` are set in the
# configuration cells above); kept here for quick reference:
#   mfolder = 'exp_2023-06-02_09-49-21', model_type = 'base_90k',  trun = 1
#   mfolder = 'exp_2023-08-26_20-58-14', model_type = 'base_390k', trun = 1

# Folder holding the pre-trained base model runs.
trained_basemodel_dir = os.path.join(repo_dir, 'trained_models', model_type, mfolder, 'train_val')
config_map = (mconfig, options)

# Output directories of the fine-tuning runs started below; read again by the
# test cell.
trmodels_dir_lst = []
for base_model_run in [trun]:
    # a fresh, time-stamped output folder under `exp_dir` for this fine-tuning run
    time_stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    tr_val_dir = create_directory(f'exp_{time_stamp}', exp_dir)
    trmodels_dir_lst.append(tr_val_dir)

    # state-dict folder of the selected base-model run
    state_dict_dir = os.path.join(trained_basemodel_dir, f'run_{base_model_run}', 'model_statedict')

    print('basemodel_run:', base_model_run)
    print('state_dict_dir:', state_dict_dir)
    print('tr_val_dir:', tr_val_dir)
    print()

    # change num_epochs if you want to do a `dry test` (i.e. fast check)
    tune_trainval_run(dtensor_partitions_multidata,
                      config_map,
                      tr_val_dir,
                      state_dict_dir,
                      run_gpu_map,
                      num_epochs=num_epochs)

### Run trained models on the test set of each fold

In [None]:
# Run the test pass for every fine-tuned model directory recorded in
# `trmodels_dir_lst`.
config_map = (mconfig, options)
for tr_val_dir in trmodels_dir_lst:
    print('evaluating modeldir:', tr_val_dir)
    # `tr_val_dir` is deliberately passed twice -- presumably once as the
    # location of the trained model and once as the output location
    # (TODO confirm against `test_multidata_run`).
    test_multidata_run(dtensor_partitions_multidata,
                       config_map,
                       tr_val_dir,
                       tr_val_dir,
                       run_gpu_map,
                       num_epochs=1)

### Models' evaluation

In [None]:
# Show the performance tables of the most recently processed model directory
# (`tr_val_dir` is the variable left over from the training/testing loops).
dsetnames = dsetnames_lst
for dsettype in ('train', 'validation', 'test'):
    print(f'--- {dsettype} ---')
    for outcome_name in ['averageedited', 'averageunedited', 'averageindel']:
        # the literal 5 is presumably the number of runs/folds (TODO confirm
        # against `build_performance_multidata_dfs`)
        out = build_performance_multidata_dfs(tr_val_dir, 5, dsettype, outcome_name, 'continuous', dsetnames)
        for i_data, dsetname in enumerate(dsetnames):
            # label each table so the dataset/outcome it belongs to is visible
            # in the rendered output
            print(f'{dsetname} | {outcome_name}')
            display(out[i_data])

    print('*'*15)