In [None]:
# Load IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the project sources are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Absolute path of the directory two levels above the current working directory
# (used as the repository root by the cells that follow).
repo_dir = os.path.abspath(os.path.join(os.pardir, os.pardir))
repo_dir

In [None]:
import sys
# Put `repo_dir` first on the module search path before importing `pridict`
# (presumably so the package is taken from this checkout rather than from an
# installed copy -- confirm).
sys.path.insert(0,repo_dir)
import pridict

In [None]:
# Path of the `dataset` folder under `repo_dir`; displayed as a quick check.
data_pth = os.path.join(repo_dir, 'dataset')
data_pth

In [None]:
# Shell escape: list the conda environments known on this machine.
!conda env list

In [None]:
from pridict.pridictv2.utilities import *
from pridict.pridictv2.dataset import *
from pridict.pridictv2.run_workflow import *


### Generate datatensor partitions

In [None]:
# Settings that determine which preprocessed files are read below.
wsize=20
outcome_suffix = 'clamped'
include_MFE = False
# `include_MFE` only selects the folder suffix used to build `data_dir`.
if include_MFE:
    fsuffix = 'withMFE'
else:
    fsuffix = 'withoutMFE'
    
tfolder = 'proc_v2'
# Path handed to `create_directory`: <repo_dir>/dataset/proc_v2/align_<fsuffix>
# (presumably the helper creates the folder when missing and returns its path -- confirm).
data_dir = create_directory(os.path.join(repo_dir, 'dataset', tfolder, f'align_{fsuffix}'))

# One entry per dataset, appended in the order HEK, K562.
dtensor_partitions_lst: list[dict[int, dict[str, PartitionDataTensor]]] = []
for outcome_name in ['HEK', 'K562']:
    # Partition definitions stored for this dataset.
    fname = f'dpartitions_{outcome_name}_{outcome_suffix}_wsize{wsize}.pkl'
    data_partitions =  ReaderWriter.read_data(os.path.join(data_dir, fname))
    # Data tensor stored for this dataset (`fname` is reused for the second file).
    fname = f'dtensor_{outcome_name}_{outcome_suffix}_wsize{wsize}.pkl'
    dtensor= ReaderWriter.read_data(os.path.join(data_dir, fname))
    # Combine the tensor with its partition definitions.
    dtensor_partitions = generate_partition_datatensor(dtensor, data_partitions)
    dtensor_partitions_lst.append(dtensor_partitions)
    
    
# Regroup by run: run number -> [entry from the HEK partitions, entry from the K562 partitions].
dtensor_partitions_multidata: dict[int, list[dict[str, PartitionDataTensor]]] = {}
# The number of runs is fixed to 5 here; each per-dataset dict must contain keys 0..4.
for run_num in range(5):
    dtensor_partitions_multidata[run_num] = []
    for dtensor_partitions in dtensor_partitions_lst:
        dtensor_partitions_multidata[run_num].append(dtensor_partitions[run_num])
# NOTE: the loop variables `data_partitions` and `dtensor_partitions` stay defined after this
# cell (holding the objects of the last dataset, K562); other cells may rely on them.

In [None]:
# Inspect the run-indexed structure (run number -> list with one partition dict per dataset).
dtensor_partitions_multidata

In [None]:
# Directory the partition and data tensor files were read from.
data_dir

In [None]:
# Partitions of the first dataset in the list (HEK), keyed by run number.
dtensor_partitions_lst[0]

In [None]:
# Number of CUDA devices PyTorch can see (relevant for the run -> GPU mapping).
torch.cuda.device_count()

### Define model and experiment configuration

We can assign a different GPU device index to each fold index (i.e. fold id), so that a separate model is trained and tested on each GPU device. For example, if we have 5 GPU devices, we can assign each one to a fold id and create the `run_gpu_map` dictionary as follows:
```python
run_gpu_map = {i:i for i in range(len(data_partitions))}

```
The `run_gpu_map` dictionary has keys referring to fold ids and values referring to the GPU device index where the model is trained.


In [None]:
## In the current setup we have one GPU device, hence the same device (index 0)
## is assigned to all fold ids.
## The fold ids are taken from `dtensor_partitions_multidata` (the object passed to
## the train/test workflows) instead of `len(data_partitions)`: `data_partitions`
## is only a leftover loop variable of the data-loading cell, so relying on it
## breaks as soon as that cell is refactored or the cells run in another order.
run_gpu_map = {run_num: 0 for run_num in dtensor_partitions_multidata}
run_gpu_map

### Example definition for model training workflow

In [None]:
# --- model / training hyperparameters (collected in `trf_tup` below) ---
embed_dim = 128
z_dim = 64
num_hidden_layers = 2
bidirection = True
p_dropout = 0.15
rnn_class = nn.GRU
nonlin_func = nn.ReLU()
l2_reg = 1e-5
batch_size = 750
num_epochs = 150
# loss_func = 'KLDloss'
loss_func = 'CEloss'
# The values are passed to `build_config_map` as one positional sequence;
# keep this order (presumably it is unpacked by position -- confirm).
trf_tup = [embed_dim, z_dim,
           num_hidden_layers,
           bidirection,
           p_dropout,
           rnn_class, nonlin_func,
           l2_reg, batch_size, num_epochs]

# Number of sequence-level feature columns, read from the run-0 training
# partition of the last loaded dataset. It is indexed through
# `dtensor_partitions_lst` explicitly: the original relied on the leftover loop
# variable `dtensor_partitions`, which refers to this same object only as long
# as the data-loading cell ran last and unchanged.
seqlevel_featdim = len(dtensor_partitions_lst[-1][0]['train'].pe_datatensor.seqlevel_feat_colnames)

default_outcomes = ['averageedited', 'averageunedited', 'averageindel']
num_t_outcomes = 3
experiment_options = {'experiment_desc':'pe_rnn_distribution_multidata',
                      'model_name':'PE_RNN_distribution_multidata',
                      'annot_embed':8,
                      'assemb_opt':'stack',
                      'loader_mode':'cycle',
                      'run_num':0,
                      'fdtype':torch.float32,
                      'wsize':wsize,
                      'datasets_name':['HEK', 'K562'],
                      # first `num_t_outcomes` entries of `default_outcomes`
                      'target_names': default_outcomes[:num_t_outcomes],
                      # two-element lists: presumably one entry per dataset -- confirm
                      'weight_func_pointers':[None, None],
                      'correctiontype_weights':[None, None],
                      'separate_attention_layers':True,
                      'separate_seqlevel_embedder':True,
                      'seqlevel_featdim': seqlevel_featdim,
                      'num_outcomes':num_t_outcomes}
mconfig, options = build_config_map(trf_tup, experiment_options, loss_func=loss_func)

In [None]:
import datetime
repo_path = repo_dir
experiment_desc = experiment_options['experiment_desc']
# Folder for this experiment type: <repo_path>/experiments/<experiment_desc>
exp_dir = create_directory(os.path.join(repo_path, 'experiments', experiment_desc))
# Timestamp with one-second resolution, used to name the folder of this particular execution.
time_stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# NOTE(review): two-argument form of `create_directory` -- presumably
# (folder name, parent directory); confirm against the helper's signature.
tr_val_dir = create_directory(f'exp_{time_stamp}', exp_dir)
tr_val_dir

In [None]:
def compute_performance(tr_val_dir, dset_names, num_runs, wsize=20):
    """Collect per-run test correlation scores into one long-format DataFrame.

    For every run in ``range(num_runs)`` the file
    ``<tr_val_dir>/test/run_<run_num>/predictions_test.csv`` is read and scored
    with ``compute_performance_multidata_from_df``; one row is emitted per
    (dataset, outcome) pair.

    Args:
        tr_val_dir: experiment directory containing the ``test/run_<k>`` folders.
        dset_names: dataset names; the score object of each dataset is looked up
            by position in ``mscore.modelscores_lst``.
        num_runs: number of runs to read.
        wsize: value copied verbatim into the ``wsize`` column of every row.

    Returns:
        pandas.DataFrame with the columns ``wsize``, ``run_num``, ``pear_score``,
        ``spearman_score``, ``outcome_name`` and ``dsetname``. The columns are
        present even when no rows were collected (e.g. ``num_runs == 0``).
    """
    colnames = ['wsize', 'run_num', 'pear_score', 'spearman_score', 'outcome_name', 'dsetname']
    outcome_names = ['averageedited', 'averageunedited', 'averageindel']
    res_lst = []

    for run_num in range(num_runs):
        pred_path = os.path.join(tr_val_dir, 'test', f'run_{run_num}', 'predictions_test.csv')
        pred_df = pd.read_csv(pred_path)
        # the second return value (report) is not needed here
        mscore, _ = compute_performance_multidata_from_df(pred_df, dset_names)
        for i_data, dsetname in enumerate(dset_names):
            m = mscore.modelscores_lst[i_data]
            # score lists are indexed by outcome position
            for tindx, tcol in enumerate(outcome_names):
                res_lst.append((wsize, run_num,
                                m.pearson_lst[tindx], m.spearman_lst[tindx],
                                tcol, dsetname))

    # Passing `columns=` to the constructor (instead of assigning `.columns`
    # afterwards) keeps an empty result valid: the assignment raises a length
    # mismatch error on a frame built from an empty list.
    return pd.DataFrame(res_lst, columns=colnames)


### Train/test models on the 5 folds

In [None]:
# Bundle the model configuration and the options into the single `config_map`
# argument expected by the workflow calls.
config_map = (mconfig, options)
# Start the train/validation workflow; `run_gpu_map` gives the GPU device index per fold id
# and `tr_val_dir` is the timestamped experiment folder created earlier.
train_val_multidata_run(dtensor_partitions_multidata,
                        config_map,
                        tr_val_dir, 
                        run_gpu_map, 
                        num_epochs=num_epochs) # change num_epochs if you want to do a `dry test` (i.e. fast check)

In [None]:
# Run the test workflow with a single epoch.
# NOTE(review): `tr_val_dir` is passed twice -- presumably once as the location of the
# trained model states and once as the output directory; confirm against the
# signature of `test_multidata_run`.
test_multidata_run(dtensor_partitions_multidata,
                   config_map, 
                   tr_val_dir, 
                   tr_val_dir, 
                   run_gpu_map, 
                   num_epochs=1)

### Evaluate trained models' performance

In [None]:
# Take dataset names, number of runs and window size from the notebook's own
# configuration instead of repeating the literals, so the table cannot drift
# from the settings used for training (e.g. when `wsize` is changed).
compute_performance(tr_val_dir,
                    experiment_options['datasets_name'],
                    len(dtensor_partitions_multidata),
                    wsize=wsize)

In [None]:
# For every data split and every outcome, build the performance objects over
# the 5 runs and display one of them per dataset.
dsetnames = ['HEK', 'K562']
separator = '*' * 15
for dsettype in ('train', 'validation', 'test'):
    print(f'--- {dsettype} ---')
    for outcome_name in ('averageedited', 'averageunedited', 'averageindel'):
        out = build_performance_multidata_dfs(
            tr_val_dir, 5, dsettype, outcome_name, 'continuous', dsetnames
        )
        # `out` is indexed by dataset position, following the order of `dsetnames`
        for i_data, dsetname in enumerate(dsetnames):
            display(out[i_data])

    print(separator)

### Identify the epoch number at which the model (saved state) achieved its best performance on the validation set

In [None]:
# For each of the 5 runs, read the `best_epoch.pkl` file stored next to the
# run's model state dict and print its content.
tdir = tr_val_dir
for run in range(5):
    run_dir = os.path.join(tdir, 'train_val', f'run_{run}')
    tlink = os.path.join(run_dir, 'model_statedict', 'best_epoch.pkl')
    best_epoch = ReaderWriter.read_data(tlink)
    print(best_epoch)