In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Absolute path of the directory one level above the current working directory.
repo_dir = os.path.abspath(os.pardir)
repo_dir

In [None]:
from matplotlib import pyplot as plt

In [None]:
import sys
# Put repo_dir at the front of the module search path before importing
# `pridict` (presumably so the package inside the repository checkout is the
# one that gets imported -- confirm if another copy is installed).
sys.path.insert(0,repo_dir)
import pridict

In [None]:
# Path of the 'dataset' folder directly under repo_dir (echoed as the cell output).
data_pth = os.path.join(repo_dir, 'dataset')
data_pth

In [None]:
# Path of the 'input' folder directly under repo_dir; the test CSV is read from here.
input_pth = os.path.join(repo_dir, 'input')
input_pth

In [None]:
from pridict.pridictv2.utilities import *
from pridict.pridictv2.dataset import *
from pridict.pridictv2.predict_outcomedistrib import *

### Running on 29k dataset

In [None]:

# Read the library batch file from the input folder and store the
# `deepeditposition_lst` column as strings.
batch_fname = '20240113_librarydiv_df_batchfile_with_adapted_wide_initial_target_with_HEKaverageedited.csv'
test_df = pd.read_csv(os.path.join(input_pth, batch_fname))
test_df = test_df.astype({'deepeditposition_lst': 'str'})

In [None]:
# Histogram of the 'AdVaverageedited' column of the test set.
test_df['AdVaverageedited'].hist()

In [None]:
# Histogram of the 'HEKaverageedited' column of the test set
# (the column later passed as the reference outcome, see y_ref_colnames).
test_df['HEKaverageedited'].hist()

In [None]:
def compute_performance(pred_df, dset_names, run_num, model_id, correction_type='all', wsize=20, outcome_names=['averageedited', 'averageunedited', 'averageindel']):
    """Flatten per-dataset correlation scores into one result row per (dataset, outcome).

    Scores are obtained from `compute_performance_multidata_from_df`; for the
    i-th dataset name the i-th entry of the returned `modelscores_lst` is read,
    and for the k-th outcome name the k-th entries of its `pearson_lst` and
    `spearman_lst`.

    Parameters
    ----------
    pred_df : prediction dataframe forwarded unchanged to the scoring helper.
    dset_names : sequence of dataset names, in the order of `modelscores_lst`.
    run_num, model_id, correction_type, wsize : values copied verbatim into
        every returned row.
    outcome_names : list of outcome names to report (not modified here).

    Returns
    -------
    list of lists, each laid out as
    [model_id, wsize, run_num, pearson, spearman, outcome, dataset, correction_type].
    """
    mscore, _ = compute_performance_multidata_from_df(pred_df, dset_names, outcome_names)
    rows = []
    for dset_pos, dset in enumerate(dset_names):
        dset_scores = mscore.modelscores_lst[dset_pos]
        rows.extend(
            [model_id, wsize, run_num,
             dset_scores.pearson_lst[out_pos], dset_scores.spearman_lst[out_pos],
             outcome, dset, correction_type]
            for out_pos, outcome in enumerate(outcome_names)
        )
    return rows

In [None]:
def compute_modelruns_avg_predictions(pred_df_lst, grouping_colnames=['seq_id', 'dataset_name']):
    """Average the predictions of several model runs within each group.

    Parameters
    ----------
    pred_df_lst : iterable of pandas.DataFrame
        One prediction frame per run; the frames are stacked row-wise.
    grouping_colnames : list of str
        Columns that identify a group (passed to `groupby`; not modified here).

    Returns
    -------
    pandas.DataFrame
        One row per group: the grouping columns as regular columns followed by
        the mean of every numeric column. The bookkeeping columns 'run_num'
        and 'Unnamed: 0' are removed when present.

    Raises
    ------
    ValueError
        If `pred_df_lst` is empty.
    """
    pred_df_lst = list(pred_df_lst)
    if not pred_df_lst:
        raise ValueError('pred_df_lst must contain at least one prediction dataframe')

    pred_df_allruns = pd.concat(pred_df_lst, axis=0, ignore_index=True)
    # numeric_only=True: non-numeric columns cannot be averaged. pandas >= 2.0
    # raises a TypeError for them by default, whereas older versions dropped
    # them silently -- request the dropping behaviour explicitly.
    agg_df = (pred_df_allruns
              .groupby(by=grouping_colnames)
              .mean(numeric_only=True)
              .reset_index())
    # per-run bookkeeping columns are meaningless once runs are averaged
    return agg_df.drop(columns=['run_num', 'Unnamed: 0'], errors='ignore')

### Evaluate all trained models

In [None]:
# Evaluate every trained model on test_df: score each run separately, then
# score the predictions averaged over all runs of a model, and dump the table.
device = get_device(True,0)

include_MFE=False
include_addendumfeat=False

# reference outcome column handed to prepare_data / outcome columns requested at prediction time
y_ref_colnames = ['HEKaverageedited']
y_pred_colnames = ['averageedited', 'averageunedited']

res_lst = []

topfname = 'pridict_inference_29k_basemodels'

# updated retrained new models
dumpfname = 'newmodels_all'

# each entry: (model_id, model_type, experiment folder, cell types, number of runs)
models = [('base_90k', 'pe_rnn_kldiv', 'exp_2023-06-02_09-49-21', ['HEK'], 5),
          ('base_390k', 'pe_rnn_distribution_multidata', 'exp_2023-08-26_20-58-14',['HEKschwank', 'HEKhyongbum'], 5)
         ]

# column layout of the rows produced by compute_performance
res_colnames = ['model_id', 'wsize', 'run_num', 'pear_score', 'spearman_score',
                'outcome_name', 'cell_type', 'correction_type']

for model in models:
    model_id, model_type, mfolder, cell_types, num_runs = model
    # Both values depend only on the model, so compute them once per model
    # instead of once per run; `mid` is also needed after the run loop.
    mid = f"{model_id}_{mfolder}"
    model_dir = os.path.join(repo_dir,
                             'trained_models',
                             model_id,
                             mfolder,
                             'train_val')

    for wsize in [20]:
        prieml_model = PRIEML_Model(device,
                                    wsize=wsize,
                                    normalize='max',
                                    include_MFE=include_MFE,
                                    include_addendumfeat=include_addendumfeat,
                                    fdtype=torch.float32)

        dloader = prieml_model.prepare_data(test_df,
                                            model_id,
                                            cell_types=cell_types,
                                            y_ref=y_ref_colnames,
                                            batch_size=1500)
        pred_df_lst = []
        for run_num in range(num_runs):
            mdir = os.path.join(model_dir, f'run_{run_num}')
            print(mdir)
            pred_df = prieml_model.predict_from_dloader(dloader, mdir, y_ref=y_pred_colnames)
            pred_df['run_num'] = run_num

            # drop rows without a reference value for the edited outcome
            pred_df = pred_df.dropna(axis=0, subset= ['true_averageedited'])
            pred_df_lst.append(pred_df)

            # performance of this single run
            res = compute_performance(pred_df,
                                      cell_types,
                                      run_num,
                                      mid,
                                      wsize=wsize,
                                      outcome_names=['averageedited'])
            res_lst.extend(res)

        # compute average prediction from multiple runs
        agg_df = compute_modelruns_avg_predictions(pred_df_lst)
        res = compute_performance(agg_df,
                                  cell_types,
                                  'avg_run',
                                  mid,
                                  wsize=wsize,
                                  outcome_names=['averageedited'])
        res_lst.extend(res)

dump_dir = create_directory(os.path.join(repo_dir,
                                         'experiments',
                                         topfname,
                                         dumpfname))
# Passing the column names to the constructor (instead of assigning
# `res_df.columns` afterwards) also works when res_lst is empty.
res_df = pd.DataFrame(res_lst, columns=res_colnames)
res_df.to_csv(os.path.join(dump_dir, 'res_df.csv'), index=False)

In [None]:
# Full score table: per-run rows plus the 'avg_run' rows of the run-averaged predictions.
res_df

In [None]:
# Scores of the individual runs only (the 'avg_run' rows are excluded).
res_df.loc[res_df['run_num'] != 'avg_run']

In [None]:
# Scores of the predictions averaged over runs (the 'avg_run' rows).
res_df.loc[res_df['run_num'] == 'avg_run']

In [None]:
# Mean of the per-run Pearson/Spearman scores for every configuration;
# the 'avg_run' rows are left out of the average.
cond = res_df['run_num'] != 'avg_run'
config_cols = ['model_id', 'wsize', 'outcome_name', 'cell_type', 'correction_type']
score_cols = ['pear_score', 'spearman_score']
per_run_scores = res_df.loc[cond]
per_run_scores.groupby(by=config_cols)[score_cols].mean()

### Visualizing sequences

In [None]:
from IPython.core.display import HTML

In [None]:
# Label every row of test_df with a positional identifier: seq_0, seq_1, ...
n_seqs = test_df.shape[0]
test_df['seq_id'] = ["seq_" + str(row_pos) for row_pos in range(n_seqs)]

In [None]:
# choose ids of sequences to visualize
# we are using seq_id as the main column to filter from
# NOTE(review): `prieml_model` is whichever instance the evaluation loop
# created last -- confirm that this is the model meant to be visualized.
viz_res = prieml_model.visualize_seqs(test_df, ['seq_10','seq_50'])
# viz_res is iterated and indexed with the iterated element (presumably a
# dict keyed by seq_id -- verify); each looked-up value is rendered as HTML.
# NOTE(review): `display` is not imported in the cells shown here; it relies
# on the IPython kernel providing it.
for kelm in viz_res:
    display(HTML(viz_res[kelm]))