In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd
import torch
import argparse
from models.data_process import get_datatensor_partitions, prepare_nonproto_features, generate_partition_datatensor
from models.dataset import ProtospacerDataset, ProtospacerExtendedDataset
from src.utils import create_directory, one_hot_encode, get_device, ReaderWriter, print_eval_results
import matplotlib.pyplot as plt

In [3]:
# Notebook configuration expressed as command-line style options.
# Each entry is (flag, keyword arguments for add_argument); entries are
# registered in the order listed.
_arg_specs = [
    ('-model_name', dict(type=str, help='name of the model')),
    ('-exp_name', dict(type=str, help='name of the experiment')),
    ('-data_dir', dict(type=str, default='./data/', help='directory of the data')),
    ('-target_dir', dict(type=str, default='processed', help='folder name to save the processed data')),
    ('-working_dir', dict(type=str, default='./', help='the main working directory')),
    ('-output_path', dict(type=str, help='path to save the trained model')),
    ('-model_path', dict(type=str, help='path to trained model')),
    ('-random_seed', dict(type=int, default=42)),
    ('-epoch_num', dict(type=int, default=200, help='number of training epochs')),
]

cmd_opt = argparse.ArgumentParser(description='Argparser for data')
for _flag, _options in _arg_specs:
    cmd_opt.add_argument(_flag, **_options)

# parse_known_args returns unrecognised argv entries as a second value
# (discarded here) instead of aborting on them.
args, _ = cmd_opt.parse_known_args()

In [4]:
def get_data_ready(args, normalize_opt = 'max', train_size=0.9, fdtype=torch.float32):
    """Load the processed dataset and build the per-partition data tensors.

    Reads ``data_partitions.pkl`` and ``list_of_x_f_y.pkl`` from the folder
    ``args.data_dir``/``args.target_dir``, prepares the protospacer input in
    the form selected by ``args.model_name``, picks the non-protospacer
    features according to ``args.exp_name`` and delegates to
    ``get_datatensor_partitions``.

    Args:
        args: namespace providing ``data_dir``, ``target_dir``, ``model_name``,
            ``exp_name`` and ``random_seed``.
        normalize_opt: forwarded to ``prepare_nonproto_features``.
        train_size: forwarded to ``get_datatensor_partitions``.
        fdtype: forwarded to ``get_datatensor_partitions``.

    Returns:
        The ``(dpartitions, datatensor_partitions)`` pair returned by
        ``get_datatensor_partitions``.

    Raises:
        ValueError: if ``args.model_name`` or ``args.exp_name`` is not one of
            the supported values (including ``None`` when it was never set).
    """
    onehot_models = {'CNN', 'FFN'}
    sequence_models = {'Transformer', 'RNN'}
    supported_exps = {'protospacer', 'protospacer_extended'}

    # Validate up front: previously an unknown value left a local variable
    # unbound and surfaced as an UnboundLocalError after the data was loaded.
    if args.model_name not in onehot_models and args.model_name not in sequence_models:
        raise ValueError(f'unsupported model_name: {args.model_name!r}; '
                         f'expected one of {sorted(onehot_models | sequence_models)}')
    if args.exp_name not in supported_exps:
        raise ValueError(f'unsupported exp_name: {args.exp_name!r}; '
                         f'expected one of {sorted(supported_exps)}')

    ## prepare the data
    # os.path.join keeps the path valid whether or not data_dir ends with a separator
    data_dir = os.path.join(args.data_dir, args.target_dir)
    data_partitions = ReaderWriter.read_data(os.path.join(data_dir, 'data_partitions.pkl'))
    data = ReaderWriter.read_data(os.path.join(data_dir, 'list_of_x_f_y.pkl'))
    # the second element (extended features) is not used by this function
    x_protospacer, _, x_non_protos_f, y = data

    if args.model_name in onehot_models:
        ## onehot-encode the protospacer features and flatten each example to one row
        proc_x_protospacer = one_hot_encode(x_protospacer)
        proc_x_protospacer = proc_x_protospacer.reshape(proc_x_protospacer.shape[0], -1)
    else:
        # 'Transformer' / 'RNN' receive the protospacer input unchanged
        proc_x_protospacer = x_protospacer

    # Still called for both experiments, as before; only the normalized
    # features (second return value) are used.
    _, x_non_protos_f_norm = prepare_nonproto_features(x_non_protos_f, normalize_opt)

    # 'protospacer' runs without the additional non-protospacer features
    x_non_protos_features = x_non_protos_f_norm if args.exp_name == 'protospacer_extended' else None

    dpartitions, datatensor_partitions = get_datatensor_partitions(data_partitions,
                                                                   args.model_name,
                                                                   proc_x_protospacer,
                                                                   y,
                                                                   x_non_protos_features,
                                                                   fdtype=fdtype,
                                                                   train_size=train_size,
                                                                   random_state=args.random_seed)
    return dpartitions, datatensor_partitions

### Run trained models and evaluate performance on test set

In [6]:
from models.trainval_workflow import run_inference
from src.utils import compute_eval_results_df

In [7]:
# Run inference with every saved model / experiment combination and collect
# the evaluation tables in res_desc[model_name][exp_name].
gpu_index = 0  # passed to run_inference below
res_desc = {}  # model_name -> exp_name -> value returned by compute_eval_results_df
version=2  # suffix of the per-model output folder: '<model_name>_v<version>'
for model_name in ['FFN', 'CNN', 'RNN', 'Transformer']:
    # get_data_ready reads the model/experiment names from the shared `args`,
    # so they must be set before each call.
    args.model_name =  model_name# one of {'FFN', 'CNN', 'RNN', 'Transformer'}
    res_desc[model_name] = {}
    for exp_name in ['protospacer', 'protospacer_extended']:
        args.exp_name = exp_name
        # <working_dir>/output/<model_name>_v<version>/<exp_name>
        model_path = os.path.join(args.working_dir, 
                                  'output', 
                                  f'{model_name}_v{version}',
                                  exp_name)
        dpartitions, datatensor_partitions = get_data_ready(args, 
                                                            normalize_opt='max',
                                                            train_size=0.9, 
                                                            fdtype=torch.float32)

        train_val_path = os.path.join(model_path, 'train_val')
        test_path = os.path.join(model_path, 'test')
        
        print(f'Running model: {model_name}, exp_name: {exp_name}, saved at {train_val_path}')
        # The returned pair (a, b) is not used in the visible cells.
        # NOTE(review): the recorded output prints 'cpu' even though to_gpu=True is
        # requested -- presumably the device selection falls back to CPU; confirm.
        a, b = run_inference(datatensor_partitions, 
                             train_val_path, 
                             test_path, 
                             gpu_index, 
                             to_gpu=True)
        print('='*15)
        # Second argument is the number of partitions; presumably one result
        # column (run_0, run_1, ...) is produced per partition -- TODO confirm.
        res_desc[model_name][exp_name] = compute_eval_results_df(test_path, len(dpartitions))        

--- max normalization ---
Running model: FFN, exp_name: protospacer, saved at ./output/FFN_v2/protospacer/train_val
cpu
test
model_name: FFN
xxxxxxxxxxxxxxxxxxxxxxxxx
test
model_name: FFN
xxxxxxxxxxxxxxxxxxxxxxxxx
test
model_name: FFN
xxxxxxxxxxxxxxxxxxxxxxxxx
test
model_name: FFN
xxxxxxxxxxxxxxxxxxxxxxxxx
test
model_name: FFN
xxxxxxxxxxxxxxxxxxxxxxxxx
run_name: run_0
run_name: run_1
run_name: run_2
run_name: run_3
run_name: run_4
--- max normalization ---
Running model: FFN, exp_name: protospacer_extended, saved at ./output/FFN_v2/protospacer_extended/train_val
cpu
test
model_name: FFN
xxxxxxxxxxxxxxxxxxxxxxxxx
test
model_name: FFN
xxxxxxxxxxxxxxxxxxxxxxxxx
test
model_name: FFN
xxxxxxxxxxxxxxxxxxxxxxxxx
test
model_name: FFN
xxxxxxxxxxxxxxxxxxxxxxxxx
test
model_name: FFN
xxxxxxxxxxxxxxxxxxxxxxxxx
run_name: run_0
run_name: run_1
run_name: run_2
run_name: run_3
run_name: run_4
--- max normalization ---
Running model: CNN, exp_name: protospacer, saved at ./output/CNN_v2/protospacer/train_

In [8]:
# Show the stored evaluation result of every (model, experiment) pair,
# in the order the entries were added to res_desc.
for model_name, exp_results in res_desc.items():
    for exp_name, exp_result in exp_results.items():
        print(f'model_name: {model_name}, exp_name: {exp_name}')
        display(exp_result)
        print('='*15)


model_name: FFN, exp_name: protospacer


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman,0.709838,0.733197,0.719227,0.717508,0.699367,0.715827,0.717508,0.012479
pearson,0.768133,0.765097,0.745308,0.740168,0.735278,0.750797,0.745308,0.014907
MAE,4.321987,4.451395,4.273986,4.441211,4.413304,4.380377,4.413304,0.078353


model_name: FFN, exp_name: protospacer_extended


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman,0.737713,0.743325,0.712711,0.725644,0.713233,0.726525,0.725644,0.013926
pearson,0.784637,0.778563,0.745203,0.748264,0.735172,0.758368,0.748264,0.02186
MAE,4.126849,4.057595,4.278763,4.311938,4.421915,4.239412,4.278763,0.146496


model_name: CNN, exp_name: protospacer


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman,0.769019,0.789019,0.763319,0.771624,0.758249,0.770246,0.769019,0.011702
pearson,0.829258,0.811868,0.793431,0.804372,0.801098,0.808006,0.804372,0.013601
MAE,3.703795,3.740852,4.105558,3.915196,3.766388,3.846358,3.766388,0.165661


model_name: CNN, exp_name: protospacer_extended


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman,0.764276,0.783819,0.764906,0.771028,0.760843,0.768974,0.764906,0.009074
pearson,0.825335,0.818067,0.794789,0.800273,0.802215,0.808136,0.802215,0.012937
MAE,3.808375,3.75236,3.845842,3.931996,3.737353,3.815185,3.808375,0.078479


model_name: RNN, exp_name: protospacer


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman,0.76532,0.786523,0.768976,0.766275,0.758588,0.769136,0.766275,0.010444
pearson,0.810613,0.815035,0.795311,0.797432,0.796956,0.803069,0.797432,0.009075
MAE,3.832806,3.877604,3.86951,3.945184,3.703842,3.845789,3.86951,0.089123


model_name: RNN, exp_name: protospacer_extended


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman,0.769832,0.778847,0.753758,0.750929,0.768922,0.764458,0.768922,0.011762
pearson,0.815824,0.803886,0.780654,0.756225,0.809744,0.793267,0.803886,0.024618
MAE,3.782751,3.880668,3.981464,4.264782,3.534493,3.888832,3.880668,0.267766


model_name: Transformer, exp_name: protospacer


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman,0.682484,0.703473,0.695263,0.72478,0.690616,0.699323,0.695263,0.016133
pearson,0.748878,0.755179,0.741985,0.745598,0.735357,0.745399,0.745598,0.007421
MAE,4.614115,4.626415,4.349097,4.427824,4.25395,4.45428,4.427824,0.16361


model_name: Transformer, exp_name: protospacer_extended


Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean,median,stddev
spearman,0.719099,0.728455,0.721773,0.708179,0.706992,0.716899,0.719099,0.00917
pearson,0.779659,0.761188,0.765637,0.732273,0.750374,0.757826,0.761188,0.017731
MAE,4.266432,4.219082,4.139365,4.481317,4.076254,4.23649,4.219082,0.15512


