In [1]:
import pickle
import torch
import pickle
import pandas as pd
import csv
from helper.model import Net
from helper.utils import *
from helper.distance_map import *
from helper.evaluate import *
from eval import *
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, average_precision_score, roc_auc_score, accuracy_score, balanced_accuracy_score, f1_score

# Global tensor settings shared by every cell below.
dtype = torch.float32
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing in the first model cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Use maximum separation to call EC

In [2]:
# --- Configuration ---------------------------------------------------------
# args_model_name selects the checkpoint under ./model/, the two *_data names
# select the CSV splits under ./data/.
args_model_name = "uniref10_train_split_1_3200"
args_test_data = "uniref10_test_split_1_curate"
args_train_data = "uniref10_train_split_1"

args_hidden_dim = 512
args_out_dim = 128

# --- Load the train / test lookups ----------------------------------------
# presumably (id -> EC numbers, EC number -> ids); verify in helper.utils
id_ec_train, ec_id_dict_train = get_ec_id_dict(
    './data/' + args_train_data + '.csv')
id_ec_test, ec_id_dict_test = get_ec_id_dict(
    './data/' + args_test_data + '.csv')

# --- Restore the trained model ---------------------------------------------
model = Net(args_hidden_dim, args_out_dim, device, dtype)
# Build the path from args_model_name so the variable above really controls
# which checkpoint is loaded; map_location lets a checkpoint saved on one
# device be restored onto whatever `device` currently is.
checkpoint = torch.load(
    './model/' + args_model_name + '.pth', map_location=device)
model.load_state_dict(checkpoint)
# NOTE(review): the model is never switched to eval mode here. If Net contains
# dropout / batch-norm layers, a model.eval() call is probably wanted - confirm.

# --- Embed and build the test-vs-train distance map ------------------------
emb_train = model(esm_embedding(ec_id_dict_train, device, dtype))
emb_test = model_embedding_test(id_ec_test, model, device, dtype)
# Per the recorded output, this compares every test id against the train
# EC cluster centers.
eval_dist = get_dist_map_test(
    emb_train, emb_test, ec_id_dict_train, id_ec_test, device, dtype)
eval_df = pd.DataFrame.from_dict(eval_dist)
#eval_df.to_csv('./eval/distmap_' + args_test_data + '.csv')

# --- Call EC numbers --------------------------------------------------------
out_filename = './eval/' + args_test_data
all_test_EC = write_top10_choices(eval_df, out_filename)
## maximum separation results
write_max_sep_choices(eval_df, out_filename, first_grad=False, use_max_grad=False)

## get preds and true labels
# presumably reads back the '_maxsep' results written just above; verify
pred_label = get_pred_labels(out_filename, pred_type='_maxsep')
true_label, all_label = get_true_labels('./data/'+args_test_data)

The embedding sizes for train and test: torch.Size([25584, 128]) torch.Size([5492, 128])


100%|██████████| 4223/4223 [00:00<00:00, 15328.37it/s]


Calculating eval distance map, between 5492 test ids and 4223 train EC cluster centers


5492it [00:04, 1332.52it/s]


In [4]:
# Score the max-separation predictions (pred_label) against the ground truth
# (true_label) over the label set all_label; the recorded output is a tuple of
# five floats.
# NOTE(review): which metric each of the five values represents is defined in
# print_eval_res - confirm there before quoting these numbers.
# NOTE(review): this cell carries execution count 4 but sits before In [3], so
# it was run out of order; re-check the output after Restart & Run All.
print_eval_res(pred_label, true_label, all_label)



(0.5606791940374495,
 0.8212750569070556,
 0.6253859500002147,
 0.9104000577860419,
 0.48015294974508377)

#### Use random nk samples to call EC

In [3]:
# --- Configuration ---------------------------------------------------------
# args_model_name selects the checkpoint under ./model/, the two *_data names
# select the CSV splits under ./data/.
args_model_name = "uniref10_train_split_1_3200"
args_test_data = "uniref10_test_split_1_curate"
args_train_data = "uniref10_train_split_1"

args_hidden_dim = 512
args_out_dim = 128

# --- Load the train / test lookups ----------------------------------------
# presumably (id -> EC numbers, EC number -> ids); verify in helper.utils
id_ec_train, ec_id_dict_train = get_ec_id_dict(
    './data/' + args_train_data + '.csv')
id_ec_test, ec_id_dict_test = get_ec_id_dict(
    './data/' + args_test_data + '.csv')

# --- Restore the trained model ---------------------------------------------
model = Net(args_hidden_dim, args_out_dim, device, dtype)
# Build the path from args_model_name so the variable above really controls
# which checkpoint is loaded; map_location lets a checkpoint saved on one
# device be restored onto whatever `device` currently is.
checkpoint = torch.load(
    './model/' + args_model_name + '.pth', map_location=device)
model.load_state_dict(checkpoint)
# NOTE(review): the model is never switched to eval mode here. If Net contains
# dropout / batch-norm layers, a model.eval() call is probably wanted - confirm.

# --- Embed and build the test-vs-train distance map ------------------------
emb_train = model(esm_embedding(ec_id_dict_train, device, dtype))
emb_test = model_embedding_test(id_ec_test, model, device, dtype)
# Per the recorded output, this compares every test id against the train
# EC cluster centers.
eval_dist = get_dist_map_test(
    emb_train, emb_test, ec_id_dict_train, id_ec_test, device, dtype)
eval_df = pd.DataFrame.from_dict(eval_dist)
#eval_df.to_csv('./eval/distmap_' + args_test_data + '.csv')
out_filename = './eval/' + args_test_data
all_test_EC = write_top10_choices(eval_df, out_filename)

# --- Random-nk EC calling ----------------------------------------------------
# Seed first so the random sample drawn below is reproducible.
seed_everything()
rand_nk_ids, rand_nk_emb_train = random_nk_model(
    id_ec_train, emb_train, n=2, weighted=False)
# Distance map built from the train embeddings and the randomly drawn subset;
# it is handed to write_random_nk_choices together with the p-value cut-off.
random_nk_dist_map = get_random_nk_dist_map(
    emb_train, rand_nk_emb_train, ec_id_dict_train, rand_nk_ids, device, dtype)
write_random_nk_choices(
    eval_df, out_filename, random_nk_dist_map, p_value=0.001)


The embedding sizes for train and test: torch.Size([25584, 128]) torch.Size([5492, 128])


100%|██████████| 4223/4223 [00:00<00:00, 16858.40it/s]


Calculating eval distance map, between 5492 test ids and 4223 train EC cluster centers


5492it [00:04, 1335.60it/s]
100%|██████████| 4223/4223 [00:00<00:00, 17131.76it/s]
2000it [00:01, 1372.68it/s]
100%|██████████| 5492/5492 [00:15<00:00, 349.46it/s]


#### Refactor: symmetric distance map for the training set

In [None]:
# Load everything needed to work with the split-0 training set: the id/EC
# lookups, the precomputed distance map, the trained model and the ESM
# embeddings.
args_model_name = "uniref10_train_split_0"
args_hidden_dim = 512
args_out_dim  = 128
id_ec, ec_id_dict = get_ec_id_dict('./data/' + args_model_name + '.csv')
# Same mapping as ec_id_dict, but with each value materialised as a list.
ec_id = {key: list(ids) for key, ids in ec_id_dict.items()}
# Use a context manager so the file handle is closed once unpickling is done.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('./data/distance_map/' + args_model_name + '.pkl', 'rb') as dist_map_file:
    dist_map = pickle.load(dist_map_file)
model = Net(args_hidden_dim, args_out_dim, device, dtype)
# map_location lets a checkpoint saved on one device be restored onto
# whatever `device` currently is.
checkpoint = torch.load(
    './model/uniref10_train_split_0_50.pth', map_location=device)
model.load_state_dict(checkpoint)
esm_emb = esm_embedding(ec_id_dict, device, dtype)

In [None]:
# Build the distance map over the ESM embeddings of the split-0 training
# set and save it as a CSV.
args_train_file = "uniref10_train_split_0"

# use cpu and high precision by default
# NOTE: this rebinds the notebook-wide `device` / `dtype`, so any cell run
# after this one sees CPU / float64 instead of the values set at the top.
device = torch.device("cpu")
dtype = torch.float64

_, ec_id_dict = get_ec_id_dict(f'./data/{args_train_file}.csv')
esm_emb = esm_embedding(ec_id_dict, device, dtype)
esm_dist = get_dist_map(ec_id_dict, esm_emb, device, dtype)
esm_df = pd.DataFrame.from_dict(esm_dist)

# Optional: also keep the raw dict as a pickle.
# pickle.dump(esm_dist, open('./data/distance_map/' +
#             args_train_file + '.pkl', 'wb'))
esm_df.to_csv(f'./data/distance_map/{args_train_file}_df.csv')