## XGBOD accuracy comparison--GB1_2016
In GB1_2016, all the possible mutations have been measured combinatorially, and the dataset has been studied extensively in the previous literature. Here, we use GB1_2016 as an example to check the XGBOD outlier detection accuracy with different fitness thresholds.
To use this notebook, please change the global control variables and read the comments.

In [None]:
import numpy as np
import torch
import pandas as pd
import odbo
import os
import matplotlib.pyplot as plt

In [None]:
# ---- Experiment settings ----
dataset_name = 'GB1_2016'
# Seed used for the trial (fed to NumPy and to the XGBOD model below).
random_seed = 0
# Number of new observations; values noted by the author for other
# datasets: GB1_2014=100, BRCA1=50, avGFP_2016=50.
search_iter = 50

# ---- Initialization protocol ----
# How the round 0 experiments that initiate BO are found. For datasets with
# few changes in the sequences, 'correlate' mode is recommended.
update_method = 'independent'
# Whether the top scoring experiments may take the abundance of a mutation
# at different sites into account.
allow_abundance = True

# ---- Featurization settings ----
# Switching order of the feature spaces, meant to overcome local maxima in
# one particular representation.
method = ['Avg', 'Max', 'Avg', 'Max']
# Feature computing mode.
mode = 'independent'
# Fitness threshold to be tested (passed as `thres` when labelling outliers).
threshold = 0.05
# If True, plot the confusion matrix to check the accuracy of the search
# space prescreening.
cMat_plot = True

In [None]:
# Load dataset
np.random.seed(random_seed)
data_test = pd.read_csv('../datasets/GB1_2016_149361.csv', sep=',')
name_pre = np.array(data_test['AACombo'])
Y_test = np.array(data_test['Fitness'])
# Shuffle every entry except the first one. The shuffle acts in place on a
# view of `shuffle_order`, so index 0 keeps its position; the seed set above
# fixes the resulting order.
shuffle_order = np.arange(len(Y_test))
np.random.shuffle(shuffle_order[1:])
name_pre[1:], Y_test[1:] = name_pre[shuffle_order[1:]], Y_test[shuffle_order[1:]]
name = odbo.utils.code_to_array(name_pre)

# Load the preselected Round 0 experiments (made with a certain shuffling
# order) so that different trials start from the same experiments.
# Both cache files are loaded, so both must exist: checking only the first
# one would crash in np.load when the fitness file is missing instead of
# falling back to a fresh initial design.
sele_name_file = 'sele_experiment_GB1_2016.npy'
sele_fitness_file = 'sele_fitness_GB1_2016.npy'
if os.path.isfile(sele_name_file) and os.path.isfile(sele_fitness_file):
    name_sele = np.load(sele_name_file)
    Y_train = np.load(sele_fitness_file)
else:
    # Ask for each AA code to show up at least twice at every site
    # (one `least_occurance` entry of 2 per column of `name`).
    sele_indices = odbo.initialization.initial_design(
        name,
        least_occurance=2 * np.ones(name.shape[1]),
        allow_abundance=allow_abundance,
        update_method=update_method,
        verbose=True,
    )
    # Initial experiments are name_sele with fitness Y_train
    name_sele, Y_train = name[sele_indices, :], Y_test[sele_indices]
print('Selected initial experiments no. is ', len(Y_train))
print('Select max Y: ', Y_train.max())

In [None]:
# Featurize with MassiveFeatureTransform (chosen because GB1 2016 mutates all
# the sites). The transform is built from the Round 0 selection and uses the
# first entry of `method`.
feature_model = odbo.featurization.MassiveFeatureTransform(
    raw_vars=name_sele,
    Y=Y_train,
    method=method[0],
    mode=mode,
)
X_train = feature_model.transform(name_sele)
X_test = feature_model.transform(name)

# Label the training points as outliers or inliers using the threshold.
labels_train = odbo.prescreening.sp_label(X_train, Y_train, thres=threshold)

# Fit the XGBOD model that defines the adaptive search space.
pre_model = odbo.prescreening.XGBOD(
    eval_metric='error',
    random_state=random_seed,
)
pre_model.fit(X_train, labels_train)

# Reference labels and model predictions over the entire search space.
labels_test = odbo.prescreening.sp_label(X_test, Y_test, thres=threshold)
pred_test_labels = pre_model.predict(X_test)
# Entries predicted as label 0 form the adaptive search space
# (presumably label 0 marks inliers -- confirm against odbo.prescreening).
sele_id_test = list(np.where(pred_test_labels == 0)[0])

# Optionally plot the confusion matrix to check the prescreening accuracy.
if cMat_plot:
    out_outlier, in_outlier, out_inlier, in_inlier = odbo.plot.plot_cm(
        labels_test, pred_test_labels, Y_test
    )
    # Counts only; the divisions stay in the print calls so the output order
    # matches the one-liner form.
    correct_count = len(out_outlier) + len(in_inlier)
    fp_denominator = len(in_outlier) + len(out_outlier)
    fn_denominator = len(out_inlier) + len(in_inlier)
    print("Correct ratio: {0:.3%}".format(correct_count / len(labels_test)))
    print("FP ratio: {0:.3%}".format(len(in_outlier) / fp_denominator))
    print("FN ratio: {0:.3%}".format(len(out_inlier) / fn_denominator))
    print("Adapt space size, Entire space size: ", len(sele_id_test), name.shape[0])