1. Import libraries

In [17]:
#import 
from function import search_highly_correlated_variables_cv, boruta_cv, T2_value, knn

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

2. Settings

In [18]:
#random seeds
rseed_cv = 42 # random_state of the shuffled KFold splitter in the run cell
rseed_boruta = 1 # forwarded to boruta_cv as its rseed_boruta argument

In [19]:
#select dataset, monomer, descriptors
#these values are substituted into the input file paths, the sheet name and the target column name in the run cell
data_set = "1&2" #data set identifier inserted into the "../data/XY/data_set_{}/..." paths (e.g. "1&2")
monomer = "St" #"St" or "nBA"; used as the Excel sheet name and as the prefix of the "{}_PDI" target column
descriptors = "mechanism_oriented" #"Mordred" or "mechanism_oriented"; selects which descriptor workbook is read
perc = 90 #passed to boruta_cv as perc (90 in the original setting)

3. Run model

In [20]:
def _record_ad_flags(df_AD, test_index, Xtr_part, Xts_part, suffix):
    """Write the four applicability-domain (AD) results of one CV fold into ``df_AD``.

    The same four checks are run for the full descriptor set and for the
    selected descriptor set; only the column suffix differs.

    Parameters
    ----------
    df_AD : pd.DataFrame
        Result table, modified in place. Must contain the columns
        "Bounding_box_<suffix>", "3sigma_<suffix>", "T2_value_<suffix>"
        and "3-NN_<suffix>".
    test_index : pd.Index
        Row labels of ``df_AD`` that belong to the held-out sample(s).
    Xtr_part, Xts_part : pd.DataFrame
        Training and test descriptors for this fold (already scaled by the caller).
    suffix : str
        "All" or "Sel"; appended to the column names.
    """
    # bounding box: names of the descriptors for which a test value lies
    # outside the [min, max] range observed in the training data
    below_min = (Xts_part < Xtr_part.min()).any()
    above_max = (Xts_part > Xtr_part.max()).any()
    df_AD.loc[test_index, "Bounding_box_" + suffix] = str(Xts_part.columns[below_min | above_max])

    # 3sigma: names of the descriptors whose absolute value exceeds 3
    # (the caller standardizes with the training-set scaler, so this is in
    # units of training standard deviations for the "All" set; presumably the
    # same holds for the "Sel" set -- TODO confirm the selection helpers do not rescale)
    beyond_3sigma = (np.abs(Xts_part) > 3).any()
    df_AD.loc[test_index, "3sigma_" + suffix] = str(Xts_part.columns[beyond_3sigma])

    # T2_value: inside the AD when the test T2 does not exceed the training maximum
    # NOTE(review): the meaning of the third argument (6) is defined in function.T2_value -- confirm
    T2_test, T2_train_max = T2_value(Xtr_part, Xts_part, 6)
    df_AD.loc[test_index, "T2_value_" + suffix] = T2_test <= T2_train_max

    # 3-NN: inside the AD when the k-NN distance measure does not exceed the threshold
    # NOTE(review): the first two arguments (3, 1) are defined in function.knn;
    # presumably 3 is the number of neighbours -- confirm
    mean_of_knn_distance_pre, ad_threshold = knn(3, 1, Xtr_part, Xts_part)
    df_AD.loc[test_index, "3-NN_" + suffix] = mean_of_knn_distance_pre <= ad_threshold


if __name__ == "__main__":
    # file & preprocessing
    # build the descriptor workbook path once; it is needed again in the fallback below
    descriptor_path = "../data/XY/data_set_{}/data_set_{}_{}_descriptors.xlsx".format(data_set, data_set, descriptors)
    try:
        df_X = pd.read_excel(descriptor_path, index_col = 0, sheet_name = monomer) #descriptors select
    except ValueError:
        # the workbook has no sheet named after the monomer: fall back to its first sheet
        print(f"Sheet '{monomer}' not found. Loading the first sheet instead.")
        df_X = pd.read_excel(descriptor_path, index_col = 0)

    df_Y = pd.read_excel("../data/XY/data_set_{}/data_set_{}_SMILES&objective_function.xlsx".format(data_set, data_set), index_col = 0)

    # descriptors first, objective columns after, aligned on the shared index
    df = pd.concat([df_X, df_Y], axis=1)

    df.index = df.index.astype("str")
    X = df.iloc[:, :len(df_X.columns)]
    y = np.log(df["{}_PDI".format(monomer)]-1) # St_PDI or nBA_PDI

    # one fold per sample, i.e. each sample is held out exactly once
    fold = len(df.index)
    kf = KFold(n_splits=fold, shuffle=True, random_state=rseed_cv)

    df_AD = pd.DataFrame(index = X.index, columns = ["Bounding_box_All", "Bounding_box_Sel", "3sigma_All", "3sigma_Sel","T2_value_All", "T2_value_Sel","3-NN_All", "3-NN_Sel"])
    for tridx, tsidx in kf.split(X, y):
        Xtr, Xts = X.iloc[tridx], X.iloc[tsidx]
        ytr = y.iloc[tridx]

        #scaling (the scaler is fitted on the training part only)
        scaler_X_knn = StandardScaler()
        scaler_X_knn.fit(Xtr)
        Xtr_sc = pd.DataFrame(scaler_X_knn.transform(Xtr), index = Xtr.index, columns = Xtr.columns) # standardize X with the fitted scaler
        Xts_sc = pd.DataFrame(scaler_X_knn.transform(Xts), index = Xts.index, columns = Xts.columns) # standardize X with the fitted scaler

        #All: AD checks on every descriptor
        _record_ad_flags(df_AD, Xts.index, Xtr_sc, Xts_sc, "All")

        #Sel: AD checks on the descriptors kept by correlation filtering + Boruta
        Xtr_sel, Xts_sel = search_highly_correlated_variables_cv(Xtr_sc, Xts_sc, 0.8)
        Xtr_sel, Xts_sel = boruta_cv(Xtr_sel, ytr, Xts_sel, perc = perc, rseed_boruta = rseed_boruta) # select the best perc
        _record_ad_flags(df_AD, Xts.index, Xtr_sel, Xts_sel, "Sel")

    dirname = "../result/AD"
    os.makedirs(dirname, exist_ok = True)

    # join with os.path.join: the previous dirname + "./AD_..." produced
    # "../result/AD./AD_...", which points to a different (non-existent) directory on POSIX
    df_AD.to_excel(os.path.join(dirname, "AD_{}_{}.xlsx".format(monomer, descriptors)))

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	23
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	23
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	23
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	23
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	23
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	23
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	23
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	2
Tentative: 	8
Rejected: 	13
Iteration: 	9 / 100
Confirmed: 	2
Tentative: 	8
Rejected: 	13
Iteration: 	10 / 100
Confirmed: 	2
Tentative: 	8
Rejected: 	13
Iteration: 	11 / 100
Confirmed: 	2
Tentative: 	8
Rejected: 	13
Iteration: 	12 / 100
Confirmed: 	2
Tentative: 	4
Rejected: 	17
Iteration: 	13 / 100
Confirmed: 	2
Tentative: 	4
Rejected: 	17
Iteration: 	14 / 100
Confirmed: 	2
Tentative: 	4
Rejected: 	17
Iteration: 	15 / 100
Confirmed: 	2
Tentative: 	4
Rejected: 	17
Iteration: 	16 / 100
Confirmed: 	3
Tentative: 	3
Rejected: 	17
I

KeyboardInterrupt: 