In [None]:
from hyperparams import *
from rawdata_preprocessing import *
#from RandomForestClassifier import RF_Classifying
#from SVMClassifier import SVM_Classifying

import Bio
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier

from sklearn.svm import SVC
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

from xgboost import plot_importance
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, auc, confusion_matrix, roc_curve
import math

#from warnings import filterwarnings
#filterwarnings('ignore')

from Logger import *
from save_best_output import save_best_output

# Module-level registry of evaluation results, keyed by model name.
# Voting_Classifying stores its metric array under 'Voting-Classifier'.
model_metrics: dict = {}

def calc_metrics(y_label, y_proba):
    """Compute binary-classification metrics from labels and positive-class scores.

    Scores are turned into hard predictions with a 0.5 threshold for the
    confusion-matrix based metrics; the ROC AUC uses the raw scores.

    Args:
        y_label: True labels, expected to be 0/1 (ints or floats).
        y_proba: Score of the positive class for each sample.

    Returns:
        Tuple ``(Acc, Sn, Sp, Pre, MCC, AUC)``: accuracy, sensitivity
        (recall), specificity, precision, Matthews correlation coefficient
        and area under the ROC curve. Any ratio whose denominator is zero
        is reported as 0.
    """
    y_pred = [1 if x >= 0.5 else 0 for x in y_proba]
    # Pin the label set so the matrix is always 2x2; without it a split that
    # contains a single class yields a 1x1 matrix and the indexing below fails.
    con_matrix = confusion_matrix(y_label, y_pred, labels=[0, 1])
    TN = float(con_matrix[0][0])
    FP = float(con_matrix[0][1])
    FN = float(con_matrix[1][0])
    TP = float(con_matrix[1][1])
    P = TP + FN
    N = TN + FP
    Sn = TP / P if P > 0 else 0
    Sp = TN / N if N > 0 else 0
    Acc = (TP + TN) / (P + N) if (P + N) > 0 else 0
    Pre = TP / (TP + FP) if (TP + FP) > 0 else 0
    MCC = 0
    tmp = math.sqrt((TP + FP) * (TP + FN)) * math.sqrt((TN + FP) * (TN + FN))
    if tmp != 0:
        MCC = (TP * TN - FP * FN) / tmp
    fpr, tpr, _ = roc_curve(y_label, y_proba)
    AUC = auc(fpr, tpr)
    return Acc, Sn, Sp, Pre, MCC, AUC

def Voting_Classifying(X, y, KFOLD_TIME):
    """Grid-search a soft-voting ensemble and evaluate it on a held-out split.

    The ensemble combines a random forest, an RBF SVC, gradient boosting and
    XGBoost. The data is split 80/20 (``random_state=5``); the grid search
    is fitted on the training part and scored on the test part.

    Side effect: the metrics returned by ``calc_metrics`` for the test split
    are stored in the module-level ``model_metrics['Voting-Classifier']``.

    Args:
        X: Feature matrix (anything ``train_test_split`` accepts).
        y: Target labels aligned with ``X``.
        KFOLD_TIME: Value passed to ``GridSearchCV`` as ``cv``.

    Returns:
        Tuple ``(score, best_params, best_estimator)`` where ``score`` is
        ``GridSearchCV.score`` on the held-out test split.
    """
    RFC_best = RandomForestClassifier(random_state=2)
    SVMC_best = SVC(probability=True, random_state=2)
    GBC_best = GradientBoostingClassifier(random_state=2)
    # XGBClassifier always provides predict_proba; the SVC-style
    # `probability=True` flag is not an XGBoost parameter.
    XGBC_best = XGBClassifier(random_state=2)

    VC = VotingClassifier(
        estimators=[
            ('rfc', RFC_best),
            ('svc', SVMC_best),
            ('gbc', GBC_best),
            ('xgb', XGBC_best),
        ],
        voting='soft', n_jobs=-1, verbose=10)

    # Single-point grid: every list holds one value, so the search only
    # cross-validates this one configuration. The former xgb__kernel, xgb__C
    # and xgb__num_iterations entries were dropped because they are not
    # XGBoost parameters.
    param_grid = {
        'gbc__n_estimators': [100], 'gbc__max_depth': [6], 'gbc__min_samples_leaf': [3],
        'gbc__min_samples_split': [2], 'gbc__learning_rate': [0.05],
        'svc__kernel': ['rbf'], 'svc__C': [0.1], 'svc__gamma': [0.1], 'svc__random_state': [2],
        'xgb__gamma': [0.1], 'xgb__random_state': [2], 'xgb__learning_rate': [0.1],
        'xgb__n_estimators': [100], 'xgb__max_depth': [100],
        'rfc__n_estimators': [30], 'rfc__max_depth': [6], 'rfc__min_samples_leaf': [8],
        'rfc__min_samples_split': [20], 'rfc__max_leaf_nodes': [10],
    }

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    gsVC = GridSearchCV(estimator=VC, param_grid=param_grid, cv=KFOLD_TIME, n_jobs=-1, verbose=10)

    gsVC = gsVC.fit(X_train, y_train)
    score = gsVC.score(X_test, y_test)
    y_test_predict = gsVC.predict_proba(X_test)
    # y_test is 1-D, so only its length and values are logged.
    logger.debug('len : {0} - y_test : {1}'.format(len(y_test), y_test))
    logger.debug('len : {0} - y_test_predict : {1} - y_test_predict[:, 1] : {2}'.format(len(y_test_predict), y_test_predict, y_test_predict[:, 1]))
    # Column 1 of predict_proba is the score of the second class in classes_.
    model_metrics['Voting-Classifier'] = np.array(calc_metrics(y_test, y_test_predict[:, 1]))
    logger.debug(model_metrics)

    logger.debug('GridSearchCV를 이용한 최적 매개변수 점수 ==> {:.3f}'.format(gsVC.best_score_))
    logger.debug('GridSearchCV를 이용한 최적 매개변수 ==> {}'.format(gsVC.best_params_))
    logger.warning('GridSearchCV를 이용한 test점수 ==> {:.3f}'.format(score))
    logger.debug('GridSearchCV를 이용한 최고 성능 모델 ==> \n{}'.format(gsVC.best_estimator_))

    return score, gsVC.best_params_, gsVC.best_estimator_


def Bagging_Classifying(X, y, KFOLD_TIME) :
    """Unfinished stub for a bagging-based classifier.

    Only builds a default ``BaggingClassifier``; the arguments are not used
    and nothing is fitted. Returns ``None``.
    """
    bagger = BaggingClassifier()
    return None
    
"""
def GBC_Classifying(X, y, KFOLD_TIME) :
    print("=======================================================")
    print("GBC_Classifying ... ")
    gbrt = GradientBoostingClassifier(random_state = 0)
    '''
    param_grid = {
        'n_estimators' : [100, 200], 
        'max_depth' : [6,8,10,12], 
        'min_samples_leaf': [3,5,7,10], 
        'min_samples_split' : [2,3,5,10], 
        'learning_rate' : [0.05, 0.1, 0.2]
    }
    '''
    param_grid = {'n_estimators' : [100], 'max_depth' : [6], 'min_samples_leaf': [3], 'min_samples_split' : [2], 'learning_rate' : [0.05]}

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11)
    gsGBRT = GridSearchCV(gbrt, param_grid = param_grid, cv=KFOLD_TIME,
                         scoring="accuracy", n_jobs=6, verbose=2)
    
    '''
    gsGBRT.fit(X_train, y_train)
    score = gsGBRT.score(X_test, y_test)
    print('GridSearchCV를 이용한 최적 매개변수 점수 ==> {:.3f}'.format(gsGBRT.best_score_))
    print('GridSearchCV를 이용한 최적 매개변수 ==> {}'.format(gsGBRT.best_params_))
    print('GridSearchCV를 이용한 test점수 ==> {:.3f}'.format(score))
    print('GridSearchCV를 이용한 최고 성능 모델 ==> \n{}'.format(gsGBRT.best_estimator_))
    '''
    return score, gsGBRT.best_params_, gsGBRT.best_estimator_

def SVM_Classifying(X, y, KFOLD_TIME) :
    print("=======================================================")
    print("SVM_Classifying ... ")
    SVMC = SVC(probability=True)
    param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    #param_range = [0.1, 1.0]
    '''
    param_grid = [
        {'kernel' : ['rbf'], 'C' : param_range, 'gamma': param_range, 'random_state' : [2] },
        {'kernel' = ['poly'], 'C' : param_range, 'gamma': param_range, 'random_state' : [2] },
        {'kernel' = ['linear'], 'C' : param_range, 'random_state' : [2] },
        {'kernel' = ['sigmoid'], 'C' : param_range, 'gamma': param_range, 'random_state' : [2] }
    ]
    '''
    param_grid = [
        {'kernel' : ['rbf'], 'C' : param_range, 'gamma': param_range, 'random_state' : [2] }  
    ]

    #['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11)
    gsSVMC = GridSearchCV(SVMC, param_grid = param_grid, cv=KFOLD_TIME,
                         scoring="accuracy", n_jobs=6, verbose=2)


    '''
    gsSVMC.fit(X, y)
    score = gsSVMC.score(X_test, y_test)

    SVMC_best = gsSVMC.best_estimator_

    print("best score : {}".format(gsSVMC.best_score_))
    print("best parameters : {}".format(gsSVMC.best_params_))
    print("train set score : {}".format(gsSVMC.score(X, y)))
    '''
    return score, gsSVMC.best_params_, gsSVMC.best_estimator_
"""
def XGB_Classifying(X, y, KFOLD_TIME, param_grid):
    """Grid-search an XGBoost classifier and score it on a held-out split.

    The data is split 80/20 (``random_state=11``); the search optimises
    accuracy on the training part and is then scored on the test part.

    Args:
        X: Feature matrix (anything ``train_test_split`` accepts).
        y: Target labels aligned with ``X``.
        KFOLD_TIME: Value passed to ``GridSearchCV`` as ``cv``.
        param_grid: Parameter grid forwarded to ``GridSearchCV``.

    Returns:
        Tuple ``(score, best_params, best_estimator)`` where ``score`` is the
        accuracy on the held-out test split.
    """
    logger.debug("=======================================================")
    logger.debug("XGB_Classifying ...")
    # XGBClassifier always provides predict_proba; the SVC-style
    # `probability=True` flag is not an XGBoost parameter.
    XGBC = XGBClassifier()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    gsXGBC = GridSearchCV(XGBC, param_grid=param_grid, cv=KFOLD_TIME,
                          scoring="accuracy", n_jobs=-1, verbose=10)

    gsXGBC.fit(X_train, y_train)
    score = gsXGBC.score(X_test, y_test)
    logger.debug('GridSearchCV를 이용한 최적 매개변수 점수 ==> {:.3f}'.format(gsXGBC.best_score_))
    logger.debug('GridSearchCV를 이용한 최적 매개변수 ==> {}'.format(gsXGBC.best_params_))
    logger.debug('GridSearchCV를 이용한 test점수 ==> {:.3f}'.format(score))
    logger.debug('GridSearchCV를 이용한 최고 성능 모델 ==> \n{}'.format(gsXGBC.best_estimator_))

    return score, gsXGBC.best_params_, gsXGBC.best_estimator_

def LGBM_Classifying(X, y, KFOLD_TIME, param_grid) :
    """Grid-search a LightGBM classifier and score it on a held-out split.

    Splits the data 80/20 (``random_state=11``), runs an accuracy-scored
    ``GridSearchCV`` over ``param_grid`` on the training part, logs the
    outcome and returns ``(test_score, best_params, best_estimator)``.
    """
    print("=======================================================")
    print("LGBM_Classifying ...")

    # Convert all four pieces of the split to plain ndarrays in one pass.
    pieces = train_test_split(X, y, test_size=0.2, random_state=11)
    train_X, test_X, train_y, test_y = (np.array(piece) for piece in pieces)

    searcher = GridSearchCV(
        LGBMClassifier(random_state=2),
        param_grid=param_grid,
        cv=KFOLD_TIME,
        scoring="accuracy",
        n_jobs=-1,
        verbose=10,
    )
    searcher.fit(train_X, train_y)
    test_score = searcher.score(test_X, test_y)

    report = (
        ('GridSearchCV를 이용한 최적 매개변수 점수 ==> {:.3f}', searcher.best_score_),
        ('GridSearchCV를 이용한 최적 매개변수 ==> {}', searcher.best_params_),
        ('GridSearchCV를 이용한 test점수 ==> {:.3f}', test_score),
        ('GridSearchCV를 이용한 최고 성능 모델 ==> \n{}', searcher.best_estimator_),
    )
    for template, value in report:
        logger.debug(template.format(value))

    return test_score, searcher.best_params_, searcher.best_estimator_

def RF_Classifying(X, y, KFOLD_TIME, param_grid):
    """Grid-search a random forest and score it on a held-out split.

    The data is split 80/20 (``random_state=11``); the search optimises
    accuracy with a shuffled ``KFold`` on the training part and is then
    scored on the test part.

    Args:
        X: Feature matrix (anything ``train_test_split`` accepts).
        y: Target labels aligned with ``X``.
        KFOLD_TIME: Number of folds for the shuffled ``KFold`` splitter.
        param_grid: Parameter grid forwarded to ``GridSearchCV``. When it is
            ``None`` or empty, a small single-point default grid is used.

    Returns:
        Tuple ``(score, best_params, best_estimator)`` where ``score`` is the
        accuracy on the held-out test split.
    """
    logger.debug("=======================================================")
    logger.debug("RF_Classifying ... ")
    RFC = RandomForestClassifier(random_state=2)

    # Honour the caller's grid; previously it was always overwritten by the
    # hard-coded grid below, so the argument had no effect.
    if not param_grid:
        param_grid = {'n_estimators': [10], 'max_depth': [6], 'min_samples_leaf': [8],
                      'min_samples_split': [8], 'max_leaf_nodes': [10]}

    kfold = KFold(n_splits=KFOLD_TIME, shuffle=True, random_state=11)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

    gsRFC = GridSearchCV(RFC, param_grid=param_grid, cv=kfold,
                         scoring="accuracy", n_jobs=-1, verbose=10)

    # Author's note (translated): this seems to mean that X_train / y_train
    # are themselves split (e.g. 0.8:0.2) for cross-validation?
    gsRFC.fit(X_train, y_train)

    score = gsRFC.score(X_test, y_test)
    logger.debug('GridSearchCV를 이용한 최적 매개변수 점수 ==> {:.3f}'.format(gsRFC.best_score_))
    logger.debug('GridSearchCV를 이용한 최적 매개변수 ==> {}'.format(gsRFC.best_params_))
    logger.debug('GridSearchCV를 이용한 test점수 ==> {:.3f}'.format(score))
    logger.debug('GridSearchCV를 이용한 최고 성능 모델 ==> \n{}'.format(gsRFC.best_estimator_))

    return score, gsRFC.best_params_, gsRFC.best_estimator_

def LR_Classifying(X, y, KFOLD_TIME, param_grid):
    """Grid-search a logistic regression and score it on a held-out split.

    The data is split 80/20 (``random_state=11``); the search optimises
    accuracy on the training part and is then scored on the test part.

    Args:
        X: Feature matrix (anything ``train_test_split`` accepts).
        y: Target labels aligned with ``X``.
        KFOLD_TIME: Value passed to ``GridSearchCV`` as ``cv``.
        param_grid: Parameter grid forwarded to ``GridSearchCV``. When it is
            ``None`` or empty, a default sweep over ``C`` is used.

    Returns:
        Tuple ``(score, best_params, best_estimator)`` where ``score`` is the
        accuracy on the held-out test split.
    """
    logger.debug("=======================================================")
    logger.debug("LogisticRegression_Classifying ... ")

    # Plain LogisticRegression (already imported by this file) is searched by
    # GridSearchCV; the previous LogisticRegressionCV name was never imported
    # here and does not expose a `C` parameter to tune.
    LRC = LogisticRegression(random_state=2)

    # The former hard-coded grid held tree/boosting keys (gamma,
    # learning_rate, n_estimators, max_depth) that logistic regression does
    # not accept, and it overwrote the caller's grid.
    if not param_grid:
        param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

    gsLRC = GridSearchCV(LRC, param_grid=param_grid, cv=KFOLD_TIME,
                         scoring="accuracy", n_jobs=-1, verbose=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
    gsLRC.fit(X_train, y_train)

    score = gsLRC.score(X_test, y_test)
    logger.debug('GridSearchCV를 이용한 최적 매개변수 점수 ==> {:.3f}'.format(gsLRC.best_score_))
    logger.debug('GridSearchCV를 이용한 최적 매개변수 ==> {}'.format(gsLRC.best_params_))
    logger.debug('GridSearchCV를 이용한 test점수 ==> {:.3f}'.format(score))
    logger.debug('GridSearchCV를 이용한 최고 성능 모델 ==> \n{}'.format(gsLRC.best_estimator_))
    return score, gsLRC.best_params_, gsLRC.best_estimator_

# Fold count that classify() hands to the classifier functions, which use it
# for GridSearchCV cross-validation.
KFOLD_TIME = 5
# Shuffled 3-fold splitter with a fixed seed; not referenced by the code
# visible in this file.
cv = KFold(n_splits=3, shuffle=True, random_state=5)

def _kmer_names(alphabet, max_len, stem=''):
    """List every string of length 1..max_len over `alphabet`, depth-first.

    With an alphabet given in sorted order the result is in lexicographic
    order ('A', 'AA', 'AAA', ..., 'AB', ...).
    """
    names = []
    for symbol in alphabet:
        word = stem + symbol
        names.append(word)
        if len(word) < max_len:
            names.extend(_kmer_names(alphabet, max_len, word))
    return names


# Column names for the XP block: all 1- to 3-letter words over A-G.
p_feature_names = _kmer_names('ABCDEFG', 3)
# Column names for the XR block: all 1- to 4-letter words over A, C, G, U.
r_feature_names = _kmer_names('ACGU', 4)
# Same XR names with an 'r_' prefix so they never collide with XP names.
xgb_r_feature_names = ['r_' + name for name in r_feature_names]

train_combined_arr = []
val_combined_arr = []

def classify(npz_path, param_grid):
    """Load one dataset archive and run the voting-ensemble classification on it.

    The ``.npz`` file must provide the arrays ``XP``, ``XR`` and ``Y``; they
    are placed side by side in a DataFrame whose columns are
    ``p_feature_names + xgb_r_feature_names + ['target']``.

    Args:
        npz_path: Path to the ``.npz`` archive.
        param_grid: Currently unused; ``Voting_Classifying`` defines its own
            grid. Kept so existing callers keep working.

    Returns:
        Whatever ``Voting_Classifying`` returns:
        ``(score, best_params, best_estimator)``.
    """
    logger.debug("Dataset : {}".format(npz_path))
    # NpzFile keeps the archive open until closed; read the arrays inside a
    # context manager so the file handle is released right away.
    with np.load(npz_path) as mydata:
        XP = mydata['XP']
        XR = mydata['XR']
        Y = mydata['Y']

    combined_pd = pd.DataFrame(data=np.c_[XP, XR, Y],
                               columns=p_feature_names + xgb_r_feature_names + ['target'])

    features = list(combined_pd.columns[:-1])
    X = combined_pd[features]
    y = combined_pd['target']

    return Voting_Classifying(X, y, KFOLD_TIME)

def classify_and_print_NPInter(dataset):
    """Classify the NPInter dataset, log its score and persist the best result.

    ``dataset`` maps dataset names to ``.npz`` paths; the "NPInter" entry is
    classified and the outcome is handed to ``save_best_output``.
    """
    logger.debug("=======================================================")
    result = classify(dataset["NPInter"], PARAM_GRID["NPInter"]["RFC"])
    best_score, best_params, best_model = result
    summary = "K_fold with {0} epoch : {1}".format(KFOLD_TIME, best_score)
    logger.warning(summary)
    # The estimator repr is flattened to a single line before saving.
    model_text = str(best_model).replace('\n', '')
    save_best_output(dataset["NPInter"], best_score, best_params, model_text)
    
def classify_and_print_RPI(size, dataset):
    """Classify one RPI dataset, log its score and persist the best result.

    ``size`` selects the entry under ``dataset["RPI"]`` (and the matching
    entry in ``PARAM_GRID["RPI"]``); the outcome is handed to
    ``save_best_output``.
    """
    logger.debug("=======================================================")
    result = classify(dataset["RPI"][size], PARAM_GRID["RPI"][size]["RFC"])
    best_score, best_params, best_model = result
    summary = "K_fold with {0} epoch : {1}".format(KFOLD_TIME, best_score)
    logger.warning(summary)

    # The estimator repr is flattened to a single line before saving.
    model_text = str(best_model).replace('\n', '')
    save_best_output(dataset["RPI"][size], best_score, best_params, model_text)

if __name__ == "__main__":
    # Script entry point: run every RPI dataset in turn, then NPInter.
    dataset = Z_NPZ_PATH
    logger.debug("Classification is about to start ... ")
    for rpi_size in (369, 488, 1807, 2241):
        classify_and_print_RPI(rpi_size, dataset)
    classify_and_print_NPInter(dataset)

Classification is about to start ... 
Dataset : npz/Z_RPI369.npz
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.1min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.1min remaining:   44.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.1min finished
len : 148 - y_test : [0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0.
 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1.
 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0.
 1. 1. 0. 0.] - y_test[:, 1] : 
len : 148 - y_test_predict : [[0.63927307 0.36072693]
 [0.62432699 0.37567301]
 [0.60001089 0.39998912]
 [0.56712291 0.43287709]
 [0.33984162 0.66015838]
 [0.29686166 0.70313834]
 [0.52972471 0.4

K_fold with 5 epoch : 0.6081081081081081
[save_best_output]best_score_so_far : 0.6081081081081081
Dataset : npz/Z_RPI488.npz
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   16.8s remaining:   25.3s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   17.0s remaining:   11.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.5s finished
len : 98 - y_test : [1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0.
 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0.
 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0.
 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0.
 0. 1.] - y_test[:, 1] : 
len : 98 - y_test_predict : [[0.13008367 0.86991633]
 [0.84491632 0.15508369]
 [0.49069182 0.50930818]
 [0.13278373 0.86721627]
 [0.78473597 0.21526402]
 [0.67522148 0.32477851]
 [0.13001855 0.86998145]
 [0.82707932 0.17292067]
 [0.20801111 0.79198889]
 [0.24197218 0.75802782]
 [0.12560822 0.87439178]
 [0.12754883 0.87245117]
 [0.82408818 0.17591

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.8min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.8min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min finished
len : 648 - y_test : [0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0.
 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1.
 0. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 0.
 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0.
 0. 0. 0

{'Voting-Classifier': array([0.97222222, 0.98611111, 0.95486111, 0.96467391, 0.94388997,
       0.99349923])}
GridSearchCV를 이용한 최적 매개변수 점수 ==> 0.958
GridSearchCV를 이용한 최적 매개변수 ==> {'gbc__learning_rate': 0.05, 'gbc__max_depth': 6, 'gbc__min_samples_leaf': 3, 'gbc__min_samples_split': 2, 'gbc__n_estimators': 100, 'rfc__max_depth': 6, 'rfc__max_leaf_nodes': 10, 'rfc__min_samples_leaf': 8, 'rfc__min_samples_split': 20, 'rfc__n_estimators': 30, 'svc__C': 0.1, 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'svc__random_state': 2, 'xgb__C': 0.1, 'xgb__gamma': 0.1, 'xgb__kernel': 'rbf', 'xgb__learning_rate': 0.1, 'xgb__max_depth': 100, 'xgb__n_estimators': 100, 'xgb__num_iterations': 1000, 'xgb__random_state': 2}
GridSearchCV를 이용한 test점수 ==> 0.972
GridSearchCV를 이용한 최고 성능 모델 ==> 
VotingClassifier(estimators=[('rfc',
                              RandomForestClassifier(max_depth=6,
                                                     max_leaf_nodes=10,
                                                  

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  4.0min remaining:  6.0min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  4.1min remaining:  2.7min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.1min finished
len : 897 - y_test : [1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0.
 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0.
 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1.
 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0.
 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1.
 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1.
 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1.
 1. 0

{'Voting-Classifier': array([0.83500557, 0.83371824, 0.8362069 , 0.82608696, 0.66974516,
       0.90906467])}
GridSearchCV를 이용한 최적 매개변수 점수 ==> 0.815
GridSearchCV를 이용한 최적 매개변수 ==> {'gbc__learning_rate': 0.05, 'gbc__max_depth': 6, 'gbc__min_samples_leaf': 3, 'gbc__min_samples_split': 2, 'gbc__n_estimators': 100, 'rfc__max_depth': 6, 'rfc__max_leaf_nodes': 10, 'rfc__min_samples_leaf': 8, 'rfc__min_samples_split': 20, 'rfc__n_estimators': 30, 'svc__C': 0.1, 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'svc__random_state': 2, 'xgb__C': 0.1, 'xgb__gamma': 0.1, 'xgb__kernel': 'rbf', 'xgb__learning_rate': 0.1, 'xgb__max_depth': 100, 'xgb__n_estimators': 100, 'xgb__num_iterations': 1000, 'xgb__random_state': 2}
GridSearchCV를 이용한 test점수 ==> 0.835
GridSearchCV를 이용한 최고 성능 모델 ==> 
VotingClassifier(estimators=[('rfc',
                              RandomForestClassifier(max_depth=6,
                                                     max_leaf_nodes=10,
                                                  

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 32.0min remaining: 48.1min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 32.2min remaining: 21.5min
