In [None]:
from hyperparams import *
from rawdata_preprocessing import *
from features import *

import Bio
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier

from sklearn.svm import SVC
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

from xgboost import plot_importance
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, auc, confusion_matrix, roc_curve
import math

#from warnings import filterwarnings
#filterwarnings('ignore')

from Logger import *
from save_best_output import save_best_output

# Per-run metric storage: integer keys 0..9 hold the metric vector of each
# run, and "mean" holds the running average across runs.
model_metrics = {run_index: [] for run_index in range(10)}
model_metrics["mean"] = []

# Number of folds handed to GridSearchCV as its `cv` argument.
KFOLD_TIME = 5

# Shuffled 3-fold splitter with a fixed seed.
cv = KFold(n_splits=3, shuffle=True, random_state=5)

def calc_metrics(y_label, y_proba):
    """Compute binary-classification metrics from labels and scores.

    Scores are thresholded at 0.5 to obtain hard 0/1 predictions for the
    confusion-matrix based metrics; the raw scores are used for the ROC/AUC.

    Args:
        y_label: Sequence of ground-truth class labels, expected to be 0/1
            (the 0.5 threshold below always yields 0/1 predictions).
        y_proba: Sequence of scores/probabilities for the positive class,
            same length as ``y_label``.

    Returns:
        Tuple ``(Acc, Sn, Sp, Pre, MCC, AUC)``: accuracy, sensitivity
        (TP / P), specificity (TN / N), precision (TP / (TP + FP)),
        Matthews correlation coefficient and area under the ROC curve.
        Ratios whose denominator is zero are reported as 0.
    """
    y_pred = [1 if x >= 0.5 else 0 for x in y_proba]
    # Cast to int so float-encoded labels (0.0 / 1.0) match the fixed
    # integer label set passed below.
    y_true = np.asarray(y_label).astype(int)

    # labels=[0, 1] pins the matrix to 2x2. Without it, data in which only
    # one class occurs yields a 1x1 matrix and the indexing fails.
    TN, FP, FN, TP = (
        float(v) for v in confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    )

    P = TP + FN
    N = TN + FP
    Sn = TP / P if P > 0 else 0
    Sp = TN / N if N > 0 else 0
    Acc = (TP + TN) / (P + N) if (P + N) > 0 else 0
    Pre = TP / (TP + FP) if (TP + FP) > 0 else 0

    # MCC is left at 0 when any marginal count is zero (denominator == 0).
    MCC = 0
    tmp = math.sqrt((TP + FP) * (TP + FN)) * math.sqrt((TN + FP) * (TN + FN))
    if tmp != 0:
        MCC = (TP * TN - FP * FN) / tmp

    fpr, tpr, thresholds = roc_curve(y_label, y_proba)
    AUC = auc(fpr, tpr)
    return Acc, Sn, Sp, Pre, MCC, AUC

def Voting_Classifying(X, y, param_grid) :
    """Grid-search a soft-voting ensemble and record its test metrics.

    The ensemble combines LightGBM, random forest, SVC, gradient boosting
    and XGBoost classifiers. The data is split 80/20 (stratified on ``y``),
    the grid search is fitted on the training part with ``KFOLD_TIME``
    folds, and the fitted search is scored on the held-out part.

    Side effects:
        Reads the module-level run index ``i`` and writes the metric vector
        of this call to ``model_metrics[i]``; ``model_metrics["mean"]`` is
        updated as a running average weighted by ``i``.
        NOTE(review): when this function is called more than once for the
        same ``i`` (e.g. for several datasets), later calls overwrite
        ``model_metrics[i]`` and update the mean again -- confirm that this
        is intended.

    Args:
        X: Feature table (converted to a NumPy array after splitting).
        y: Target labels aligned with ``X``.
        param_grid: Parameter grid for ``GridSearchCV``; keys use the
            ``<estimator name>__<param>`` form of the voting ensemble.

    Returns:
        Tuple ``(score, best_params, best_estimator)`` where ``score`` is
        ``GridSearchCV.score`` on the held-out split.
    """
    RFC_best = RandomForestClassifier(random_state=None)
    SVMC_best = SVC(probability=True, random_state=None)
    GBC_best = GradientBoostingClassifier(random_state=None)
    # `probability` is an SVC option, not an XGBoost one; XGBClassifier
    # provides predict_proba without it, so it is not passed here.
    XGBC_best = XGBClassifier(random_state=2)
    LGBM_best = LGBMClassifier(random_state=None)

    VC = VotingClassifier(
        estimators=[
            ('lgbm', LGBM_best),
            ('rfc', RFC_best),
            ('svc', SVMC_best),
            ('gbc', GBC_best),
            ('xgb', XGBC_best),
        ],
        voting='soft', n_jobs=-1, verbose=10)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=None, stratify=y)
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    gsVC = GridSearchCV(estimator=VC, param_grid=param_grid, cv=KFOLD_TIME, n_jobs=-1, verbose=10)

    gsVC = gsVC.fit(X_train, y_train)
    score = gsVC.score(X_test, y_test)
    y_test_predict = gsVC.predict_proba(X_test)

    # Column 1 of predict_proba is the score of the positive class.
    run_metrics = np.array(calc_metrics(y_test, y_test_predict[:, 1]))
    model_metrics[i] = run_metrics

    previous_mean = model_metrics["mean"]
    if i == 0 or len(previous_mean) == 0:
        # Copy so that later mean updates can never alias the stored run.
        model_metrics["mean"] = run_metrics.copy()
    else:
        # Running average over every metric (all six, including AUC),
        # written to a new array rather than mutated in place.
        model_metrics["mean"] = (np.asarray(previous_mean) * i + run_metrics) / (i + 1)

    logger.debug('GridSearchCV를 이용한 최적 매개변수 점수 ==> {:.3f}'.format(gsVC.best_score_))
    logger.debug('GridSearchCV를 이용한 최적 매개변수 ==> {}'.format(gsVC.best_params_))
    logger.warning('GridSearchCV를 이용한 test점수 ==> {:.3f}'.format(score))
    logger.debug('GridSearchCV를 이용한 최고 성능 모델 ==> \n{}'.format(gsVC.best_estimator_))
    logger.debug('GridSearchCV를 이용한 최고 score set ==> \n{}'.format(model_metrics[i]))

    return score, gsVC.best_params_, gsVC.best_estimator_

def classify(npz_path, param_grid) :
    """Load a feature archive and run the voting-ensemble grid search on it.

    The ``.npz`` archive must contain the arrays ``XP``, ``XR`` and ``Y``.
    They are concatenated column-wise into one table whose last column is
    ``target``. Datasets whose file name starts with ``STRUCT_`` use the
    structural feature-name lists; all others use the plain ones.

    Args:
        npz_path: Path to the ``.npz`` archive.
        param_grid: Parameter grid forwarded to ``Voting_Classifying``.

    Returns:
        Whatever ``Voting_Classifying`` returns for the loaded data.
    """
    import os

    logger.debug("Dataset : {}".format(npz_path))

    # The context manager closes the archive's file handle once the arrays
    # have been read into memory.
    with np.load(npz_path) as mydata:
        XP = mydata['XP']
        XR = mydata['XR']
        Y = mydata['Y']

    # The fixed-offset test is kept for backward compatibility (it assumes
    # a 4-character directory prefix such as 'npz/'); the basename test
    # makes detection work for any directory layout.
    is_struct = (npz_path[4:4+7] == 'STRUCT_'
                 or os.path.basename(npz_path).startswith('STRUCT_'))
    if is_struct:
        column_names = p_struct_feature_names + r_struct_feature_names + ['target']
    else:
        column_names = p_feature_names + xgb_r_feature_names + ['target']

    combined_pd = pd.DataFrame(data=np.c_[np.c_[XP, XR], Y], columns=column_names)

    features = list(combined_pd.columns[:-1])
    X = combined_pd[features]
    y = combined_pd['target']

    return Voting_Classifying(X, y, param_grid)
    #return RF_Classifying(X, y, KFOLD_TIME, param_grid)
    #return XGB_Classifying(X, y, KFOLD_TIME, param_grid)
    #return LGBM_Classifying(X, y, KFOLD_TIME, param_grid)

def classify_and_print_NPInter(dataset):
    """Classify the NPInter dataset, log the test score and save the result.

    Args:
        dataset: Mapping that provides the archive path under ``"NPInter"``.
    """
    logger.debug("=======================================================")
    result = classify(dataset["NPInter"], PARAM_GRID["NPInter"])
    score, params, model = result
    logger.warning("K_fold with {0} epoch : {1}".format(KFOLD_TIME, score))
    # The estimator repr is flattened to a single line before saving.
    model_text = str(model).replace('\n', '')
    save_best_output(dataset["NPInter"], score, params, model_text)
    
def classify_and_print_RPI(size, dataset):
    """Classify one RPI dataset, log the test score and save the result.

    Args:
        size: Key selecting the RPI dataset (and its parameter grid).
        dataset: Mapping that provides archive paths under ``"RPI"``.
    """
    logger.debug("=======================================================")
    result = classify(dataset["RPI"][size], PARAM_GRID["RPI"][size])
    score, params, model = result
    logger.warning("K_fold with {0} epoch : {1}".format(KFOLD_TIME, score))

    # The estimator repr is flattened to a single line before saving.
    model_text = str(model).replace('\n', '')
    save_best_output(dataset["RPI"][size], score, params, model_text)

# `i` stays a module-level name: Voting_Classifying reads it as the index
# of the current run when it stores metrics.
i = 0
if __name__ == "__main__":
    # RPI datasets evaluated in each run. Not run at present: RPI 369,
    # RPI 2241 and NPInter (via classify_and_print_NPInter).
    rpi_sizes = (488, 1807)
    for i in range(10):
        logger.debug('{}th try'.format(i + 1))
        dataset = NPZ_PATH_STRUCT
        logger.debug("Classification is about to start ... ")
        for size in rpi_sizes:
            classify_and_print_RPI(size, dataset)
    print('model_metrics : {}'.format(model_metrics))

1th try
Classification is about to start ... 
Dataset : npz/STRUCT_RPI488.npz
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   16.2s remaining:   24.3s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   16.2s remaining:   10.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.6s finished
GridSearchCV를 이용한 최적 매개변수 점수 ==> 0.846
GridSearchCV를 이용한 최적 매개변수 ==> {'gbc__learning_rate': 0.1, 'gbc__max_depth': 100, 'gbc__max_features': 250, 'gbc__min_samples_leaf': 3, 'gbc__min_samples_split': 10, 'gbc__n_estimators': 60, 'lgbm__boosting': 'dart', 'lgbm__learning_rate': 0.01, 'lgbm__max_depth': 100, 'lgbm__num_iterations': 1000, 'rfc__max_depth': 6, 'rfc__max_leaf_nodes': 10, 'rfc__min_samples_leaf': 7, 'rfc__min_samples_split': 13, 'rfc__n_estimators': 125, 'svc__C': 0.01, 'svc__gamma': 0.01, 'svc__kernel': 'linear', 'xgb__boosting': 'gblinear', 'xgb__learning_rate': 0.075, 'xgb__max_depth': 100, 'xgb__num_iterations': 1000}
GridSearchCV를 이용한 test점수 ==> 0.908
GridSearch

Fitting 5 folds for each of 1 candidates, totalling 5 fits
