In [None]:
### 劃分資料集 ###
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler


# Load the LASSO-selected feature table; 'mace' is the label column.
data = pd.read_csv('lasso.csv')

X = data.drop('mace', axis=1)
y = data['mace']

# Hold out 10% of the rows as the test set. The split is stratified on the
# label so the class ratio is the same in the training and test parts; this
# matters because the classes are imbalanced enough to need undersampling below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Save the training set before any balancing.
unbalanced_train_data = X_train.assign(mace=y_train)
unbalanced_train_data.to_csv('unbalanced_train_lasso.csv', index=False)

# Balance the training set by random undersampling. The sampler is seeded so
# the balanced CSV is identical every time this cell is re-run.
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Save the balanced training set. The DataFrame wrapper is kept so this also
# works if fit_resample hands back plain arrays instead of pandas objects.
resampled_data = pd.DataFrame(X_train_resampled, columns=X.columns)
resampled_data['mace'] = y_train_resampled
resampled_data.to_csv('balanced_train_lasso.csv', index=False)

# Save the untouched test set (never resampled).
test_data = X_test.assign(mace=y_test)
test_data.to_csv('test_lasso.csv', index=False)

In [None]:
## SVM五折交叉&測試集驗證 ##
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(train_file, test_file):
    """Cross-validate an SVM on a training CSV, then score it on a test CSV.

    Both CSV files must contain a 'mace' column, which is used as the label;
    every other column is used as a feature.

    Parameters
    ----------
    train_file : str
        Path of the training CSV.
    test_file : str
        Path of the test CSV.

    Returns
    -------
    None
        The 5-fold averages and the test-set metrics are printed.
    """
    # Local import: cross_validate scores every metric on the same folds in a
    # single pass instead of re-running the cross-validation once per metric.
    from sklearn.model_selection import cross_validate

    train_data = pd.read_csv(train_file)
    X_train = train_data.drop('mace', axis=1)
    y_train = train_data['mace']

    # random_state makes the probability calibration of SVC repeatable.
    svm_model = SVC(probability=True, random_state=42)

    # Seeded so the shuffled folds are the same on every run; an unseeded
    # shuffling splitter yields different folds each time it is iterated.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    cv_results = cross_validate(
        svm_model, X_train, y_train, cv=cv,
        scoring=['precision', 'recall', 'f1', 'roc_auc'],
    )

    # Print cross-validation results
    print("Average Precision: %0.4f" % cv_results['test_precision'].mean())
    print("Average Recall: %0.4f" % cv_results['test_recall'].mean())
    print("Average F1: %0.4f" % cv_results['test_f1'].mean())
    print("Average ROC AUC: %0.4f" % cv_results['test_roc_auc'].mean())

    # Refit on the full training set before touching the test set.
    svm_model.fit(X_train, y_train)

    test_data = pd.read_csv(test_file)
    X_test = test_data.drop('mace', axis=1)
    y_test = test_data['mace']

    y_pred = svm_model.predict(X_test)
    # Column 1 holds the predicted probability of the second class in classes_.
    y_pred_proba = svm_model.predict_proba(X_test)[:, 1]

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print("_____Test Set Evaluation Metrics_____")
    print("Precision: %0.4f" % precision)
    print("Recall: %0.4f" % recall)
    print("F1: %0.4f" % f1)
    print("ROC AUC: %0.4f" % roc_auc)

# evaluate_model('bert_balanced.csv', 'test_bert.csv')


In [None]:
## XGBoost 五折交叉&測試集驗證 ##
def evaluate_model(train_file, test_file):
    """Cross-validate an XGBoost classifier on a training CSV, then score it on a test CSV.

    Both CSV files must contain a 'mace' column, which is used as the label;
    every other column is used as a feature.

    Running this cell rebinds the name ``evaluate_model``, replacing the SVM
    version defined in the previous cell.

    NOTE(review): XGBClassifier is not imported in any cell visible here, and
    pd / StratifiedKFold / the metric functions come from the SVM cell's
    imports -- confirm ``from xgboost import XGBClassifier`` has been run
    before calling this function.

    Parameters
    ----------
    train_file : str
        Path of the training CSV.
    test_file : str
        Path of the test CSV.

    Returns
    -------
    None
        The 5-fold averages and the test-set metrics are printed.
    """
    # Local import: cross_validate scores every metric on the same folds in a
    # single pass instead of re-running the cross-validation once per metric.
    from sklearn.model_selection import cross_validate

    train_data = pd.read_csv(train_file)
    X_train = train_data.drop('mace', axis=1)
    y_train = train_data['mace']

    xgb_model = XGBClassifier()
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    cv_results = cross_validate(
        xgb_model, X_train, y_train, cv=cv,
        scoring=['precision', 'recall', 'f1', 'roc_auc'],
    )

    print("Average Precision: %0.4f" % cv_results['test_precision'].mean())
    print("Average Recall: %0.4f" % cv_results['test_recall'].mean())
    print("Average F1: %0.4f" % cv_results['test_f1'].mean())
    print("Average ROC AUC: %0.4f" % cv_results['test_roc_auc'].mean())

    # Refit on the full training set before touching the test set.
    xgb_model.fit(X_train, y_train)

    test_data = pd.read_csv(test_file)
    X_test = test_data.drop('mace', axis=1)
    y_test = test_data['mace']

    y_pred = xgb_model.predict(X_test)
    # ROC AUC must be computed from scores, not hard 0/1 predictions; with hard
    # labels the curve collapses to a single threshold. Column 1 holds the
    # predicted probability of the second class in classes_.
    y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print("_____Test Set Evaluation Metrics_____")
    print("Precision: %0.4f" % precision)
    print("Recall: %0.4f" % recall)
    print("F1: %0.4f" % f1)
    print("ROC AUC: %0.4f" % roc_auc)
    
#evaluate_model('bert_balanced.csv', 'test_bert.csv')

In [None]:
## GEV-NN五折交叉驗證 ##
import numpy as np
from Gev_network import MLP_AE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
import pandas as pd

import os

import warnings

warnings.filterwarnings("ignore")

# Seed NumPy's global random number generator. Only NumPy is seeded here;
# no other library's generator is touched by these two lines.
seed = 150
np.random.seed(seed)


def one_hot_encoding(train, variable):
    """Replace one column of a DataFrame by 0/1 indicator columns.

    A column with exactly two distinct values becomes a single indicator
    (1 where the column equals the first value encountered). Any other column
    becomes one indicator per distinct value. Indicator columns are named
    ``str(value)`` and are appended after the remaining columns.

    The input frame is left untouched; a new frame is returned.

    NOTE: indicator names carry no column prefix, so a value whose string form
    equals an existing column name overwrites that column.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing ``variable``.
    variable : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train`` without ``variable`` and with the indicator columns.
    """
    column = train[variable]
    values = column.unique()

    # Two distinct values need only one indicator; otherwise one per value.
    levels = values[:1] if len(values) == 2 else values

    # Compute every indicator from the untouched source column first, so an
    # indicator whose name equals `variable` cannot corrupt later comparisons.
    indicators = {str(level): np.where(column == level, 1, 0) for level in levels}

    # drop() returns a new frame, so the caller's frame is never modified.
    encoded = train.drop(variable, axis=1)
    for name, flags in indicators.items():
        encoded[name] = flags

    return encoded


def training (activation, loss_weight, data_name):
    """Run 5-fold stratified cross-validation of the GEV-NN model on one CSV.

    The last column of ``<data_name>.csv`` is taken as the label and all
    preceding columns as features. String-typed feature columns are expanded
    with ``one_hot_encoding``. In every fold one ``MLP_AE`` model is built per
    entry of ``loss_weight`` and evaluated on the held-out part.

    Parameters
    ----------
    activation : str
        Forwarded to ``MLP_AE`` and used in the model file name.
    loss_weight : iterable
        Loss weights to try; each one is forwarded to ``MLP_AE`` on its own.
    data_name : str
        CSV base name, without the '.csv' suffix.

    Returns
    -------
    dict
        Lists keyed by 'activation', 'data', 'weight', 'acc', 'precision',
        'recall', 'f-score' and 'auc_score', one entry per trained model.
        The same rows are appended to 'GEV_MODEL/result_final.csv'.
    """
    res = {'activation': [], 'data': [], 'weight': [], 'acc': [],  'precision': [],
             'recall': [], 'f-score': [] ,'auc_score': []}

    data = pd.read_csv(data_name+'.csv')

    # Split the label off BEFORE encoding. one_hot_encoding appends its
    # indicator columns at the end of the frame, so encoding the whole table
    # would push the label out of the last position and an indicator column
    # would be read as the label.
    features = data.iloc[:, :-1].copy()
    label = data.iloc[:, -1]

    for var in list(features):
        if features[var].dtype == 'object':
            features = one_hot_encoding(features, var)

    if label.dtype == 'object':
        # A string label is binarised the way one_hot_encoding treats a
        # two-valued column: 1 marks the first value encountered.
        first_value = label.unique()[0]
        label = pd.Series(np.where(label == first_value, 1, 0),
                          index=label.index, name=label.name)

    # Re-attach the label as the last column so the array layout (and dtype)
    # matches what the slicing below expects.
    data = np.asarray(pd.concat([features, label], axis=1))

    length = data.shape[1] - 1
    X, Y = data[:, 0:length], data[:, length]

    skf = StratifiedKFold(n_splits=5)

    # Running counter of trained models; used in file names and as MLP_AE's `rand`.
    data_index = 1
    batch_size = 16

    for train_index, test_index in skf.split(X, Y):

        # Fit the scaler on the training part of the fold only; fitting it on
        # all rows would leak the held-out rows' min/max into training.
        scaler = MinMaxScaler().fit(X[train_index])
        trainX, testX = scaler.transform(X[train_index]), scaler.transform(X[test_index])
        trainY, testY = Y[train_index], Y[test_index]

        for weight in loss_weight:
            model = MLP_AE(trainX=trainX, trainY=trainY, epoch_number=2000, batch_size=batch_size, learning_rate=0.001,
                           encoder=[32, 16, 8],decoder=[16, 32], sofnn=[32], early_stoppting_patience=200,
                           neurons=[32], activation=activation, reg_lambda=0.00001,
                           loss_weigth=weight, rand=data_index)

            final_model = model.MLP_AE('GEV_MODEL/test1/model_ae_%s_%s' %(data_name, data_index),
                                       'GEV_MODEL/test1/model_%s_%s_%s_%s.tf' %(data_name, activation, data_index, weight))

            pred_Y, true_Y = model.predict(testX, testY, final_model)

            recall, auc_score, f_score, acc, precision = model.model_evaluation(pred_Y, true_Y)

            print("%s: 'recall' %0.4f, 'AUC', %0.4f, 'F', %0.4f, 'ACC', %0.4f, "
                  "'precision', %0.4f" %(data_name, recall, auc_score, f_score, acc, precision))

            res['activation'].append(activation)
            res['data'].append(data_name)
            res['precision'].append(precision)
            res['auc_score'].append(auc_score)
            res['acc'].append(acc)
            res['f-score'].append(f_score)
            res['recall'].append(recall)
            res['weight'].append(weight)

            data_index = data_index + 1

    res_brier = pd.DataFrame.from_dict(res)

    # Append to the shared results file, writing the header only when the file
    # is first created so repeated runs do not scatter header rows through it.
    result_path = 'GEV_MODEL/' + 'result_final.csv'
    res_brier.to_csv(result_path, mode='a', encoding='euc-kr', index=False,
                     header=not os.path.exists(result_path))

    return res


# Loss weights to evaluate; training() builds one model per weight in every fold.
loss_weight= [0.05]

# Base names (without the '.csv' suffix) of the datasets to run.
# NOTE(review): the only entry is an empty string, so training() would try to
# read '.csv' -- put the real dataset base name here before running.
data_names = ['']
# result_gev is rebound on every iteration, so after the loop it holds the
# results of the last dataset only.
for data_name in data_names:
    result_gev = training(activation='gev',loss_weight=loss_weight,data_name=data_name)

In [None]:
## GEV-NN測試集驗證 ##

from Gev_network import MLP_AE
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import load_model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


def one_hot_encoding(data, variable):
    """Replace one column of a DataFrame by 0/1 indicator columns.

    A column with exactly two distinct values becomes a single indicator
    (1 where the column equals the first value encountered). Any other column
    becomes one indicator per distinct value. Indicator columns are named
    ``str(value)`` and are appended after the remaining columns.

    The input frame is left untouched; a new frame is returned.

    NOTE: indicator names carry no column prefix, so a value whose string form
    equals an existing column name overwrites that column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``variable``.
    variable : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without ``variable`` and with the indicator columns.
    """
    column = data[variable]
    values = column.unique()

    # Two distinct values need only one indicator; otherwise one per value.
    levels = values[:1] if len(values) == 2 else values

    # Compute every indicator from the untouched source column first, so an
    # indicator whose name equals `variable` cannot corrupt later comparisons.
    indicators = {str(level): np.where(column == level, 1, 0) for level in levels}

    # drop() returns a new frame, so the caller's frame is never modified.
    encoded = data.drop(variable, axis=1)
    for name, flags in indicators.items():
        encoded[name] = flags
    return encoded

def training(activation, loss_weight, data_name):
    """Fit one GEV-NN model on a whole CSV and report its fit on that same data.

    The last column of ``<data_name>.csv`` is taken as the label and all
    preceding columns as features. String-typed feature columns are expanded
    with ``one_hot_encoding``. The metrics returned here are computed on the
    very rows the model was trained on, so they describe the training fit,
    not generalisation.

    Running this cell rebinds the name ``training``, replacing the
    cross-validation version defined in the previous cell.

    Parameters
    ----------
    activation : str
        Forwarded to ``MLP_AE`` and used in the model file name.
    loss_weight : float or one-element list/tuple
        Loss weight forwarded to ``MLP_AE``. A one-element sequence such as
        ``[0.05]`` is unwrapped to its scalar, matching how the
        cross-validation cell hands each weight to ``MLP_AE``.
    data_name : str
        CSV base name, without the '.csv' suffix.

    Returns
    -------
    dict
        One-entry lists keyed by 'activation', 'data', 'weight', 'acc',
        'precision', 'recall', 'f-score' and 'auc_score'. The same row is
        appended to 'GEV_MODEL/result_final.csv'.
    """
    import os  # local import: this cell does not import os itself

    res = {'activation': [], 'data': [], 'weight': [], 'acc': [],  'precision': [],
           'recall': [], 'f-score': [], 'auc_score': []}

    # The caller in this cell passes [0.05]; hand MLP_AE the scalar inside.
    weight = loss_weight
    if isinstance(loss_weight, (list, tuple)) and len(loss_weight) == 1:
        weight = loss_weight[0]

    data = pd.read_csv(data_name + '.csv')

    # Split the label off BEFORE encoding. one_hot_encoding appends its
    # indicator columns at the end of the frame, so encoding the whole table
    # would push the label out of the last position and an indicator column
    # would be read as the label.
    features = data.iloc[:, :-1].copy()
    label = data.iloc[:, -1]

    for var in list(features):
        if features[var].dtype == 'object':
            features = one_hot_encoding(features, var)

    if label.dtype == 'object':
        # A string label is binarised the way one_hot_encoding treats a
        # two-valued column: 1 marks the first value encountered.
        first_value = label.unique()[0]
        label = pd.Series(np.where(label == first_value, 1, 0),
                          index=label.index, name=label.name)

    # Re-attach the label as the last column so the array layout (and dtype)
    # matches what the slicing below expects.
    data = np.asarray(pd.concat([features, label], axis=1))
    length = data.shape[1] - 1
    X, Y = data[:, :length], data[:, length]

    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X)

    model = MLP_AE(trainX=scaled_X, trainY=Y, epoch_number=2000, batch_size=16, learning_rate=0.001,
                   encoder=[32, 16, 8], decoder=[16, 32], sofnn=[32], early_stoppting_patience=200,
                   neurons=[32], activation=activation, reg_lambda=0.00001,
                   loss_weigth=weight, rand=1)

    final_model = model.MLP_AE('GEV_MODEL/new_test/model_ae_%s' % data_name,
                               'GEV_MODEL/new_test/model_%s_%s.tf' % (data_name, activation))

    # Evaluated on the training rows themselves (see docstring).
    pred_Y, true_Y = model.predict(scaled_X, Y, final_model)

    recall, auc_score, f_score, acc, precision = model.model_evaluation(pred_Y, true_Y)

    res['activation'].append(activation)
    res['data'].append(data_name)
    res['precision'].append(precision)
    res['auc_score'].append(auc_score)
    res['acc'].append(acc)
    res['f-score'].append(f_score)
    res['recall'].append(recall)
    res['weight'].append(weight)

    res_brier = pd.DataFrame.from_dict(res)

    # Append to the shared results file, writing the header only when the file
    # is first created so repeated runs do not scatter header rows through it.
    result_path = 'GEV_MODEL/' + 'result_final.csv'
    res_brier.to_csv(result_path, mode='a', encoding='euc-kr', index=False,
                     header=not os.path.exists(result_path))

    return res

# Fit the model on 'balanced_structureAll.csv' with the GEV activation.
# The loss weight is handed over as a one-element list here, whereas the
# cross-validation cell gives MLP_AE each weight as a scalar.
loss_weight = [0.05]
training('gev', loss_weight, 'balanced_structureAll')


def one_hot_encoding(data, variable):
    """Replace one column of a DataFrame by 0/1 indicator columns.

    This repeats the definition given earlier in the same cell and rebinds the
    name when the cell runs.

    A column with exactly two distinct values becomes a single indicator
    (1 where the column equals the first value encountered). Any other column
    becomes one indicator per distinct value. Indicator columns are named
    ``str(value)`` and are appended after the remaining columns.

    The input frame is left untouched; a new frame is returned.

    NOTE: indicator names carry no column prefix, so a value whose string form
    equals an existing column name overwrites that column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``variable``.
    variable : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without ``variable`` and with the indicator columns.
    """
    column = data[variable]
    values = column.unique()

    # Two distinct values need only one indicator; otherwise one per value.
    levels = values[:1] if len(values) == 2 else values

    # Compute every indicator from the untouched source column first, so an
    # indicator whose name equals `variable` cannot corrupt later comparisons.
    indicators = {str(level): np.where(column == level, 1, 0) for level in levels}

    # drop() returns a new frame, so the caller's frame is never modified.
    encoded = data.drop(variable, axis=1)
    for name, flags in indicators.items():
        encoded[name] = flags
    return encoded

# Load the held-out test set; 'mace' is the label column.
new_test_data = pd.read_csv('test_structureAll.csv')

Y_test = new_test_data['mace']
X_test = new_test_data.drop('mace', axis=1)

for var in list(X_test):
    if X_test[var].dtype == 'object':
        X_test = one_hot_encoding(X_test, var)

X_test = np.asarray(X_test)

# The model was trained on features min-max scaled with the TRAINING data's
# ranges, so the test features must go through that same scaling. Fitting a
# new scaler on the test set would map them onto a different range than the
# model saw. training() keeps its scaler local, so it is rebuilt here from the
# training CSV exactly as training() does: every column but the last is a feature.
# NOTE(review): if either file contains string columns, confirm that the
# indicator columns come out in the same order for both files.
train_reference = pd.read_csv('balanced_structureAll.csv')
X_train_reference = train_reference.iloc[:, :-1].copy()
for var in list(X_train_reference):
    if X_train_reference[var].dtype == 'object':
        X_train_reference = one_hot_encoding(X_train_reference, var)
X_train_reference = np.asarray(X_train_reference)

if X_train_reference.shape[1] != X_test.shape[1]:
    raise ValueError(
        "Train/test feature counts differ after preprocessing: %d vs %d"
        % (X_train_reference.shape[1], X_test.shape[1])
    )

scaler = MinMaxScaler()
scaler = scaler.fit(X_train_reference)
scaled_X_test = scaler.transform(X_test)

print("Shape of X_test after preprocessing: ", X_test.shape)

# Path produced by training('gev', ..., 'balanced_structureAll') above.
model_path = 'GEV_MODEL/new_test/model_balanced_structureAll_gev.tf'
final_model = load_model(model_path)

# The features are supplied twice and the second returned array is used as
# the class probability.
pred_Y_list = final_model.predict([scaled_X_test, scaled_X_test])

pred_Y_prob = pred_Y_list[1]

pred_Y_prob = np.asarray(pred_Y_prob).flatten()

# Hard labels at the 0.5 threshold; ROC AUC below uses the raw probabilities.
pred_Y = np.where(pred_Y_prob >= 0.5, 1, 0)

accuracy = accuracy_score(Y_test, pred_Y)
precision = precision_score(Y_test, pred_Y)
recall = recall_score(Y_test, pred_Y)
f1 = f1_score(Y_test, pred_Y)
roc_auc = roc_auc_score(Y_test, pred_Y_prob)

print("_____Test Set Evaluation Metrics_____")
print("Accuracy: %0.4f" % accuracy)
print("Precision: %0.4f" % precision)
print("Recall: %0.4f" % recall)
print("F1 Score: %0.4f" % f1)
print("ROC AUC: %0.4f" % roc_auc)
