In [9]:
import os, psutil
import numpy as np
import scipy.io as sio
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV,KFold
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgb
from sklearn.svm import SVR
import sklearn.preprocessing
from pycaret.regression import *
import tensorflow as tf
import xgboost as xgb
import smogn
from keras import backend as K
import matplotlib.pyplot as plt

def loadMatlabData():
    """Load the train/test arrays stored in ``DASE/daseData.mat``.

    The file is looked up relative to the current working directory.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` holding the values of
        the variables with those names in the MAT-file.
    """
    mat_path = os.getcwd() + '/DASE/daseData.mat'
    contents = sio.loadmat(mat_path)

    # Variables are read in the same order in which they are returned.
    wanted = ('x_train', 'y_train', 'x_test', 'y_test')
    return tuple(contents[name] for name in wanted)

def loadxlsxData():
    """Read ``DASE/KOBIO_data_final(changed).xlsx`` into a DataFrame.

    The workbook is looked up relative to the current working directory and
    parsed with ``pandas.read_excel`` using its default arguments.

    Returns:
        The DataFrame produced by ``pandas.read_excel``.
    """
    workbook_path = os.getcwd() + '/DASE/KOBIO_data_final(changed).xlsx'
    return pd.read_excel(workbook_path)

def loadbestPred(i):
    """Load the saved result file ``DASE/result/best_pred<i+1>.mat``.

    Args:
        i: Zero-based index; the file number on disk is ``i + 1``.

    Returns:
        Tuple ``(x_train, x_valid, x_test, y_train, y_valid, y_test, y_pred)``
        taken from the variables with those names in the MAT-file.
    """
    mat_path = os.getcwd() + '/DASE/result/best_pred' + str(i + 1) + '.mat'
    contents = sio.loadmat(mat_path)

    # Pull every variable out first, then hand them back grouped by x / y.
    stored_names = ('x_train', 'y_train', 'x_test', 'y_test',
                    'y_pred', 'x_valid', 'y_valid')
    loaded = {name: contents[name] for name in stored_names}

    return (
        loaded['x_train'], loaded['x_valid'], loaded['x_test'],
        loaded['y_train'], loaded['y_valid'], loaded['y_test'],
        loaded['y_pred'],
    )
        
# Continuous columns that get rescaled; every other column is left untouched.
_SCALED_COLUMNS = ['AGE','BWT','HGT','BMI','CIGP','CIGY','PTGA','PHGA','ESR','CRP','BTIME','CDOSE','RF','ACCP']


def _scaleFrames(x_train, *other_frames):
    """Scale ``_SCALED_COLUMNS`` of every frame with training-set statistics.

    A ``StandardScaler`` followed by a ``MinMaxScaler`` is applied; both are
    fitted on ``x_train`` only and then used to transform every frame.
    The frames are copied first so the caller's DataFrames are never modified.

    Returns:
        List of scaled copies, ``x_train`` first, then ``other_frames`` in order.
    """
    frames = [frame.copy() for frame in (x_train, *other_frames)]

    scalers = (
        sklearn.preprocessing.StandardScaler(),
        sklearn.preprocessing.MinMaxScaler(),
    )
    for scaler in scalers:
        # Fit before the training frame itself is transformed by this scaler.
        scaler.fit(frames[0][_SCALED_COLUMNS])
        for frame in frames:
            frame[_SCALED_COLUMNS] = scaler.transform(frame[_SCALED_COLUMNS])

    return frames


def standarizeInput(x_train, x_valid, x_test, y_train, y_valid, y_test):
    """Scale the continuous feature columns of the train/valid/test frames.

    Args:
        x_train, x_valid, x_test: DataFrames containing ``_SCALED_COLUMNS``.
        y_train, y_valid, y_test: Targets; passed through unchanged.

    Returns:
        ``(x_train, x_valid, x_test, y_train, y_valid, y_test)`` where the
        x frames are scaled copies of the inputs.
    """
    x_train, x_valid, x_test = _scaleFrames(x_train, x_valid, x_test)
    return x_train, x_valid, x_test, y_train, y_valid, y_test


def standarizeInput4(x_train, x_test, y_train, y_test):
    """Scale the continuous feature columns of the train/test frames.

    Same scaling as ``standarizeInput`` but without a validation split.

    Args:
        x_train, x_test: DataFrames containing ``_SCALED_COLUMNS``.
        y_train, y_test: Targets; passed through unchanged.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` where the x frames are scaled
        copies of the inputs.
    """
    x_train, x_test = _scaleFrames(x_train, x_test)
    return x_train, x_test, y_train, y_test

def applySMOGN(x_train, y_train):
    """Resample the training set with ``smogn.smoter`` on the 'DASE' target.

    Features and target are joined column-wise, the index is reset, and the
    combined frame is passed to ``smogn.smoter`` with the fixed settings below.

    Args:
        x_train: Feature DataFrame.
        y_train: Target with a 'DASE' column.

    Returns:
        ``(x, y)`` as NumPy arrays: the resampled features (all columns except
        'DASE') and the resampled 'DASE' values.
    """
    combined = pd.concat([x_train, y_train], axis=1).reset_index(drop=True)

    resampled = smogn.smoter(
        data=combined,
        y='DASE',
        k=5,
        pert=0.05,
        samp_method='extreme',
        rel_thres=0.9,
        rel_method='auto',
        rel_xtrm_type='low',
        rel_coef=0.9
    )

    features = resampled.drop("DASE", axis=1).values
    target = resampled["DASE"].values
    return features, target

def log_cosh_loss(y_true, y_pred):
    """Log-cosh regression loss, averaged over the last axis.

    Uses the identity ``log(cosh(e)) = e + softplus(-2e) - log(2)``, which is
    exact and does not overflow for large ``|e|``.  The previous expression
    dropped the ``e`` term and subtracted ``2*log(2)``, so it was neither
    zero at ``e == 0`` nor symmetric in ``e``.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values (same shape as ``y_true``).

    Returns:
        Tensor with the mean log-cosh error over the last axis.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtypes so the subtraction works for mixed float32/float64 inputs.
    y_true = tf.cast(y_true, y_pred.dtype)

    error = y_pred - y_true
    # A Python float constant adopts the tensor's dtype.
    loss = error + tf.math.softplus(-2.0 * error) - float(np.log(2.0))
    return tf.reduce_mean(loss, axis=-1)

def huber_loss(y_true, y_pred, delta=1.0):
    """Huber loss averaged over the last axis.

    The absolute error is split into a part capped at ``delta`` (penalised
    quadratically) and the excess above ``delta`` (penalised linearly).

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.
        delta: Threshold between the quadratic and linear regimes.

    Returns:
        Tensor with the mean Huber loss over the last axis.
    """
    abs_error = K.abs(y_true - y_pred)
    capped = K.minimum(abs_error, delta)
    # Whatever exceeds delta; zero when abs_error <= delta.
    excess = K.abs(y_true - y_pred) - capped
    per_element = 0.5 * K.square(capped) + delta * excess
    return K.mean(per_element, axis=-1)

def modelDNN(x_train, x_valid, x_test, y_train, y_valid, y_test):
    """Train a small convolutional regressor and evaluate it on the test set.

    The network is three Conv2D/AveragePooling2D/Dropout stages followed by
    three dense layers and a single linear output.  Training uses Adam with
    MSE loss, early stopping and a best-weights checkpoint.

    Early stopping and checkpointing monitor ``val_loss`` (validation data is
    supplied to ``fit``), and the best checkpoint is reloaded before
    predicting, matching what ``modelMLMerge`` does.

    Args:
        x_train, x_valid, x_test: Feature arrays; each gets one trailing axis
            added before being passed to the model.
        y_train, y_valid, y_test: Targets.

    Returns:
        Predictions of the best-checkpoint model on ``x_test``.
    """
    weights_path = os.getcwd()+'/DASE/model/model_dnn.h5'

    # NOTE(review): the Input declares rank-4 samples (111, 1, 1) while the
    # data only gets one extra trailing axis below - presumably inputs arrive
    # as (n, 111) and Keras reconciles the missing unit axis; confirm.
    inputs = tf.keras.layers.Input(shape=(111,1,1))
    x = inputs
    for stage, n_filters in enumerate((16, 32, 64), start=1):
        x = tf.keras.layers.Conv2D(filters=n_filters, kernel_size=(3,1), strides=1, activation='relu', padding='valid', name='CV'+str(stage))(x)
        x = tf.keras.layers.AveragePooling2D(pool_size=(3,1), strides=(2,1), name='AP'+str(stage))(x)
        x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Flatten(name='Flatten')(x)

    x = tf.keras.layers.Dense(128, activation='relu', name='FC0')(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(64, activation='relu', name='FC1')(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(32, activation='relu', name='FC2')(x)
    output = tf.keras.layers.Dense(1, name='Output')(x)

    model = tf.keras.models.Model(inputs=inputs, outputs=output)
    model.summary()

    # Monitor the validation loss rather than the training metric so that
    # stopping/checkpointing reflect generalisation, not fit to the train set.
    callback_list = [
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10),
            tf.keras.callbacks.ModelCheckpoint(filepath=weights_path, monitor='val_loss', mode='min', verbose=0, save_best_only=True, save_weights_only=True),
        ]

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer,loss='mse',metrics=['mae','mse'])

    x_train = tf.expand_dims(x_train, axis=-1)
    x_valid = tf.expand_dims(x_valid, axis=-1)
    model.fit(x_train, y_train, batch_size=16, epochs=300, validation_data=(x_valid,y_valid), callbacks=callback_list)

    # Predict with the best checkpoint instead of the last-epoch weights.
    model.load_weights(filepath=weights_path)

    x_test = tf.expand_dims(x_test, axis=-1)
    y_pred = model.predict(x_test)
    evaluation(y_test,y_pred)
    return y_pred
    
def modelGridSearch(x_train, x_test, y_train, y_test):
    """Grid-search an XGBoost regressor and evaluate the best estimator.

    Runs a 5-fold ``GridSearchCV`` scored by negative MSE over the grid
    below, prints the best parameters and the test metrics.

    Args:
        x_train, y_train: Data the search is fitted on.
        x_test, y_test: Data the best estimator is evaluated on.

    Returns:
        Predictions of the best estimator on ``x_test``.
    """
    search_space = dict(
        max_depth=[5, 7, 9, 11],
        learning_rate=[0.1, 0.05, 0.01],
        n_estimators=[100, 200, 300],
        subsample=[0.7, 0.8, 0.9],
        colsample_bytree=[0.7, 0.9, 1],
        booster=['gbtree', 'gblinear'],
    )

    search = GridSearchCV(
        estimator=xgb.XGBRegressor(objective='reg:squarederror'),
        param_grid=search_space,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=2,
    )
    search.fit(x_train, y_train)

    winning_params = search.best_params_
    y_pred = search.best_estimator_.predict(x_test)

    print(winning_params)
    evaluation(y_test,y_pred)
    return y_pred

def modelPycaret(x_train, x_test, y_train, y_test):
    """Compare a fixed list of PyCaret regressors on the given split.

    Features and target are joined column-wise per split, passed to PyCaret's
    ``setup`` with 'DASE' as the target, and ``compare_models`` is run sorted
    by MAE.  The selected models are printed.

    Args:
        x_train, y_train: Training features / target frames.
        x_test, y_test: Test features / target frames.

    Returns:
        The concatenated test DataFrame (features + target), not predictions.
    """
    train_frame = pd.concat([x_train, y_train], axis=1)
    df_test = pd.concat([x_test, y_test], axis=1)

    setup(
        data=train_frame.reset_index(drop=True),
        test_data=df_test.reset_index(drop=True),
        target='DASE',
        session_id=99,
        index=False,
    )

    candidates = ['lr','lasso','ridge','en','lar','llar','omp','br','ard','par','ransac','tr','huber','kr','svm','knn','dt','rf','et','ada','gbr','mlp','xgboost','lightgbm','catboost']
    ranked = compare_models(n_select=25, sort='MAE', include=candidates)
    print(ranked)
    return df_test
    
def modelXGB(x_train, x_test, y_train, y_test):
    """Fit an XGBoost regressor with fixed hyper-parameters and evaluate it.

    Args:
        x_train, y_train: Training data (passed to ``fit`` as given).
        x_test, y_test: Evaluation data; both are squeezed before use.

    Returns:
        ``(test_pred, train_pred)`` - predictions on ``x_test`` and ``x_train``.
    """
    regressor = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        learning_rate=0.05,
        gamma=0,
        subsample=0.75,
        colsample_bytree=1,
        max_depth=7,
    )
    regressor = regressor.fit(x_train, y_train)

    on_train = regressor.predict(x_train.squeeze())
    on_test = regressor.predict(x_test.squeeze())

    print(regressor)
    evaluation(y_test.squeeze(), on_test)
    return on_test, on_train

def modelRF(x_train, x_test, y_train, y_test):
    """Fit a default ``RandomForestRegressor`` and evaluate it.

    The training data is squeezed before fitting, like the sibling
    GBR/ADA/LGB/SVR helpers, so a single-column target is passed as a 1-D
    vector instead of a column vector (which scikit-learn warns about).

    Args:
        x_train, y_train: Training features and target.
        x_test, y_test: Evaluation features and target.

    Returns:
        ``(test_pred, train_pred)`` - predictions on ``x_test`` and ``x_train``.
    """
    model = RandomForestRegressor().fit(x_train.squeeze(), y_train.squeeze())

    train_pred = model.predict(x_train.squeeze())
    test_pred = model.predict(x_test.squeeze())
    print(model)
    evaluation(y_test.squeeze(), test_pred)
    return test_pred, train_pred

def modelGBR(x_train, x_test, y_train, y_test):
    """Fit a default ``GradientBoostingRegressor`` and evaluate it.

    All inputs are squeezed before being handed to scikit-learn.

    Args:
        x_train, y_train: Training features and target.
        x_test, y_test: Evaluation features and target.

    Returns:
        ``(test_pred, train_pred)`` - predictions on ``x_test`` and ``x_train``.
    """
    train_features = x_train.squeeze()

    regressor = GradientBoostingRegressor()
    regressor.fit(train_features, y_train.squeeze())

    on_train = regressor.predict(train_features)
    on_test = regressor.predict(x_test.squeeze())

    print(regressor)
    evaluation(y_test.squeeze(), on_test)
    return on_test, on_train

def modelADA(x_train, x_test, y_train, y_test):
    """Fit a default ``AdaBoostRegressor`` and evaluate it.

    All inputs are squeezed before being handed to scikit-learn.

    Args:
        x_train, y_train: Training features and target.
        x_test, y_test: Evaluation features and target.

    Returns:
        ``(test_pred, train_pred)`` - predictions on ``x_test`` and ``x_train``.
    """
    train_features = x_train.squeeze()

    regressor = AdaBoostRegressor()
    regressor.fit(train_features, y_train.squeeze())

    on_train = regressor.predict(train_features)
    on_test = regressor.predict(x_test.squeeze())

    print(regressor)
    evaluation(y_test.squeeze(), on_test)
    return on_test, on_train

def modelLGB(x_train, x_test, y_train, y_test):
    """Fit a default ``lgb.LGBMRegressor`` and evaluate it.

    All inputs are squeezed before being handed to LightGBM.

    Args:
        x_train, y_train: Training features and target.
        x_test, y_test: Evaluation features and target.

    Returns:
        ``(test_pred, train_pred)`` - predictions on ``x_test`` and ``x_train``.
    """
    train_features = x_train.squeeze()

    regressor = lgb.LGBMRegressor()
    regressor = regressor.fit(train_features, y_train.squeeze())

    on_train = regressor.predict(train_features)
    on_test = regressor.predict(x_test.squeeze())

    print(regressor)
    evaluation(y_test.squeeze(), on_test)
    return on_test, on_train

def modelSVR(x_train, x_test, y_train, y_test):
    """Fit a default ``SVR`` and evaluate it.

    All inputs are squeezed before being handed to scikit-learn.

    Args:
        x_train, y_train: Training features and target.
        x_test, y_test: Evaluation features and target.

    Returns:
        ``(test_pred, train_pred)`` - predictions on ``x_test`` and ``x_train``.
    """
    train_features = x_train.squeeze()

    regressor = SVR()
    regressor.fit(train_features, y_train.squeeze())

    on_train = regressor.predict(train_features)
    on_test = regressor.predict(x_test.squeeze())

    print(regressor)
    evaluation(y_test.squeeze(), on_test)
    return on_test, on_train

def modelMLMerge(x_train, x_valid, x_test, y_train, y_valid, y_test):
    """Train a small dense network that merges four scalar inputs.

    The model takes four one-feature inputs, concatenates them and passes
    them through dense layers of 32/16/8/4 units to a single linear output.
    It is trained with Adam on MAE, early-stopped on ``val_loss``, and the
    best checkpoint is reloaded before predicting.

    Args:
        x_train, x_valid, x_test: Inputs for the four-input model.
            Presumably a list of four prediction vectors each (one per base
            model) - verify against the caller.
        y_train, y_valid, y_test: Targets.

    Returns:
        Predictions of the best-checkpoint model on ``x_test``.
    """
    weights_path = os.getcwd()+'/DASE/model/model_dnn.h5'

    # shape must be a tuple: ``(1)`` is just the int 1, ``(1,)`` is the
    # one-element shape Keras documents.
    merge_inputs = [tf.keras.layers.Input(shape=(1,)) for _ in range(4)]

    x = tf.keras.layers.Concatenate()(merge_inputs)
    x = tf.keras.layers.Dense(32, activation='relu', name='FC1')(x)
    x = tf.keras.layers.Dense(16, activation='relu', name='FC2')(x)
    x = tf.keras.layers.Dense(8, activation='relu', name='FC3')(x)
    x = tf.keras.layers.Dense(4, activation='relu', name='FC4')(x)
    output = tf.keras.layers.Dense(1, name='Output')(x)

    callback_list = [
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10),
            tf.keras.callbacks.ModelCheckpoint(filepath=weights_path, monitor='val_loss', mode='min', verbose=0, save_best_only=True, save_weights_only=True),
        ]

    model = tf.keras.models.Model(inputs=merge_inputs, outputs=output)
    model.summary()

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer,loss='mae',metrics=['mae'])
    model.fit(x_train, y_train, batch_size=16, epochs=300, validation_data=(x_valid,y_valid), callbacks=callback_list)

    # Predict with the best checkpoint rather than the last-epoch weights.
    model.load_weights(filepath=weights_path)
    y_pred = model.predict(x_test)
    evaluation(y_test,y_pred)
    return y_pred

def modelTrain(x_train, x_valid, x_test, y_train, y_valid, y_test,i):
    """Fit an XGBoost regressor with early stopping, evaluate and save it.

    The training set and the validation set are both passed as evaluation
    sets; training stops after 20 rounds without improvement.  The fitted
    model is written to ``DASE/model/best_model<i>.h5`` under the current
    working directory.

    Args:
        x_train, y_train: Training data (passed to ``fit`` as given).
        x_valid, y_valid: Validation frames; their ``.values`` are used.
        x_test, y_test: Test frames; their ``.values`` are used.
        i: Suffix used in the saved model's file name.

    Returns:
        Predictions on ``x_test``.
    """
    regressor = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        learning_rate=0.08,
        gamma=0,
        subsample=0.75,
        colsample_bytree=1,
        max_depth=7,
    )

    watch_list = [(x_train,y_train),(x_valid.values,y_valid.values)]
    regressor.fit(x_train, y_train, eval_set=watch_list, early_stopping_rounds=20)

    y_pred = regressor.predict(x_test.values)
    evaluation(y_test.values,y_pred)

    target_file = os.getcwd() + '/DASE/model/best_model'+str(i)+'.h5'
    regressor.save_model(target_file)

    return y_pred

# def randomNoise4(x_train, x_test, y_train, y_test):
    
    
#     return x_train, x_test, y_train, y_test

def evaluation(y_test,y_pred):
    """Print MAE, MSE, RMSE and R2 of ``y_pred`` against ``y_test``.

    All four metrics are computed first and then printed one per line.
    Nothing is returned.

    Args:
        y_test: Ground-truth values.
        y_pred: Predicted values.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    report = (
        ("MAE :", mean_absolute_error(y_test, y_pred)),
        ("MSE :", squared_error),
        ("RMSE :", np.sqrt(squared_error)),
        ("R2 :", r2_score(y_test,y_pred)),
    )
    for label, value in report:
        print(label, value)
    
def qunatileSplit(x_data,y_data,df_columns):
    """Split the data 80/20, stratified by rank-decile of the target.

    Samples are sorted by ``y_data``, cut into 10 equal-width position bins,
    and ``train_test_split`` is stratified on the bin index
    (``random_state=99``).

    Args:
        x_data: Feature array, indexable by an integer index array.
        y_data: 1-D target array.
        df_columns: Column labels of the source frame; ``df_columns[1:-1]``
            is used to label the feature columns.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` as DataFrames; the targets
        have a single 'DASE' column.
    """
    order = np.argsort(y_data)
    x_sorted, y_sorted = x_data[order], y_data[order]

    n_bins = 10
    n_samples = len(y_data)
    edges = np.linspace(0, n_samples, n_bins+1, dtype=int)
    # Bin index of every (sorted) position; used as the stratification label.
    strata = np.digitize(range(n_samples), edges)

    pieces = train_test_split(x_sorted, y_sorted, test_size=0.2, stratify=strata, random_state=99)

    feature_labels = df_columns[1:-1]
    x_train, x_test = (numpyToDataFrame(part, feature_labels) for part in pieces[:2])
    y_train, y_test = (numpyToDataFrame(part, ['DASE']) for part in pieces[2:])

    return x_train, x_test, y_train, y_test

def numpyToDataFrame(x,column):
    """Wrap ``x`` in a DataFrame whose columns are labelled ``column``.

    Args:
        x: Data accepted by ``pandas.DataFrame`` (e.g. a 1-D or 2-D array).
        column: Labels to assign; must match the number of columns.

    Returns:
        A new DataFrame built from ``x`` with ``column`` as its column labels.
    """
    return pd.DataFrame(x).set_axis(column, axis=1)

def pltFeature(importance, feature_names):
    """Show a bar chart of feature importances, largest first.

    Args:
        importance: Array of importance scores (must support ``argsort``).
        feature_names: Labels, indexable by position, matching ``importance``.
    """
    # Descending order of importance.
    order = importance.argsort()[::-1]
    heights = importance[order]
    labels = [feature_names[idx] for idx in order]
    positions = range(len(heights))

    plt.figure(figsize=(10, 6))
    plt.bar(positions, heights)
    plt.xticks(positions, labels, rotation=90)
    plt.xlabel('Feature')
    plt.ylabel('Importance')
    plt.title('Feature Importances')
    plt.tight_layout()
    plt.show()

# Load the spreadsheet and separate the features from the 'DASE' target
# ('Subject' is dropped from the features as well).
df = loadxlsxData()
x_data = df.drop(['DASE','Subject'],axis=1)
y_data = df['DASE']
# Stratified train/test split; the results come back as DataFrames.
# NOTE(review): qunatileSplit labels the feature columns with df.columns[1:-1],
# which presumably relies on 'Subject' being the first and 'DASE' the last
# column of the sheet - confirm against the workbook.
x_train, x_test, y_train, y_test = qunatileSplit(x_data.values,y_data.values,df.columns)

### grid search
# y_pred = modelGridSearch(x_train, x_test, y_train, y_test)

### single model dnn
# x_train, x_test, y_train, y_test = train_test_split(x_data.values, y_data.values, test_size=0.2, shuffle=True, random_state=99)
# x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, shuffle=True, random_state=99)

# x_train = numpyToDataFrame(x_train,df.columns[1:-1])
# x_valid = numpyToDataFrame(x_valid,df.columns[1:-1])
# x_test = numpyToDataFrame(x_test,df.columns[1:-1])
# y_train = numpyToDataFrame(y_train,['DASE'])
# y_valid = numpyToDataFrame(y_valid,['DASE'])
# y_test = numpyToDataFrame(y_test,['DASE'])

# x_train, x_valid, x_test, y_train, y_valid, y_test = standarizeInput(x_train, x_valid, x_test, y_train, y_valid, y_test)
# x_train, y_train = applySMOGN(x_train, y_train)

### single model ML
# x_train, x_valid, y_train, y_valid = qunatileSplit(x_train.values.squeeze(),y_train.values.squeeze(),df.columns)

# x_train, x_valid, x_test, y_train, y_valid, y_test = standarizeInput(x_train, x_valid, x_test, y_train, y_valid, y_test)
# x_train, y_train = applySMOGN(x_train, y_train)

# Scale the continuous feature columns using statistics fitted on the
# training split only.
x_train, x_test, y_train, y_test = standarizeInput4(x_train, x_test, y_train, y_test)
# x_train, y_train = applySMOGN(x_train, y_train)
# Compare the PyCaret regressors on this split.
# NOTE: modelPycaret returns the concatenated test DataFrame, not model
# predictions, so y_pred holds that frame here.
y_pred = modelPycaret(x_train, x_test, y_train, y_test)
# y_pred = modelGridSearch(x_train, x_test, y_train, y_test)

# valid_pred3, train_pred3 = modelXGB(x_train, x_valid, y_train, y_valid)
# valid_pred4, train_pred4 = modelGBR(x_train, x_valid, y_train, y_valid)
# valid_pred5, train_pred5 = modelRF(x_train, x_valid, y_train, y_valid)
# valid_pred6, train_pred6 = modelLGB(x_train, x_valid, y_train, y_valid)

# xx_valid = [valid_pred3,valid_pred4,valid_pred5,valid_pred6]
# yy_valid = y_valid

# test_pred3, train_pred3 = modelXGB(x_train, x_test, y_train, y_test)
# test_pred4, train_pred4 = modelGBR(x_train, x_test, y_train, y_test)
# test_pred5, train_pred5 = modelRF(x_train, x_test, y_train, y_test)
# test_pred6, train_pred6 = modelLGB(x_train, x_test, y_train, y_test)

# xx_test = [test_pred3,test_pred4,test_pred5,test_pred6]
# yy_test = y_test

# xx_train = [train_pred3,train_pred4,train_pred5,train_pred6]
# yy_train = y_train

# y_preds = modelMLMerge(xx_train, xx_valid, xx_test, yy_train, yy_valid, yy_test)

# y_preds = test_pred3
# yy_test = y_test
# sio.savemat('./DASE/result/merge.mat',{'y_preds':y_preds,'yy_test':yy_test.values})

### 5 fold
# x_train = x_train.values
# y_train = y_train.values
# kf = KFold(n_splits = 5, shuffle = True, random_state = 1)
# i = 1

# for train_index, valid_index in kf.split(x_train):
#     kfX_train, kfX_valid = x_train[train_index], x_train[valid_index]
#     kfY_train, kfY_valid = y_train[train_index], y_train[valid_index]
#     dfX_test, dfY_test = x_test.copy(), y_test.copy()
    
#     dfX_train = numpyToDataFrame(kfX_train,df.columns[1:-1])
#     dfX_valid = numpyToDataFrame(kfX_valid,df.columns[1:-1])
#     dfY_train = numpyToDataFrame(kfY_train,['DASE'])
#     dfY_valid = numpyToDataFrame(kfY_valid,['DASE'])
            
#     normX_train, normX_valid, normX_test, normY_train, normY_valid, normY_test = standarizeInput(dfX_train, dfX_valid, dfX_test, dfY_train, dfY_valid, dfY_test)
    
#     augX_train, augY_train = applySMOGN(normX_train, normY_train)
#     valid_pred3, train_pred3 = modelXGB(augX_train, normX_valid, augY_train, normY_valid)
#     valid_pred4, train_pred4 = modelGBR(augX_train, normX_valid, augY_train, normY_valid)
#     valid_pred5, train_pred5 = modelRF(augX_train, normX_valid, augY_train, normY_valid)
#     valid_pred6, train_pred6 = modelLGB(augX_train, normX_valid, augY_train, normY_valid)

#     mergeX_valid = [valid_pred3,valid_pred4,valid_pred5,valid_pred6]
#     mergeY_valid = normY_valid
        
#     test_pred3, train_pred3 = modelXGB(augX_train, normX_test, augY_train, normY_test)
#     test_pred4, train_pred4 = modelGBR(augX_train, normX_test, augY_train, normY_test)
#     test_pred5, train_pred5 = modelRF(augX_train, normX_test, augY_train, normY_test)
#     test_pred6, train_pred6 = modelLGB(augX_train, normX_test, augY_train, normY_test)

#     mergeX_test = [test_pred3,test_pred4,test_pred5,test_pred6]
#     mergeY_test = normY_test

#     mergeX_train = [train_pred3,train_pred4,train_pred5,train_pred6]
#     mergeY_train = augY_train

#     y_preds = modelMLMerge(mergeX_train, mergeX_valid, mergeX_test, mergeY_train, mergeY_valid, mergeY_test)
    
#     sio.savemat(os.getcwd()+'/DASE/result/best_pred'+str(i)+'.mat',{'x_train' : mergeX_train, 'y_train' : mergeY_train, 'x_test' : mergeX_test, 'y_test' : mergeY_test.values, 'y_pred' : y_preds,
#                                                               'x_valid' : mergeX_valid, 'y_valid' : mergeY_valid.values})
#     i=i+1

### ensemble softvoting

# vPreds = []
# for i in range(5):
#     x_train, x_valid, x_test, y_train, y_valid, y_test, y_pred = loadbestPred(i)    
#     vPreds.append(y_pred)
#     evaluation(y_test,y_pred)
    
# vPred = (vPreds[0] + vPreds[1] + vPreds[2] + vPreds[3] + vPreds[4])/5
# evaluation(y_test,vPred)

ImportError: DLL load failed: 지정된 모듈을 찾을 수 없습니다.

SyntaxError: invalid syntax (728349862.py, line 1)