In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import Adamax
from keras.wrappers.scikit_learn import KerasRegressor
from keras import backend as K

### 데이터 생성

In [2]:
train = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/d-optimal/spmm-latency-traintest/train-test-csv/nonsquare-train-1035-from-spmm-contain-todense-over-3s-1293.csv')
test = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/d-optimal/spmm-latency-traintest/train-test-csv/nonsquare-test-258-from-spmm-contain-todense-over-3s-1293.csv')

In [3]:
# # Train
# X_train = train[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc']] 
# y_train = train['sp_smdm']

# # Test
# X_test = test[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc']] 
# y_test = test['sp_smdm']

In [4]:
# Train
X_train = train[['lr','lc','rc','ld','rd','lnnz','rnnz']] 
y_train = train['sp_smdm']

# Test
X_test = test[['lr','lc','rc','ld','rd','lnnz','rnnz']] 
y_test = test['sp_smdm']

### 데이터 전처리

In [5]:
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler 객체 생성
minmax_scaler = MinMaxScaler()

# 훈련데이터의 모수 분포 저장
minmax_scaler.fit(X_train)

# 훈련 데이터 스케일링
X_train = minmax_scaler.transform(X_train)

# 테스트 데이터 스케일링
X_test = minmax_scaler.transform(X_test)

### Metric 함수 생성

In [6]:
# RMSE
def rmse(y_true, y_pred):
    rmse = K.sqrt(K.mean(K.square(y_pred - y_true))) 
    return rmse

In [7]:
# MAPE
def mean_absolute_percentage_error(y_test, y_pred):
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

### Hyperparameter 정의

In [8]:
# # dense_nparams : 초기 dense layer size
# dense_nparams = [1024]
# # dense_layer_sizes : 사용할 dense layer size 목록
# dense_layer_sizes = [(256,64,16,)]
# # input_optimizer = optimizer
# input_optimizer = [SGD, Adagrad, RMSprop, Adam, Adamax]
# # input_kernel_initializer : 가중치 초기화 방법
# input_kernel_initializer =  ['uniform', 'normal', 
#                             'glorot_uniform', 'glorot_normal',
#                             'he_uniform', 'he_normal' ]
# # input_dropout : dropout 비율
# input_dropout = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
# # input_lr : learning_rate
# input_lr = [0.001, 0.01, 0.1, 0.2]

# # hyperparameter dictionary 화
# param_grid = dict(dense_nparams = dense_nparams,
#                 dense_layer_sizes = dense_layer_sizes,
#                 input_optimizer = input_optimizer,
#                 input_kernel_initializer = input_kernel_initializer,
#                 input_dropout = input_dropout,
#                 input_lr = input_lr)

In [9]:
# # dense_nparams : 초기 dense layer size
# dense_nparams = [1024]
# # dense_layer_sizes : 사용할 dense layer size 목록
# dense_layer_sizes = [(256,64,16,)]
# # input_optimizer = optimizer
# input_optimizer = [SGD, Adagrad, RMSprop, Adam, Adamax]
# # input_kernel_initializer : 가중치 초기화 방법
# input_kernel_initializer =  ['he_uniform', 'he_normal']
# # input_dropout : dropout 비율
# input_dropout = [0, 0.1]
# # input_lr : learning_rate
# input_lr = [0.001, 0.01, 0.1]

# # hyperparameter dictionary 화
# param_grid = dict(dense_nparams = dense_nparams,
#                 dense_layer_sizes = dense_layer_sizes,
#                 input_optimizer = input_optimizer,
#                 input_kernel_initializer = input_kernel_initializer,
#                 input_dropout = input_dropout,
#                 input_lr = input_lr)

### Hyperparameter Tuning 대상 정의

In [10]:
# hyperparameter tuning 대상 모델 정의
def create_model(dense_nparams, dense_layer_sizes , input_optimizer, input_kernel_initializer, input_dropout, input_lr):

    model=Sequential()
    model.add(Dense(dense_nparams, activation="relu", input_shape=(X_train.shape[1],), kernel_initializer=input_kernel_initializer))  
    model.add(Dropout(input_dropout),)
    
    # dense_layer_sizes 만큼 layer 추가
    for layer_size in dense_layer_sizes:
        model.add(Dense(layer_size, activation='relu', kernel_initializer=input_kernel_initializer))
        model.add(Dropout(input_dropout), )
    
    model.add(Dense(1))

    optimizer = input_optimizer(lr=input_lr)
    
    model.compile(optimizer = optimizer ,
                  loss='mape',
                  metrics=['mape',rmse])
    return model

# hyperparameter tuning 대상 모델 선언
# 파라미터 조합 당 epochs 는 300 번
regressor_model = KerasRegressor(build_fn=create_model, epochs=300, batch_size=10, verbose=0)

### GridSearchCV 정의 및 시작

In [11]:
# # cross_validation 정의
# kf = KFold(random_state=30,
#            n_splits=10, # Fold 는 10개로 지정
#            shuffle=True
#           )

# # gridsearch 정의
# # scoring : 검증셋의 성능을 무엇으로 측정할 것인지
# # n_jobs : 프로세스가 시스템의 모든 코어를 사용하도록    
# # verbose : 모든 log 출력하도록
# grid = GridSearchCV(estimator=regressor_model, 
#                     param_grid=param_grid, 
#                     scoring = make_scorer(mean_absolute_percentage_error, greater_is_better=False),
#                     cv = kf,
#                     n_jobs=-1,
#                     verbose=3)

# # gridsearch 시작
# grid_result = grid.fit(X_train, y_train)

# # gridesearch 결과
# print("최고의 파라미터 :", grid_result.best_params_)
# print("최고 평균 정확도 : {}".format(grid_result.best_score_))

### GridSearchCV 를 통해 탐색된 최적의 Hyperparameter 를 사용해 모델링

In [12]:
# # 모델 정의
# result_model = create_model(grid_result.best_params_['dense_nparams'],
#                             grid_result.best_params_['dense_layer_sizes'],
#                             grid_result.best_params_['input_optimizer'],
#                             grid_result.best_params_['input_kernel_initializer'],
#                             grid_result.best_params_['input_dropout'],
#                             grid_result.best_params_['input_lr'])
# # 모델 훈련
# history = result_model.fit(X_train, 
#                 y_train,
#                 epochs=300, 
#                 validation_split = 0.1, 
#                 verbose =0)

In [13]:
# hist = pd.DataFrame(history.history)
# hist['epoch'] = history.epoch
# hist.tail()

In [14]:
# def plot_history(history):
#     hist = pd.DataFrame(history.history)
#     hist['epoch'] = history.epoch

#     plt.figure(figsize=(8,12))

#     # mape metric
#     plt.subplot(2,1,1)
#     plt.xlabel('Epoch')
#     plt.ylabel('mape')
#     plt.plot(hist['epoch'], hist['mape'],
#            label='Train Error')
#     plt.plot(hist['epoch'], hist['val_mape'],
#            label = 'Val Error')
#     plt.legend()
    
#     # rmse metric
#     plt.subplot(2,1,2)
#     plt.xlabel('Epoch')
#     plt.ylabel('rmse')
#     plt.plot(hist['epoch'], hist['rmse'],
#            label='Train Error')
#     plt.plot(hist['epoch'], hist['val_rmse'],
#            label = 'Val Error')
#     plt.legend()

#     plt.show()

# plot_history(history)

### 예측 성능

In [15]:
# # 훈련데이터 예측
# y_train_pred = result_model.predict(X_train).reshape(-1,)
# print("훈련데이터 예측 mape : {}\n".format(mean_absolute_percentage_error(y_train,y_train_pred)))

# # 테스트데이터 예측
# y_test_pred = result_model.predict(X_test).reshape(-1,)
# print("테스트데이터 예측 mape : {}\n".format(mean_absolute_percentage_error(y_test,y_test_pred)))

### 고정된 Hyperparameter 를 사용해 모델링 (Early Stopping)

In [16]:
# # 모델 정의
# fix_model = create_model(512,
#                             (128, 64, 16, 8),
#                             Adagrad,
#                             'he_normal',
#                             0,
#                             0.09)

# # 에포크가 끝날 때마다 점(.)을 출력해 훈련 진행 과정을 표시합니다
# class PrintDot(keras.callbacks.Callback):
#     def on_epoch_end(self, epoch, logs):
#         if epoch % 100 == 0: print('')
#         #print("epoch : {}, logs : {}".format(epoch,logs))
#         print('.', end='')

# # monitor는 어떤 매개변수를 볼 것인지 입니다.
# # patience 매개변수는 성능 향상을 체크할 에포크 횟수입니다
# # 지정된 에포크 횟수 동안 성능 향상이 없으면 자동으로 훈련이 멈춥니다.
# early_stop = keras.callbacks.EarlyStopping(monitor='val_mape', patience=100)

# EPOCHS = 1000

# # 훈련 정확도와 검증 정확도 출력
# # 에포크마다 훈련 상태를 점검하기 위해 EarlyStopping 콜백(callback)을 사용합니다.
# history = fix_model.fit(X_train, 
#                     y_train,
#                     epochs=EPOCHS, 
#                     validation_split = 0.1, 
#                     verbose =0, 
#                     callbacks=[early_stop, PrintDot()])


In [17]:
# hist = pd.DataFrame(history.history)
# hist['epoch'] = history.epoch
# hist.tail()

In [18]:
# def plot_history(history):
#     hist = pd.DataFrame(history.history)
#     hist['epoch'] = history.epoch

#     plt.figure(figsize=(8,12))

#     # mape metric
#     plt.subplot(2,1,1)
#     plt.xlabel('Epoch')
#     plt.ylabel('mape')
#     plt.plot(hist['epoch'], hist['mape'],
#            label='Train Error')
#     plt.plot(hist['epoch'], hist['val_mape'],
#            label = 'Val Error')
#     plt.legend()
    
#     # rmse metric
#     plt.subplot(2,1,2)
#     plt.xlabel('Epoch')
#     plt.ylabel('rmse')
#     plt.plot(hist['epoch'], hist['rmse'],
#            label='Train Error')
#     plt.plot(hist['epoch'], hist['val_rmse'],
#            label = 'Val Error')
#     plt.legend()

#     plt.show()
    

# plot_history(history)

In [19]:
# # 훈련데이터 예측
# y_train_pred = fix_model.predict(X_train).reshape(-1,)
# print("훈련데이터 예측 mape : {}\n".format(mean_absolute_percentage_error(y_train,y_train_pred)))

# # 테스트데이터 예측
# y_test_pred = fix_model.predict(X_test).reshape(-1,)
# print("테스트데이터 예측 mape : {}\n".format(mean_absolute_percentage_error(y_test,y_test_pred)))

### Feture 중요도 탐색

In [20]:
# from sklearn.inspection import permutation_importance # sklearn 22 버전부터 해당
# from sklearn.metrics import make_scorer

# # MAPE
# def mean_absolute_percentage_error(y_test, y_pred):
#     y_test, y_pred = np.array(y_test), np.array(y_pred)
#     return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# # fix_model : 훈련된 모델
# # X_train : 훈련데이터 Feature
# # y_train : 훈련데이터 Target
# # scoring : Feature 를 Shuffler 한 뒤, 예측값과 실제값을 어떤 Metric 을 사용해 비교할지
# # n_repeats : 특정 Feature 를 몇번 Shuffle 할 것인지
# # random_state : 난수 고정
# result = permutation_importance(fix_model, X_train, y_train, scoring = make_scorer(mean_absolute_percentage_error,greater_is_better=False),
#                             n_repeats=20,
#                             random_state=0)
# # Feature label
# Feature = train[['lr','lc','rc','ld','rd','lnnz','rnnz']] 

# # Feature 중요도를 오름차순으로 정렬한 뒤, 해당 Feature 의 index 를 저장
# sorted_result = result.importances_mean.argsort()

# # 결과를 DataFrame 화
# importances = pd.DataFrame(result.importances_mean[sorted_result], index=Feature.columns[sorted_result]).sort_values(0, ascending=False)   
# importances

### Grid Search 를 통해 찾은 Hyperparameter 를 고정한뒤, 훈련 데이터를 섞어가며 MAPE 결과 수집

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

mape_list = []

# 모델 정의
fix_model = create_model(512,
                        (128, 64, 16, 8),
                        Adagrad,
                        'he_normal',
                        0,
                        0.09)

# 에포크가 끝날 때마다 점(.)을 출력해 훈련 진행 과정을 표시합니다
class PrintDot(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        if epoch % 100 == 0: print('')
        #print("epoch : {}, logs : {}".format(epoch,logs))
        print('.', end='')

# monitor는 어떤 매개변수를 볼 것인지 입니다.
# patience 매개변수는 성능 향상을 체크할 에포크 횟수입니다
# 지정된 에포크 횟수 동안 성능 향상이 없으면 자동으로 훈련이 멈춥니다.
early_stop = keras.callbacks.EarlyStopping(monitor='val_mape', patience=80)

EPOCHS = 1000
    
for i in range(10):

    # 훈련 정확도와 검증 정확도 출력
    # 에포크마다 훈련 상태를 점검하기 위해 EarlyStopping 콜백(callback)을 사용합니다.
    history = fix_model.fit(X_train, 
                        y_train,
                        epochs=EPOCHS, 
                        validation_split = 0.1, 
                        verbose =0, 
                        callbacks=[early_stop, PrintDot()])
    
    # 테스트데이터 예측
    y_test_pred = fix_model.predict(X_test).reshape(-1,)
    
    mape_list.append(mean_absolute_percentage_error(y_test,y_test_pred))

mape_list = np.array(mape_list)
print("\n",mape_list)
print("median : " , np.median(mape_list))
print("min : " , np.min(mape_list))
print("max : " , np.max(mape_list)) 


....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
.......................
....................................................................................................
...........
...............................................................................................
.................................................................................
....................................................................................................
.
............................................................................