In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import Adamax
from keras.wrappers.scikit_learn import KerasRegressor
from keras import backend as K

In [2]:
train = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/dataset/spmm-latency-dataset/extract-dataset-using-d-optimal/dataset/nonsquare-train-1035-from-spmm-contain-todense-over-3s-1293.csv')
test = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/dataset/spmm-latency-dataset/extract-dataset-using-d-optimal/dataset/nonsquare-test-258-from-spmm-contain-todense-over-3s-1293.csv')

In [3]:
# Train
X_train = train[['lr','lc','rc','ld','rd','lnnz','rnnz']] 
xgb_X_train = train[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc','ld*rd']] 
y_train = train['bz_smsm']

# Test
X_test = test[['lr','lc','rc','ld','rd','lnnz','rnnz']] 
xgb_X_test = test[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc','ld*rd']] 
y_test = test['bz_smsm']

In [4]:
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler 객체 생성
minmax_scaler = MinMaxScaler()

# 훈련데이터의 모수 분포 저장
minmax_scaler.fit(X_train)

# 훈련 데이터 스케일링
X_train_scaled = minmax_scaler.transform(X_train)

# 테스트 데이터 스케일링
X_test_scaled = minmax_scaler.transform(X_test)

In [5]:
# from sklearn.metrics import mean_squared_error

# def mape_error(y_test, y_pred):
#     y_test, y_pred = np.array(y_test), np.array(y_pred)
#     return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# def rmse_error(y_true, y_pred):
#     rmse = np.sqrt(np.mean(np.square(y_pred - y_true))) 
#     return rmse

In [6]:
# RMSE
def rmse(y_true, y_pred):
    rmse = K.sqrt(K.mean(K.square(y_pred - y_true))) 
    return rmse

In [7]:
# MAPE
def mean_absolute_percentage_error(y_test, y_pred):
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

### Xgbregressor

In [8]:
import xgboost as xgb

# Train + Valid cross-validation을 거친, 최적의 하이퍼파라미터를 사용
xgbregressor_model = xgb.XGBRegressor(
objective = 'reg:squarederror',
max_depth=int(16.992869637868704),
learning_rate=0.04507011754186229,
n_estimators=int(62.21252826172613),
subsample=0.8545270296912297,
reg_lambda = 0.9671716237976958,    
colsample_bytree=0.9784517810866844,
n_jobs=-1
                             )
# 모델 훈련
xgbregressor_model.fit(xgb_X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9784517810866844, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.04507011754186229, max_delta_step=0, max_depth=16,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=62, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=0.9671716237976958, scale_pos_weight=1,
             subsample=0.8545270296912297, tree_method='exact',
             validate_parameters=1, verbosity=None)

### dnn

In [9]:
# hyperparameter tuning 대상 모델 정의
def create_model(dense_nparams, dense_layer_sizes , input_optimizer, input_kernel_initializer, input_dropout, input_lr):

    model=Sequential()
    model.add(Dense(dense_nparams, activation="relu", input_shape=(X_train.shape[1],), kernel_initializer=input_kernel_initializer))  
    model.add(Dropout(input_dropout),)
    
    # dense_layer_sizes 만큼 layer 추가
    for layer_size in dense_layer_sizes:
        model.add(Dense(layer_size, activation='relu', kernel_initializer=input_kernel_initializer))
        model.add(Dropout(input_dropout), )
    
    model.add(Dense(1))

    optimizer = input_optimizer(lr=input_lr)
    
    model.compile(optimizer = optimizer ,
                  loss='mape',
                  metrics=['mape',rmse])
    return model

In [10]:
# 모델 정의
dnn_model = create_model(1024,
                            (128, 64, 32, 16),
                            Adagrad,
                            'normal',
                            0,
                            0.07)

# 에포크가 끝날 때마다 점(.)을 출력해 훈련 진행 과정을 표시합니다
class PrintDot(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        if epoch % 100 == 0: print('')
        #print("epoch : {}, logs : {}".format(epoch,logs))
        print('.', end='')

# monitor는 어떤 매개변수를 볼 것인지 입니다.
# patience 매개변수는 성능 향상을 체크할 에포크 횟수입니다
# 지정된 에포크 횟수 동안 성능 향상이 없으면 자동으로 훈련이 멈춥니다.
early_stop = keras.callbacks.EarlyStopping(monitor='val_mape', patience=100)

EPOCHS = 1000

# 훈련 정확도와 검증 정확도 출력
# 에포크마다 훈련 상태를 점검하기 위해 EarlyStopping 콜백(callback)을 사용합니다.
history = dnn_model.fit(X_train_scaled, 
                    y_train,
                    epochs=EPOCHS, 
                    validation_split = 0.1, 
                    verbose =0, 
                    callbacks=[early_stop, PrintDot()])


....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
............................................................

In [11]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

Unnamed: 0,loss,mape,rmse,val_loss,val_mape,val_rmse,epoch
455,6.195899,6.195899,8283.74707,13.427755,13.427755,16262.469727,455
456,5.842187,5.842187,8842.799805,13.917729,13.917729,12700.804688,456
457,5.701189,5.701189,8125.594727,18.052174,18.052174,16157.704102,457
458,6.793094,6.793094,8287.387695,14.72488,14.72488,13021.083984,458
459,5.940602,5.940602,8293.623047,14.467459,14.467459,13661.375,459


### rfr

In [12]:
# from sklearn.ensemble import RandomForestRegressor

# rfr_model = RandomForestRegressor(
# criterion='mse',
# max_depth=18,
# min_samples_leaf=2, 
# min_samples_split=3, 
# n_estimators=600
# )

# rfr_model.fit(X_train, y_train)

### linear regressor

In [13]:
# from sklearn.linear_model import LinearRegression

# linear_model = LinearRegression()

# linear_model.fit(X_train,y_train)

### polynomial

In [14]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

poly = PolynomialFeatures(degree = 3)

X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

lin_reg = LinearRegression()
lin_reg.fit(X_train_poly, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### test

In [15]:
xgbregressor_y_pred = xgbregressor_model.predict(xgb_X_test)
dnn_y_pred = dnn_model.predict(X_test_scaled).reshape(-1,)
#linear_y_pred = linear_model.predict(X_test)
#nnls_y_pred = a*X_test['lr']+b*X_test['lc']+c*X_test['rc']+d*X_test['ld']+e*X_test['rd']+f*X_test['lnnz']+g*X_test['rnnz']+h*X_test['lr*lc']+i*X_test['lc*rc']+j*X_test['lr*rc']+k*X_test['lr*lc*rc']+l*X_test['ld*rd']+m*X_test['lr*rc*ld*rd']+n*X_test['lr*lc*rc*ld*rd']+o*X_test['lnnz*rnnz']
poly_y_pred = lin_reg.predict(X_test_poly)
#rfr_y_pred = rfr_model.predict(X_test)

In [16]:
print(mean_absolute_percentage_error(y_test,xgbregressor_y_pred))
print(mean_absolute_percentage_error(y_test,dnn_y_pred))
#print(mape_error(y_test,linear_y_pred))
# print(mean_absolute_percentage_error(y_test,nnls_y_pred))
print(mean_absolute_percentage_error(y_test,poly_y_pred))
#print(mape_error(y_test,rfr_y_pred))

18.977175792157514
12.596715104292128
89.60918631479866


In [20]:
result_list = {}
mape_list = np.array([])

# 예측값, 실제값을 확인하며 mape 계산 후 mape_list에 삽입 
for idx,value in enumerate(y_test):
    mape_temp = {}
    median_temp = np.array([])
    
    # 각 모델의 예측값
    xgbregressor_predicate = int(xgbregressor_y_pred[idx])
    dnn_predicate = int(dnn_y_pred[idx])
    #linear_predicate = int(linear_y_pred[idx])
    #nnls_predicate = int(nnls_y_pred[idx])
    poly_predicate = int(poly_y_pred[idx])
    #rfr_predicate = int(rfr_y_pred[idx])
    
    # 각 모델의 예측값을 배열에 삽입
    median_temp = np.append(median_temp, np.array([xgbregressor_predicate,dnn_predicate,poly_predicate]))
    
    # median 값 사용
    mape = abs((value - np.median(median_temp)) / value) * 100
    
    # mean 값 사용
#     mape = abs((value - np.mean(median_temp)) / value) * 100
    
    mape_temp['xgb_pred'] = xgbregressor_predicate
    mape_temp['dnn_pred'] = dnn_predicate
    #mape_temp['linear_pred'] = linear_predicate
    #mape_temp['nnls_pred'] = nnls_predicate
    mape_temp['poly_pred'] = poly_predicate
    #mape_temp['rfr_pred'] = rfr_predicate
    mape_temp['--best_pred--'] = np.median(median_temp)
    mape_temp['real'] = value
    mape_temp['mape'] = mape
    
    # 실제값과 예측값의 mape 결과 삽입
    mape_list = np.append(mape_list,np.array([mape]))
    result_list[idx] = mape_temp

result_list_sort = sorted(result_list.values(), key=lambda x:(x['mape']), reverse=True)
result_list_sort  

[{'xgb_pred': 10525,
  'dnn_pred': 3898,
  'poly_pred': 77329,
  '--best_pred--': 10525.0,
  'real': 3839,
  'mape': 696.6657983849962},
 {'xgb_pred': 6308,
  'dnn_pred': 6477,
  'poly_pred': 109648,
  '--best_pred--': 6477.0,
  'real': 6940,
  'mape': 488.05475504322766},
 {'xgb_pred': 5165,
  'dnn_pred': 4180,
  'poly_pred': 74776,
  '--best_pred--': 5165.0,
  'real': 4985,
  'mape': 462.4941491140087},
 {'xgb_pred': 4767,
  'dnn_pred': 4040,
  'poly_pred': 51639,
  '--best_pred--': 4767.0,
  'real': 4023,
  'mape': 400.8368547518436},
 {'xgb_pred': 6175,
  'dnn_pred': 3765,
  'poly_pred': 35489,
  '--best_pred--': 6175.0,
  'real': 3049,
  'mape': 396.65464086585763},
 {'xgb_pred': 6295,
  'dnn_pred': 3291,
  'poly_pred': 31268,
  '--best_pred--': 6295.0,
  'real': 3280,
  'mape': 315.1829268292683},
 {'xgb_pred': 5061,
  'dnn_pred': 5321,
  'poly_pred': 49918,
  '--best_pred--': 5321.0,
  'real': 5371,
  'mape': 274.2319865946751},
 {'xgb_pred': 13591,
  'dnn_pred': 6182,
  'poly_p

In [21]:
# 최적의 pred을 골라냈을 때 mape 평균
print(np.mean(mape_list))

34.26728717403859
