In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import Adamax
from keras.wrappers.scikit_learn import KerasRegressor
from keras import backend as K

### 데이터 생성

In [2]:
# train = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/dataset/spmm-latency-dataset/extract-dataset-using-d-optimal/dataset/nonsquare-train-1035-from-spmm-contain-todense-over-3s-1293.csv')
# test = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/dataset/spmm-latency-dataset/extract-dataset-using-d-optimal/dataset/compare-recommendation-method-performance.csv')

In [3]:
# Locations of the training set and of the real-workload evaluation set.
train_csv_path = '/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/dataset/split-spmm-dataset/split-spmm-dataset-using-doe/dataset/nonsquare-train-1035-from-spmm-contain-todense-over-3s-1293.csv'
test_csv_path = '/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/dataset/split-spmm-dataset/split-spmm-dataset-using-doe/dataset/compare-recommendation-method-performance-in-real-workload.csv'

# Load both splits into DataFrames.
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [4]:
# Feature columns fed to both latency regressors (same set for train and test).
feature_columns = ['lr', 'lc', 'rc', 'ld', 'rd', 'lnnz', 'rnnz']

# Training split: feature matrix plus one target series per method.
X_train = train[feature_columns]
sp_smdm_y_train, bz_smsm_y_train = train['sp_smdm'], train['bz_smsm']

# Test split: same layout as the training split.
X_test = test[feature_columns]
sp_smdm_y_test, bz_smsm_y_test = test['sp_smdm'], test['bz_smsm']

### 데이터 전처리

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training data only, so that no
# statistics of the test set leak into the scaling.
minmax_scaler = MinMaxScaler().fit(X_train)

# Apply the training-derived scaling to both splits.
# NOTE: this rebinds X_train / X_test to their scaled versions.
X_train, X_test = minmax_scaler.transform(X_train), minmax_scaler.transform(X_test)

### Metric 함수 생성

In [6]:
# RMSE
def rmse(y_true, y_pred):
    """Root-mean-squared error built from Keras backend ops.

    Written with `K` operations so it can be passed to `model.compile`
    as a metric.

    Args:
        y_true: Tensor of ground-truth values.
        y_pred: Tensor of predicted values.

    Returns:
        Tensor holding sqrt(mean((y_pred - y_true) ** 2)).
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [7]:
# MAPE
def mean_absolute_percentage_error(y_test, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Both inputs are flattened to 1-D before the element-wise comparison.
    Without this, a column vector of predictions of shape (n, 1) compared
    against targets of shape (n,) would broadcast to an (n, n) matrix and
    silently produce a wrong score.

    Args:
        y_test: Array-like of ground-truth values.
        y_pred: Array-like of predicted values with the same number of
            elements as `y_test`.

    Returns:
        mean(|y_test - y_pred| / |y_test|) * 100 as a float. A zero in
        `y_test` leads to a division by zero, so the result is then inf or
        nan (NumPy semantics).

    Raises:
        ValueError: If the two inputs do not hold the same number of elements.
    """
    y_test = np.asarray(y_test, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_test.shape != y_pred.shape:
        raise ValueError(
            "y_test and y_pred must contain the same number of elements, "
            "got {} and {}".format(y_test.size, y_pred.size)
        )
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

### 기본 모델 구조

In [8]:
def create_model(dense_nparams, dense_layer_sizes , input_optimizer, input_kernel_initializer, input_dropout, input_lr, input_dim=None):
    """Build and compile a fully connected regression network.

    Architecture: Dense(dense_nparams, relu) -> Dropout, followed by one
    Dense(relu) -> Dropout pair per entry of `dense_layer_sizes`, followed by
    a single linear output unit.

    Args:
        dense_nparams: Number of units in the first hidden layer.
        dense_layer_sizes: Iterable with the unit count of each further
            hidden layer, in order.
        input_optimizer: Optimizer class (not instance), e.g. `Adagrad`; it is
            instantiated here with the given learning rate.
        input_kernel_initializer: Kernel initializer used by every hidden layer.
        input_dropout: Dropout rate applied after every hidden layer.
        input_lr: Learning rate handed to the optimizer.
        input_dim: Number of input features. When None (default), the width
            of the module-level `X_train` is used, as before.

    Returns:
        The compiled model, with loss 'mape' and metrics 'mape' and `rmse`.
    """
    if input_dim is None:
        # Backward-compatible default: infer the feature count from the
        # notebook-level training matrix.
        input_dim = X_train.shape[1]

    model = Sequential()
    model.add(Dense(dense_nparams, activation="relu", input_shape=(input_dim,), kernel_initializer=input_kernel_initializer))
    model.add(Dropout(input_dropout))

    # Add one hidden layer (plus dropout) per requested size.
    for layer_size in dense_layer_sizes:
        model.add(Dense(layer_size, activation='relu', kernel_initializer=input_kernel_initializer))
        model.add(Dropout(input_dropout))

    # Single linear unit: the network regresses one scalar target.
    model.add(Dense(1))

    # `lr` is a deprecated alias in tf.keras optimizers; `learning_rate` is
    # the supported keyword.
    optimizer = input_optimizer(learning_rate=input_lr)

    model.compile(optimizer=optimizer,
                  loss='mape',
                  metrics=['mape', rmse])
    return model

### sp_smdm dnn 모델

In [9]:
# Build the sp_smdm latency regressor.
sp_smdm_model = create_model(
    dense_nparams=512,
    dense_layer_sizes=(128, 64, 16, 8),
    input_optimizer=Adagrad,
    input_kernel_initializer='he_normal',
    input_dropout=0,
    input_lr=0.09,
)


class PrintDot(keras.callbacks.Callback):
    """Progress indicator: prints one dot per epoch, 100 dots per row."""

    def on_epoch_end(self, epoch, logs):
        # Start a new output row every 100 epochs.
        if epoch % 100 == 0:
            print('')
        print('.', end='')


# Upper bound on the number of training epochs.
EPOCHS = 1000

# Stop training once the monitored quantity ('val_mape') has not improved
# for `patience` consecutive epochs.
early_stop = keras.callbacks.EarlyStopping(monitor='val_mape', patience=100)

# Train silently (verbose=0); progress is reported by PrintDot only.
# 10% of the training data is held out for validation.
history = sp_smdm_model.fit(
    X_train,
    sp_smdm_y_train,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=0,
    callbacks=[early_stop, PrintDot()],
)



....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
..................................

### bz_smsm dnn 모델

In [10]:
# Build the bz_smsm latency regressor.
bz_smsm_model = create_model(
    dense_nparams=1024,
    dense_layer_sizes=(128, 64, 32, 16),
    input_optimizer=Adagrad,
    input_kernel_initializer='normal',
    input_dropout=0,
    input_lr=0.07,
)


class PrintDot(keras.callbacks.Callback):
    """Progress indicator: prints one dot per epoch, 100 dots per row."""

    def on_epoch_end(self, epoch, logs):
        # Start a new output row every 100 epochs.
        if epoch % 100 == 0:
            print('')
        print('.', end='')


# Upper bound on the number of training epochs.
EPOCHS = 1000

# Stop training once the monitored quantity ('val_mape') has not improved
# for `patience` consecutive epochs.
early_stop = keras.callbacks.EarlyStopping(monitor='val_mape', patience=100)

# Train silently (verbose=0); progress is reported by PrintDot only.
# 10% of the training data is held out for validation.
# NOTE: `history` is rebound here, replacing any earlier value of that name.
history = bz_smsm_model.fit(
    X_train,
    bz_smsm_y_train,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=0,
    callbacks=[early_stop, PrintDot()],
)



....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
.....................................................

### 예측 성능

In [11]:
# Predict sp_smdm latency on the test set; flatten the model output to 1-D
# so it lines up element-wise with the target series.
sp_smdm_y_pred = sp_smdm_model.predict(X_test).reshape(-1)
sp_smdm_test_mape = mean_absolute_percentage_error(sp_smdm_y_test, sp_smdm_y_pred)
print("sp_smdm 테스트데이터 예측 mape : {}\n".format(sp_smdm_test_mape))

# Same for bz_smsm.
bz_smsm_y_pred = bz_smsm_model.predict(X_test).reshape(-1)
bz_smsm_test_mape = mean_absolute_percentage_error(bz_smsm_y_test, bz_smsm_y_pred)
print("bz_smsm 테스트데이터 예측 mape : {}\n".format(bz_smsm_test_mape))

sp_smdm 테스트데이터 예측 mape : 21141.346166004114

bz_smsm 테스트데이터 예측 mape : 153386.70447175813



### 예측한 sp_smdm_y_pred 와 bz_smsm_y_pred 중 더 작은 값의 인덱스(0: sp_smdm, 1: bz_smsm)로 y_pred_label 생성

In [12]:
# Label each test row with the method predicted to be faster:
# 0 -> sp_smdm, 1 -> bz_smsm. Label 1 is chosen only when the bz_smsm
# prediction is strictly smaller, so ties resolve to 0.
y_pred_label = [
    1 if bz_latency < sp_latency else 0
    for sp_latency, bz_latency in zip(sp_smdm_y_pred, bz_smsm_y_pred)
]

### 예측한, sp_smdm_y_pred 와 bz_smsm_y_pred 중 작은 값으로 y_pred_latency 생성

In [13]:
# For every test row keep the smaller of the two predicted latencies,
# i.e. the predicted latency of the recommended method.
y_pred_latency = [
    min(sp_latency, bz_latency)
    for sp_latency, bz_latency in zip(sp_smdm_y_pred, bz_smsm_y_pred)
]

### 필요시, y_pred_latency 를 csv 에 추가

In [19]:
# Store the predicted latency, truncated to an integer, as a new column.
test['proposed_system_latency'] = [int(latency) for latency in y_pred_latency]

# Persist the augmented table (row index omitted).
# NOTE: this is the same path `test` was read from, so the input CSV is overwritten.
result_csv_path = '/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/dataset/split-spmm-dataset/split-spmm-dataset-using-doe/dataset/compare-recommendation-method-performance-in-real-workload.csv'
test.to_csv(result_csv_path, index=False)

### 실제값인 sp_smdm_y_test 와 bz_smsm_y_test 를 통해 y_real_label 생성

In [15]:
# Ground-truth label per test row from the measured latencies:
# 0 -> sp_smdm was faster (or tied), 1 -> bz_smsm was strictly faster.
# Rows are looked up by label 0..len(X_test)-1 via `.loc`.
y_real_label = [
    1 if test.loc[row, 'bz_smsm'] < test.loc[row, 'sp_smdm'] else 0
    for row in range(len(X_test))
]

### y_pred_label 과 y_real_label 간 accuracy 측정

In [16]:
# Fraction of test rows where the recommended method matches the truly faster one.
# scikit-learn's signature is accuracy_score(y_true, y_pred), so the ground
# truth goes first. The score is the same either way, but the order now
# matches the API.
accuracy_score(y_real_label, y_pred_label)

0.85

### 결과 dataframe 생성

In [18]:
# Show floats with five decimals in rendered tables.
pd.options.display.float_format = '{:.5f}'.format

# Inputs and measured latencies of the test set.
temp = test[['lr','lc','rc','ld','rd','lnnz','rnnz','sp_smdm','bz_smsm']]

# Predictions and labels, one column each, in display order.
prediction_columns = pd.DataFrame({
    'y_pred_label': y_pred_label,
    'sp_smdm_y_pred': sp_smdm_y_pred,
    'bz_smsm_y_pred': bz_smsm_y_pred,
    'y_real_label': y_real_label,
})

# Side-by-side view of inputs, measurements, predictions and labels.
pd.concat([temp, prediction_columns], axis=1)

Unnamed: 0,lr,lc,rc,ld,rd,lnnz,rnnz,sp_smdm,bz_smsm,y_pred_label,sp_smdm_y_pred,bz_smsm_y_pred,y_real_label
0,1134890,1134890,10,0.0,0.0,5975248,10,456,5,1,377101.125,21347.07617,1
1,1134890,1134890,10,0.0,0.0,5975248,19,453,10,1,377099.3125,21347.08594,1
2,1134890,1134890,10,0.0,0.00257,5975248,29138,458,81,1,371585.59375,21389.46289,1
3,1134890,1134890,10,0.0,0.03625,5975248,411394,473,284,1,304275.3125,21913.79297,1
4,1134890,1134890,10,0.0,0.16965,5975248,1925327,502,826,1,122033.65625,23391.07617,0
5,1134890,1134890,50,0.0,0.0,5975248,50,1961,5,1,378020.21875,21337.48438,1
6,1134890,1134890,50,0.0,0.0,5975248,223,1957,27,1,378013.78125,21337.625,1
7,1134890,1134890,50,0.0,0.00267,5975248,151632,1949,283,1,372720.03125,21459.97852,1
8,1134890,1134890,50,0.0,0.04699,5975248,2666604,2325,1981,1,292835.375,23410.45117,1
9,1134890,1134890,50,0.0,0.29687,5975248,16845840,2358,4904,1,83454.19531,27850.85547,0
