In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import ReLU
from tensorflow.keras.optimizers import Adam

# Row counts per the file names: 1727 training rows, 191 test rows.
# Both CSVs live in the same directory; edit DATA_DIR to run on another machine.
DATA_DIR = '/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/d-optimal/d-optimal-of-spmm/train-test-csv'
TRAIN_CSV = '1727-nonsquare-train-from-1918-nonsquare-spmm-over-3s.csv'
TEST_CSV = '191-nonsquare-test-from-1918-nonsquare-spmm-over-3s.csv'

train = pd.read_csv(DATA_DIR + '/' + TRAIN_CSV)
test = pd.read_csv(DATA_DIR + '/' + TEST_CSV)

In [2]:
# Feature list (original note: one feature was added to this set).
# Declared once so the train and test frames expose the same columns in the same order.
FEATURE_COLUMNS = [
    'lr', 'lc', 'rc', 'ld', 'rd', 'lnnz', 'rnnz',
    'lr*lc', 'lc*rc', 'lr*rc', 'lr*lc*rc',
    'ld*rd', 'lr*rc*ld*rd', 'lr*lc*rc*ld*rd',
    'lnnz*rnnz',
]

# Train + Valid: features and the two regression targets
X_train = train[FEATURE_COLUMNS]
sp_smdm_y_train = train['sp_smdm']
bz_smsm_y_train = train['bz_smsm']

# Test: same features and targets from the held-out file
X_test = test[FEATURE_COLUMNS]
sp_smdm_y_test = test['sp_smdm']
bz_smsm_y_test = test['bz_smsm']

In [3]:
# Standardize the features
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only;
# fit() returns the fitted scaler, so creation and fitting are chained.
std_scaler = StandardScaler().fit(X_train)

# Scaled training features (this is what the DNNs are trained on)
X_train_scaled = std_scaler.transform(X_train)

# The test features are not scaled in this cell: the prediction helpers
# call std_scaler.transform on the features they receive.

In [4]:
from sklearn.metrics import mean_squared_error

def mape_error(y_test, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both arguments are converted to NumPy arrays. The element-wise error is
    divided by ``y_test``, its absolute value is averaged, and the mean is
    multiplied by 100.
    """
    actual = np.array(y_test)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

def rmse_error(y_true, y_pred):
    """Return the root-mean-square error between ``y_true`` and ``y_pred``.

    Both arguments are converted to NumPy arrays first (as ``mape_error`` does),
    so plain Python lists are accepted and pandas objects are compared by
    position instead of being aligned on their index labels.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.sqrt(np.mean(np.square(y_pred - y_true)))

# sp_smdm

### xgbregressor

In [5]:
import xgboost as xgb

# Hyperparameters selected as best by cross-validation over the Train + Valid data
sp_smdm_xgb_params = {
    'objective': 'reg:squarederror',
    'max_depth': 15,
    'learning_rate': 0.03043527433312202,
    'n_estimators': 117,
    'subsample': 0.7657624341353388,
    'reg_lambda': 0.6647400190467575,
    'min_child_weight': 0.9475568411577447,
    'n_jobs': -1,
}
sp_smdm_xgbregressor_model = xgb.XGBRegressor(**sp_smdm_xgb_params)

# Train the model on the unscaled training features
sp_smdm_xgbregressor_model.fit(X_train, sp_smdm_y_train)

XGBRegressor(learning_rate=0.03043527433312202, max_depth=15,
             min_child_weight=0.9475568411577447, n_estimators=117, n_jobs=-1,
             objective='reg:squarederror', reg_lambda=0.6647400190467575,
             subsample=0.7657624341353388)

### dnn

In [6]:
# Build the model
def build_model():
    """Build and compile the fully connected regressor used for sp_smdm.

    The network stacks ReLU layers of 256, 128, 64 and 32 units followed by a
    single-unit output layer with no activation. The input width is read from
    ``X_train``. The model is compiled with Adam (learning rate 0.01) and uses
    MAPE both as the loss and as the reported metric.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    model = Sequential([
        Dense(256, activation="relu", input_shape=(X_train.shape[1],)),
        Dense(128, activation="relu"),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1),
    ])

    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras releases no longer accept.
    optimizer = Adam(learning_rate=0.01)

    model.compile(optimizer=optimizer,
                  loss='mape',
                  metrics=['mape'])
    return model

sp_smdm_dnn_model = build_model()

In [7]:
# Print a dot (.) at the end of every epoch to show training progress
class PrintDot(keras.callbacks.Callback):
    """Callback that prints one dot per epoch and a line break every 100 epochs."""

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None to match the base Callback signature; it is not used.
        if epoch % 100 == 0:
            print('')
        print('.', end='')

# Stop once the validation MAPE has gone 150 epochs without improving.
early_stop = keras.callbacks.EarlyStopping(monitor='val_mape', patience=150)

# Upper bound on the number of epochs; early stopping normally ends training first.
EPOCHS = 100000

# Train on the standardized features, holding out 10% of the rows for validation.
sp_smdm_dnn_model.fit(X_train_scaled,
                      sp_smdm_y_train,
                      epochs=EPOCHS,
                      validation_split=0.1,
                      verbose=0,
                      callbacks=[early_stop, PrintDot()])


....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
................................

<tensorflow.python.keras.callbacks.History at 0x7fd132a6e910>

### linear regressor

In [8]:
from sklearn.linear_model import LinearRegression

# Linear-regression member of the sp_smdm ensemble, fitted on the unscaled training features
sp_smdm_linear_model = LinearRegression()
sp_smdm_linear_model.fit(X=X_train, y=sp_smdm_y_train)

LinearRegression()

### X_test를 입력 받았을 때, ensemble 모델을 사용해 최종적인 sp_smdm y_pred 반환

In [9]:
def sp_smdm_pred(X_test):
    """Predict sp_smdm for ``X_test`` as the row-wise median of three models.

    The XGBoost and linear models receive ``X_test`` as given; the DNN receives
    it after ``std_scaler.transform``. The three predictions are placed side by
    side and the median of each row is returned as a 1-D NumPy array.
    """
    scaled_features = std_scaler.transform(X_test)

    # One column per model; each row corresponds to one row of X_test
    per_model = pd.DataFrame({
        'xgb': sp_smdm_xgbregressor_model.predict(X_test),
        'dnn': sp_smdm_dnn_model.predict(scaled_features).reshape(-1,),
        'linear': sp_smdm_linear_model.predict(X_test),
    })

    # Final prediction: the median of the three model outputs for each row
    return per_model.median(axis=1).to_numpy()
    

In [24]:
# Final sp_smdm prediction array, cast to int64 (the cast drops the fractional part)
sp_smdm_y_pred = sp_smdm_pred(X_test).astype(np.int64)

# bz_smsm

### xgbregressor

In [11]:
import xgboost as xgb

# Hyperparameters selected as best by cross-validation over the Train + Valid data
bz_smsm_xgb_params = {
    'objective': 'reg:squarederror',
    'max_depth': 18,
    'learning_rate': 0.04,
    'n_estimators': 95,
    'subsample': 0.5625626271955027,
    'reg_lambda': 0.7058132473615808,
    'colsample_bytree': 0.9571507504641366,
    'n_jobs': -1,
}
bz_smsm_xgbregressor_model = xgb.XGBRegressor(**bz_smsm_xgb_params)

# Train the model on the unscaled training features
bz_smsm_xgbregressor_model.fit(X_train, bz_smsm_y_train)

XGBRegressor(colsample_bytree=0.9571507504641366, learning_rate=0.04,
             max_depth=18, n_estimators=95, n_jobs=-1,
             objective='reg:squarederror', reg_lambda=0.7058132473615808,
             subsample=0.5625626271955027)

### dnn

In [12]:
# Build the model
# NOTE: this redefines build_model from the sp_smdm section with one extra hidden layer.
def build_model():
    """Build and compile the fully connected regressor used for bz_smsm.

    The network stacks ReLU layers of 256, 512, 128, 64 and 32 units followed
    by a single-unit output layer with no activation. The input width is read
    from ``X_train``. The model is compiled with Adam (learning rate 0.01) and
    uses MAPE both as the loss and as the reported metric.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    model = Sequential([
        Dense(256, activation="relu", input_shape=(X_train.shape[1],)),
        Dense(512, activation="relu"),
        Dense(128, activation="relu"),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1),
    ])

    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras releases no longer accept.
    optimizer = Adam(learning_rate=0.01)

    model.compile(optimizer=optimizer,
                  loss='mape',
                  metrics=['mape'])
    return model

bz_smsm_dnn_model = build_model()

In [13]:
# Print a dot (.) at the end of every epoch to show training progress
# (defined again here so this cell can run without the sp_smdm training cell).
class PrintDot(keras.callbacks.Callback):
    """Callback that prints one dot per epoch and a line break every 100 epochs."""

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None to match the base Callback signature; it is not used.
        if epoch % 100 == 0:
            print('')
        print('.', end='')

# Stop once the validation MAPE has gone 150 epochs without improving.
early_stop = keras.callbacks.EarlyStopping(monitor='val_mape', patience=150)

# Upper bound on the number of epochs; early stopping normally ends training first.
EPOCHS = 100000

# Train on the standardized features, holding out 10% of the rows for validation.
bz_smsm_dnn_model.fit(X_train_scaled,
                      bz_smsm_y_train,
                      epochs=EPOCHS,
                      validation_split=0.1,
                      verbose=0,
                      callbacks=[early_stop, PrintDot()])


....................................................................................................
....................................................................................................
...................................................................................................

<tensorflow.python.keras.callbacks.History at 0x7fd139ba7ad0>

### linear regressor

In [14]:
from sklearn.linear_model import LinearRegression

# Linear-regression member of the bz_smsm ensemble, fitted on the unscaled training features
bz_smsm_linear_model = LinearRegression()
bz_smsm_linear_model.fit(X=X_train, y=bz_smsm_y_train)

LinearRegression()

### X_test를 입력 받았을 때, ensemble 모델을 사용해 최종적인 bz_smsm y_pred 반환

In [15]:
def bz_smsm_pred(X_test):
    """Predict bz_smsm for ``X_test`` as the row-wise median of three models.

    The XGBoost and linear models receive ``X_test`` as given; the DNN receives
    it after ``std_scaler.transform``. The three predictions are placed side by
    side and the median of each row is returned as a 1-D NumPy array.
    """
    scaled_features = std_scaler.transform(X_test)

    # One column per model; each row corresponds to one row of X_test
    per_model = pd.DataFrame({
        'xgb': bz_smsm_xgbregressor_model.predict(X_test),
        'dnn': bz_smsm_dnn_model.predict(scaled_features).reshape(-1,),
        'linear': bz_smsm_linear_model.predict(X_test),
    })

    # Final prediction: the median of the three model outputs for each row
    return per_model.median(axis=1).to_numpy()
    

In [23]:
# Final bz_smsm prediction array, cast to int64 (the cast drops the fractional part)
bz_smsm_y_pred = bz_smsm_pred(X_test).astype(np.int64)

### sp_smdm_y_pred와 bz_smsm_y_pred를 사용해, y_pred_label 생성

In [17]:
# Predicted label per test row: index of the smaller predicted value,
# 0 = sp_smdm, 1 = bz_smsm. A tie yields 0, because 1 is chosen only when
# the bz_smsm prediction is strictly smaller.
y_pred_label = [
    int(bz_smsm_y_pred[row] < sp_smdm_y_pred[row])
    for row in range(len(X_test))
]

### 실제 sp_smdm과 bz_smsm을 통해, y_real_label 생성

In [19]:
# Actual label per test row: index of the smaller measured value,
# 0 = sp_smdm, 1 = bz_smsm. A tie yields 0, because 1 is chosen only when
# the measured bz_smsm value is strictly smaller.
y_real_label = [
    int(test.loc[row, 'bz_smsm'] < test.loc[row, 'sp_smdm'])
    for row in range(len(X_test))
]

### y_pred_label과 y_real_label 간 accuracy 측정

In [20]:
from sklearn.metrics import accuracy_score

# accuracy_score takes (y_true, y_pred); pass the arguments by name so the
# ground-truth labels and the predicted labels cannot be swapped.
accuracy_score(y_true=y_real_label, y_pred=y_pred_label)

0.9427083333333334

### 결과 dataframe 생성

In [25]:
# Base input columns and the two measured targets for every test row
temp = test[['lr','lc','rc','ld','rd','lnnz','rnnz','sp_smdm','bz_smsm']]

# Predicted label, the two predicted values and the actual label, in display order
prediction_columns = pd.DataFrame({
    'y_pred_label': y_pred_label,
    'sp_smdm_y_pred': sp_smdm_y_pred,
    'bz_smsm_y_pred': bz_smsm_y_pred,
    'y_real_label': y_real_label,
})

# Side-by-side view of measurements and predictions (last expression = cell output)
pd.concat([temp, prediction_columns], axis=1)

Unnamed: 0,lr,lc,rc,ld,rd,lnnz,rnnz,sp_smdm,bz_smsm,y_pred_label,sp_smdm_y_pred,bz_smsm_y_pred,y_real_label
0,131403,43733,7722,0.000084,0.10,484397,33774444,29621,34386,0,29454,36816,0
1,15728,31500,45522,0.008605,0.03,4263462,43019639,247887,116536,1,201186,129839,1
2,32231,80720,577,0.001605,0.30,4176859,13985604,3975,6960,0,4333,7006,0
3,33895,53317,26714,0.003706,0.01,6696942,14243336,280837,92404,1,264359,92275,1
4,124226,83003,2310,0.003706,0.23,38209922,44118447,312134,224158,1,280096,222585,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,26143,60856,21307,0.001639,0.01,2608393,12967105,88332,37055,1,86052,33104,1
188,2629,50352,42240,0.001825,0.03,241666,63807645,40930,12841,1,32018,14190,1
189,149605,87549,749,0.000319,0.15,4183568,9836644,10982,17169,0,9259,15255,0
190,44373,7434,41589,0.000563,0.13,185611,40196551,40966,86949,0,36936,74643,0
