In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import ReLU
from tensorflow.keras.optimizers import Adam

In [2]:
# 1727, 191
train = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/d-optimal/d-optimal-of-spmm/train-test-csv/1727-nonsquare-train-from-1918-nonsquare-spmm-over-3s.csv')
test = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/d-optimal/d-optimal-of-spmm/train-test-csv/191-nonsquare-test-from-1918-nonsquare-spmm-over-3s.csv')

In [3]:
# feature 1개 추가

# Train + Valid
X_train = train[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc','lr*lc*rc','ld*rd','lr*rc*ld*rd','lr*lc*rc*ld*rd','lnnz*rnnz']] 
y_train = train['sp_smdm']

# Test
X_test = test[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc','lr*lc*rc','ld*rd','lr*rc*ld*rd','lr*lc*rc*ld*rd','lnnz*rnnz']] 
y_test = test['sp_smdm']

In [4]:
# 데이터 표준화(Standardization)
from sklearn.preprocessing import StandardScaler

# 변형 객체 생성
std_scaler = StandardScaler()

# 훈련데이터의 모수 분포 저장
std_scaler.fit(X_train)

# 훈련 데이터 스케일링
X_train_scaled = std_scaler.transform(X_train)

# 테스트 데이터의 스케일링
X_test_scaled = std_scaler.transform(X_test)

In [5]:
from sklearn.metrics import mean_squared_error

def mape_error(y_test, y_pred):
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

def rmse_error(y_true, y_pred):
    rmse = np.sqrt(np.mean(np.square(y_pred - y_true))) 
    return rmse

### Xgbregressor

In [6]:
import xgboost as xgb

# Train + Valid cross-validation을 거친, 최적의 하이퍼파라미터를 사용
xgbregressor_model = xgb.XGBRegressor(
objective = 'reg:squarederror',
max_depth=int(15),
learning_rate=0.03043527433312202,
n_estimators=int(117),
subsample=0.7657624341353388,
reg_lambda = 0.6647400190467575,    
min_child_weight = 0.9475568411577447,
n_jobs=-1
                             )
# 모델 훈련
xgbregressor_model.fit(X_train, y_train)

XGBRegressor(learning_rate=0.03043527433312202, max_depth=15,
             min_child_weight=0.9475568411577447, n_estimators=117, n_jobs=-1,
             objective='reg:squarederror', reg_lambda=0.6647400190467575,
             subsample=0.7657624341353388)

### dnn

In [7]:
# 모델 생성
def build_model():

    model=Sequential()

    model.add(Dense(256, activation="relu", input_shape=(X_train.shape[1],)))  
    model.add(Dense(128, activation="relu"))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(32, activation="relu"))
    model.add(Dense(1))
    
    optimizer = Adam(lr=0.01)
    
    model.compile(optimizer=optimizer ,
                  loss='mape',
                  metrics=['mape'])
    return model

dnn_model = build_model()

In [8]:
# 에포크가 끝날 때마다 점(.)을 출력해 훈련 진행 과정을 표시합니다
class PrintDot(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        if epoch % 100 == 0: print('')
        print('.', end='')

early_stop = keras.callbacks.EarlyStopping(monitor='val_mape', patience=150)

EPOCHS = 100000

dnn_model.fit(X_train_scaled, 
                y_train,
                epochs=EPOCHS, 
                validation_split = 0.1, 
                verbose =0, 
                callbacks=[early_stop, PrintDot()])


....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
.....................................................................

<tensorflow.python.keras.callbacks.History at 0x7fb9dee53d10>

### rfr

In [9]:
from sklearn.ensemble import RandomForestRegressor

rfr_model = RandomForestRegressor(
criterion='mse',
max_depth=18,
min_samples_leaf=2, 
min_samples_split=4, 
n_estimators=200
)

rfr_model.fit(X_train, y_train)

RandomForestRegressor(max_depth=18, min_samples_leaf=2, min_samples_split=4,
                      n_estimators=200)

### test

In [10]:
xgbregressor_y_pred = xgbregressor_model.predict(X_test)
dnn_y_pred = dnn_model.predict(X_test_scaled).reshape(-1,)
rfr_y_pred = rfr_model.predict(X_test)

In [11]:
print(mape_error(y_test,xgbregressor_y_pred))
print(mape_error(y_test,dnn_y_pred))
print(mape_error(y_test,rfr_y_pred))

13.194099023973507
11.117865289845778
17.48727069652112


In [12]:
result_list = {}
mape_list = np.array([])

# 예측값, 실제값을 확인하며 mape 계산 후 mape_list에 삽입 
for idx,value in enumerate(y_test):
    mape_temp = {}
    median_temp = np.array([])
    
    # 각 모델의 예측값
    xgbregressor_predicate = int(xgbregressor_y_pred[idx])
    dnn_predicate = int(dnn_y_pred[idx])
    rfr_predicate = int(rfr_y_pred[idx])
    
    # 각 모델의 예측값을 배열에 삽입
    median_temp = np.append(median_temp, np.array([xgbregressor_predicate,dnn_predicate,rfr_predicate]))
    
    # median 값 사용
    mape = abs((value - np.median(median_temp)) / value) * 100
    
    # mean 값 사용
    # mape = abs((value - np.mean(median_temp)) / value) * 100
    
    mape_temp['xgb_pred'] = xgbregressor_predicate
    mape_temp['dnn_pred'] = dnn_predicate
    mape_temp['rfr_pred'] = rfr_predicate
    mape_temp['best_pred'] = np.median(median_temp)
    mape_temp['real'] = value
    mape_temp['mape'] = mape

    mape_list = np.append(mape_list,np.array([mape]))
    result_list[idx] = mape_temp

result_list_sort = sorted(result_list.values(), key=lambda x:(x['mape']), reverse=True)
result_list_sort  

[{'xgb_pred': 14837,
  'dnn_pred': 15885,
  'rfr_pred': 16789,
  'best_pred': 15885.0,
  'real': 7017,
  'mape': 126.37879435656262},
 {'xgb_pred': 12403,
  'dnn_pred': 7393,
  'rfr_pred': 15787,
  'best_pred': 12403.0,
  'real': 5957,
  'mape': 108.20882994796037},
 {'xgb_pred': 20997,
  'dnn_pred': 12728,
  'rfr_pred': 22856,
  'best_pred': 20997.0,
  'real': 11563,
  'mape': 81.58782322926577},
 {'xgb_pred': 5254,
  'dnn_pred': 5567,
  'rfr_pred': 7102,
  'best_pred': 5567.0,
  'real': 17398,
  'mape': 68.00206920335671},
 {'xgb_pred': 4188,
  'dnn_pred': 5646,
  'rfr_pred': 4883,
  'best_pred': 4883.0,
  'real': 3060,
  'mape': 59.57516339869281},
 {'xgb_pred': 58171,
  'dnn_pred': 77349,
  'rfr_pred': 62122,
  'best_pred': 62122.0,
  'real': 39889,
  'mape': 55.73717064854973},
 {'xgb_pred': 5910,
  'dnn_pred': 4104,
  'rfr_pred': 7052,
  'best_pred': 5910.0,
  'real': 3975,
  'mape': 48.679245283018865},
 {'xgb_pred': 24680,
  'dnn_pred': 24535,
  'rfr_pred': 27519,
  'best_pred'

In [13]:
# 최적의 pred을 골라냈을 때 mape 평균
print(np.mean(mape_list))

12.687710762568686
