In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import ReLU
from tensorflow.keras.optimizers import Adam

In [2]:
# 1727, 191
train = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/d-optimal/d-optimal-of-spmm/train-test-csv/1727-nonsquare-train-from-1918-nonsquare-spmm-over-3s.csv')
test = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/d-optimal/d-optimal-of-spmm/train-test-csv/191-nonsquare-test-from-1918-nonsquare-spmm-over-3s.csv')

In [3]:
# feature 1개 추가

# Train + Valid
X_train = train[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc','lr*lc*rc','ld*rd','lr*rc*ld*rd','lr*lc*rc*ld*rd','lnnz*rnnz']] 
y_train = train['bz_smsm']

# Test
X_test = test[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc','lr*lc*rc','ld*rd','lr*rc*ld*rd','lr*lc*rc*ld*rd','lnnz*rnnz']] 
y_test = test['bz_smsm']

In [4]:
# 데이터 표준화(Standardization)
from sklearn.preprocessing import StandardScaler

# 변형 객체 생성
std_scaler = StandardScaler()

# 훈련데이터의 모수 분포 저장
std_scaler.fit(X_train)

# 훈련 데이터 스케일링
X_train_scaled = std_scaler.transform(X_train)

# 테스트 데이터의 스케일링
X_test_scaled = std_scaler.transform(X_test)

In [5]:
from sklearn.metrics import mean_squared_error

def mape_error(y_test, y_pred):
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

def rmse_error(y_true, y_pred):
    rmse = np.sqrt(np.mean(np.square(y_pred - y_true))) 
    return rmse

### Xgbregressor

In [6]:
import xgboost as xgb

# Train + Valid cross-validation을 거친, 최적의 하이퍼파라미터를 사용
xgbregressor_model = xgb.XGBRegressor(
objective = 'reg:squarederror',
max_depth=int(18),
learning_rate=0.04,
n_estimators=int(95),
subsample=0.5625626271955027,
reg_lambda = 0.7058132473615808,    
colsample_bytree=0.9571507504641366,
n_jobs=-1
                             )
# 모델 훈련
xgbregressor_model.fit(X_train, y_train)

XGBRegressor(colsample_bytree=0.9571507504641366, learning_rate=0.04,
             max_depth=18, n_estimators=95, n_jobs=-1,
             objective='reg:squarederror', reg_lambda=0.7058132473615808,
             subsample=0.5625626271955027)

### dnn

In [14]:
# 모델 생성
def build_model():

    model=Sequential()

    model.add(Dense(256, activation="relu", input_shape=(X_train.shape[1],)))  
    model.add(Dense(512, activation="relu"))
    model.add(Dense(128, activation="relu"))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(32, activation="relu"))
    model.add(Dense(1))
    
    optimizer = Adam(lr=0.01)
    
    model.compile(optimizer=optimizer ,
                  loss='mape',
                  metrics=['mape'])
    return model

dnn_model = build_model()

In [15]:
# 에포크가 끝날 때마다 점(.)을 출력해 훈련 진행 과정을 표시합니다
class PrintDot(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        if epoch % 100 == 0: print('')
        print('.', end='')

early_stop = keras.callbacks.EarlyStopping(monitor='val_mape', patience=150)

EPOCHS = 100000

dnn_model.fit(X_train_scaled, 
                y_train,
                epochs=EPOCHS, 
                validation_split = 0.1, 
                verbose =0, 
                callbacks=[early_stop, PrintDot()])


....................................................................................................
....................................................................................................
....................................................................................................
.........................................

<tensorflow.python.keras.callbacks.History at 0x7ff0a91a76d0>

### rfr

In [11]:
# from sklearn.ensemble import RandomForestRegressor

# rfr_model = RandomForestRegressor(
# criterion='mse',
# max_depth=18,
# min_samples_leaf=2, 
# min_samples_split=3, 
# n_estimators=600
# )

# rfr_model.fit(X_train, y_train)

RandomForestRegressor(max_depth=18, min_samples_leaf=2, min_samples_split=3,
                      n_estimators=600)

### linear regressor

### nnls

### test

In [16]:
xgbregressor_y_pred = xgbregressor_model.predict(X_test)
dnn_y_pred = dnn_model.predict(X_test_scaled).reshape(-1,)
rfr_y_pred = rfr_model.predict(X_test)

In [17]:
print(mape_error(y_test,xgbregressor_y_pred))
print(mape_error(y_test,dnn_y_pred))
print(mape_error(y_test,rfr_y_pred))

9.52215099157472
22.182128921876853
12.747918985369736


In [37]:
result_list = {}
mape_list = np.array([])

# 예측값, 실제값을 확인하며 mape 계산 후 mape_list에 삽입 
for idx,value in enumerate(y_test):
    mape_temp = {}
    median_temp = np.array([])
    
    # 각 모델의 예측값
    xgbregressor_predicate = int(xgbregressor_y_pred[idx])
    dnn_predicate = int(dnn_y_pred[idx])
    rfr_predicate = int(rfr_y_pred[idx])
    
    # 각 모델의 예측값을 배열에 삽입
    median_temp = np.append(median_temp, np.array([xgbregressor_predicate,dnn_predicate,rfr_predicate]))
    
    # median 값 사용
    mape = abs((value - np.median(median_temp)) / value) * 100
    
    # mean 값 사용
    # mape = abs((value - np.mean(median_temp)) / value) * 100
    
    mape_temp['xgb_pred'] = xgbregressor_predicate
    mape_temp['dnn_pred'] = dnn_predicate
    mape_temp['rfr_pred'] = rfr_predicate
    mape_temp['best_pred'] = np.median(median_temp)
    mape_temp['real'] = value
    mape_temp['mape'] = mape

    mape_list = np.append(mape_list,np.array([mape]))
    result_list[idx] = mape_temp

result_list_sort = sorted(result_list.values(), key=lambda x:(x['mape']), reverse=True)
result_list_sort  

[{'xgb_pred': 5567,
  'dnn_pred': 13391,
  'rfr_pred': 7628,
  'best_pred': 7628.0,
  'real': 3651,
  'mape': 108.92906053136127},
 {'xgb_pred': 44795,
  'dnn_pred': 49658,
  'rfr_pred': 55197,
  'best_pred': 49658.0,
  'real': 31007,
  'mape': 60.15093366014126},
 {'xgb_pred': 10185,
  'dnn_pred': 5257,
  'rfr_pred': 8437,
  'best_pred': 8437.0,
  'real': 18850,
  'mape': 55.241379310344826},
 {'xgb_pred': 5288,
  'dnn_pred': 4552,
  'rfr_pred': 6243,
  'best_pred': 5288.0,
  'real': 3457,
  'mape': 52.964998553659235},
 {'xgb_pred': 8465,
  'dnn_pred': 10524,
  'rfr_pred': 10182,
  'best_pred': 10182.0,
  'real': 6960,
  'mape': 46.293103448275865},
 {'xgb_pred': 21685,
  'dnn_pred': 22213,
  'rfr_pred': 22621,
  'best_pred': 22213.0,
  'real': 15340,
  'mape': 44.80443285528031},
 {'xgb_pred': 4934,
  'dnn_pred': 5870,
  'rfr_pred': 7377,
  'best_pred': 5870.0,
  'real': 4171,
  'mape': 40.7336370175018},
 {'xgb_pred': 19022,
  'dnn_pred': 15711,
  'rfr_pred': 20918,
  'best_pred': 

In [38]:
# 최적의 pred을 골라냈을 때 mape 평균
print(np.mean(mape_list))

10.31855772051985
