# **실제 분류 데이터를 활용하여 SVR학습 모델 학습**

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler,normalize
from sklearn.metrics import confusion_matrix, mean_squared_error

import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split

import warnings
warnings.filterwarnings("ignore")

**mortgage - regression 예제**

실제 분류 예제 데이터(mortgage.csv) 를 사용하여 인공신경망 모델을 학습해보겠습니다.

학습데이터를 정규화작업을 거쳐, 학습과 모델 평가를 위해 6:4의 비율로 학습데이터와 평가데이터로 나누어 보도록 보겠습니다.

또한, 예측 성능 평가를 위해 MAPE 성능지표 산출 함수를 정의하였습니다.

In [2]:
mor = pd.read_csv('./data/mortgage.csv', delimiter=',')
print(mor.shape)

(1049, 16)


In [3]:
mor

Unnamed: 0,OneMonthCDRate,OneY.CMaturityRate,ThreeM.Rate.AuctionAverage,ThreeM.Rate.SecondaryMarket,ThreeY.CMaturityRate,FiveY.CMaturityRate,BankCredit,Currency,DemandDeposits,FederalFunds,MoneyStock,CheckableDeposits,LoansLeases,SavingsDeposits,TradeCurrencies,ThirtyY.CMortgageRate
0,8.72,90.729,9.69,7.62,7.60,7.72,7.69,2605.8,223.4,279.6,8.52,794.4,564.8,2020.2,894.7,7.66
1,13.85,109.392,17.19,12.06,12.47,13.94,13.82,1347.4,124.4,230.8,14.35,443.0,314.8,1033.8,343.9,13.73
2,6.59,87.979,9.94,5.74,5.67,7.42,7.73,2280.2,198.9,287.2,6.77,755.9,551.0,1743.9,936.5,6.65
3,17.43,96.064,15.07,15.20,15.01,13.13,12.89,1237.4,115.5,241.9,18.12,410.3,291.3,933.1,377.5,14.24
4,3.16,85.121,8.21,2.97,2.94,4.93,5.85,2937.9,288.7,336.4,2.96,1012.5,715.6,2110.2,1179.5,3.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044,9.34,85.261,10.46,8.04,7.96,9.01,8.99,2428.8,211.7,288.9,8.59,787.1,569.3,1866.6,928.4,8.89
1045,7.28,83.494,9.63,6.52,6.31,7.41,7.77,2749.2,250.2,274.2,6.40,825.4,568.2,2112.1,926.2,6.71
1046,5.72,92.767,8.18,5.24,5.26,6.48,6.56,4794.8,524.7,345.6,5.62,1120.6,587.7,3532.6,1737.8,6.12
1047,8.19,91.191,10.12,7.73,7.70,8.36,8.39,2684.6,233.8,273.7,8.30,809.2,567.6,2066.7,914.8,8.05


In [4]:
mor_y = mor['ThirtyY.CMortgageRate']
mor_x = mor.drop(['ThirtyY.CMortgageRate'],axis=1)
print('X:',mor_x.shape)
print('y:',mor_y.shape)

X: (1049, 15)
y: (1049,)


- 데이터 정규화

In [5]:
scaler = StandardScaler()
scaler

StandardScaler()

In [6]:
mor_x = scaler.fit_transform(mor_x)
mor_x

array([[ 0.35491563, -0.45799122, -0.2403576 , ...,  0.09625994,
         0.0848087 , -0.16115823],
       [ 1.8746428 ,  0.83226746,  2.29560079, ..., -1.49612074,
        -1.28483455, -1.64134569],
       [-0.2760822 , -0.64811132, -0.15582566, ...,  0.00836053,
        -0.29884137, -0.04882737],
       ...,
       [-0.53381371, -0.31709495, -0.75093056, ...,  0.24212201,
         2.18481727,  2.10453902],
       [ 0.19790678, -0.42605105, -0.09496266, ...,  0.11409461,
         0.14937522, -0.10714267],
       [ 0.79631787,  2.46405097,  1.25078593, ..., -1.0642671 ,
        -0.91132073, -0.7187815 ]])

In [7]:
mor_train_x, mor_test_x, mor_train_y, mor_test_y = train_test_split(mor_x, mor_y, test_size=0.4, random_state=0)

---

- 실제값 대비 잔차가 얼마나 좋아졌는지 확인

$$MAPE = \frac{100\%}{n} \sum^{n}_{t=1}|\frac{A_t-F_t}{A_t}|$$

$A_t$: actual value

$F_t$: forecast value

In [8]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

---

### 단순 Multiple Linear Regression 학습 모델과의 비교

SVM regression과 단순한 MLR(Multiple Linear Regression)의 성능 비교를 위해 MLR학습 모델을 정의하였습니다.

In [9]:
#  package
from sklearn.linear_model import LinearRegression

# model/fit
mor_regression = LinearRegression()
mor_regression.fit(mor_train_x,mor_train_y)

# prediction
mor_regression_pred = mor_regression.predict(mor_test_x)

# eval
mor_regression_mape = mean_absolute_percentage_error(mor_test_y,mor_regression_pred)
mor_regression_rmse = np.sqrt(mean_squared_error(y_true=mor_test_y, y_pred=mor_regression_pred))

print('==== mortgage - regression ================')
print('MAPE:{}'.format(mor_regression_mape))
print('===========================================')

MAPE:1.1532713612507317


---

### Neural Network Regression 학습 모델과의 비교

SVM regression과 ANN regression의 성능 비교를 위해 모델을 정의하였습니다. 

기본적인 학습 파라미터인 'hidden node'와 'max_iteration'을 cross-valdation을 통해서 최적화를 진행하였습니다.

In [10]:
### Neural Network model training
from sklearn.neural_network import MLPRegressor
def NN_CV_regresser(train_x, train_y, max_nodes,CV_N ,plot=False):
    
    #1부터 10까지의 neighbor에 대해 최적의 k를 cross-validation을 이용하여 찾아냄
    parameters = {'hidden_layer_sizes': np.arange(start=1, stop=max_nodes,step=3).tolist(),
                  'max_iter': [300,500]}
    
    clf = MLPRegressor(learning_rate_init=0.01)
    
    grid_clf = GridSearchCV(clf, parameters,cv=CV_N)

    grid_clf.fit(train_x,train_y)

    print( grid_clf.best_params_)
    
    optimal_parameters = list(grid_clf.best_params_.values())

    print("The optimal number of hidden nodes : {}\n& max iteration : {}".format(optimal_parameters[0],optimal_parameters[1]))

    opt_NN = MLPRegressor(hidden_layer_sizes=grid_clf.best_params_['hidden_layer_sizes'],
                          max_iter=grid_clf.best_params_['max_iter'])
    opt_NN.fit(X=train_x, y=train_y)

    return opt_NN

In [11]:
mor_optNNR=NN_CV_regresser(train_x=mor_train_x, train_y=mor_train_y,max_nodes=16, CV_N=3)
mor_optNNR_pred = mor_optNNR.predict(X=mor_test_x)

{'hidden_layer_sizes': 10, 'max_iter': 300}
The optimal number of hidden nodes : 10
& max iteration : 300


In [12]:
mor_nnr_mape = mean_absolute_percentage_error(mor_test_y,mor_optNNR_pred)
mor_nnr_rmse = np.sqrt(mean_squared_error(y_true=mor_test_y, y_pred=mor_optNNR_pred))

print('==== mortgage - NN regression ================')
print('MAPE:{}'.format(mor_nnr_mape))
print('==============================================')


MAPE:9.051088545619436


---

### Kernel SVR

In [14]:
def CV_SVR_classifier(train_x, train_y,CV_n):
    parameters =  {'C':[0.01,0.05,0.1,1,5,10,],'gamma':[0.01,0.02,0.08,0.1,1], 'epsilon':[0.01,0.1,1]}
    print("매개변수 그리드:\n{}".format(parameters))
 
    # RBF SVR instance 생성
    clf = SVR(kernel='rbf')

    # Grid RBF SVR instance 생성
    grid_clf = GridSearchCV(clf, parameters, cv=CV_n)
    
    # cross-validation
    grid_clf.fit(train_x,train_y)

    print(grid_clf.best_params_)
    

    print("The optimal 'C' : {}\nThe optimal 'epsilon' : {}\nThe optimal 'Gamma' : {}".format(grid_clf.best_params_['C'],
                                                                                              grid_clf.best_params_['epsilon'],
                                                                                              grid_clf.best_params_['gamma']))
    # using best params
    opt_SVR = SVR(C=grid_clf.best_params_['C'],
                  epsilon=grid_clf.best_params_['epsilon'],
                  gamma=grid_clf.best_params_['gamma'])
    opt_SVR.fit(X=train_x, y=train_y)
    
    return opt_SVR

In [15]:
mor_opt_SVR = CV_SVR_classifier(train_x=mor_train_x, train_y=mor_train_y,CV_n=10)
mor_opt_SVR

매개변수 그리드:
{'C': [0.01, 0.05, 0.1, 1, 5, 10], 'gamma': [0.01, 0.02, 0.08, 0.1, 1], 'epsilon': [0.01, 0.1, 1]}
{'C': 10, 'epsilon': 0.01, 'gamma': 0.1}
The optimal 'C' : 10
The optimal 'epsilon' : 0.01
The optimal 'Gamma' : 0.1


SVR(C=10, epsilon=0.01, gamma=0.1)

In [15]:
mor_opt_SVR_pred = mor_opt_SVR.predict(X=mor_test_x)

In [16]:
mor_svr_mape = mean_absolute_percentage_error(mor_test_y,mor_opt_SVR_pred)
mor_svr_rmse = np.sqrt(mean_squared_error(y_true=mor_test_y, y_pred=mor_opt_SVR_pred))

print('==== mortgage - SVM regression ================')
print('MAPE:{}'.format(round(mor_svr_mape,4)))
print('===============================================')


MAPE:0.6545


---


- **세가지 모델('MLR', 'ANN', 'SVR')의 예측 성능 비교**



In [17]:
pd.DataFrame([[mor_regression_rmse, mor_regression_mape],
 [mor_nnr_rmse,mor_nnr_mape],
 [mor_svr_rmse,mor_svr_mape]], index=['L_reg','NN','SVR'], columns=['RMSE','MAPE'])

Unnamed: 0,RMSE,MAPE
L_reg,0.114446,1.153271
NN,0.520955,6.848234
SVR,0.073614,0.65449
