In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Labeled-ML.csv')
data = data.drop(['Unnamed: 0', 'Date'], axis=1).drop([0], axis=0)
data = data.reset_index(drop=True)

In [5]:
data.shape

(5876, 15)

👆🏻 總共有5876筆資料

In [3]:
data.head()

Unnamed: 0,Day,Month,Year,Price,10D Return,250roll_mean,250roll_std,state,Invest,In_Time,In_Price,Fixed_In,Fixed_Price,D_In,D_Price
0,6,1,1997,12.13,0.011676,0.01464,0.004191,mid,0,0,,1,12.13,0,
1,7,1,1997,12.12,0.013378,0.014219,0.003052,mid,0,0,,0,,0,
2,8,1,1997,12.17,0.01841,0.015267,0.003256,mid,0,0,,0,,0,
3,9,1,1997,12.2,0.020921,0.016398,0.003787,high,0,0,,0,,0,
4,10,1,1997,12.08,0.005828,0.014636,0.005486,low,0,0,,0,,0,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5876 entries, 0 to 5875
Data columns (total 15 columns):
Day             5876 non-null int64
Month           5876 non-null int64
Year            5876 non-null int64
Price           5876 non-null float64
10D Return      5876 non-null float64
250roll_mean    5876 non-null float64
250roll_std     5876 non-null float64
state           5876 non-null object
Invest          5876 non-null int64
In_Time         5876 non-null int64
In_Price        47 non-null float64
Fixed_In        5876 non-null int64
Fixed_Price     280 non-null float64
D_In            5876 non-null int64
D_Price         73 non-null float64
dtypes: float64(7), int64(7), object(1)
memory usage: 688.7+ KB


👆🏻 其中有3個欄位有遺失值，分別是 `In_Price`、`Fixed_Price`、`D_Price`

In [8]:
data = data.drop(['In_Price', 'Fixed_Price', 'D_Price'], axis=1)

In [9]:
data.head()

Unnamed: 0,Day,Month,Year,Price,10D Return,250roll_mean,250roll_std,state,Invest,In_Time,Fixed_In,D_In
0,6,1,1997,12.13,0.011676,0.01464,0.004191,mid,0,0,1,0
1,7,1,1997,12.12,0.013378,0.014219,0.003052,mid,0,0,0,0
2,8,1,1997,12.17,0.01841,0.015267,0.003256,mid,0,0,0,0
3,9,1,1997,12.2,0.020921,0.016398,0.003787,high,0,0,0,0
4,10,1,1997,12.08,0.005828,0.014636,0.005486,low,0,0,0,0


👆🏻 刪除有遺失值的欄位

In [31]:
data['state'].unique()

array(['mid', 'high', 'low', 'extremely low', 'very low', 'Buy',
       'very high', 'extremely high'], dtype=object)

👆🏻 總共有8種標籤

---

In [137]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [125]:
data.columns

Index(['Day', 'Month', 'Year', 'Price', '10D Return', '250roll_mean',
       '250roll_std', 'state', 'Invest', 'In_Time', 'Fixed_In', 'D_In'],
      dtype='object')

In [126]:
X = data[['Day', 'Month', 'Year', 'Price', '10D Return', '250roll_mean', '250roll_std','Invest', 'In_Time', 'Fixed_In', 'D_In']]
y = data['state']

👆🏻 分X跟y

In [127]:
print('X：', X.shape)
print('y：', y.shape)

X： (5876, 11)
y： (5876,)


In [128]:
5876/10*8

4700.8

In [129]:
X_train = X.iloc[:4701,:].values
X_test = X.iloc[4701:,:].values
y_train = y.iloc[:4701]
y_test = y.iloc[4701:]

👆🏻 分割測試集與訓練集，採 8:2

In [130]:
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

👆🏻 將x進行標準化

#### kernel = linear

In [141]:
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train_std, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [142]:
y_pred = svm.predict(X_test_std)

In [143]:
accuracy_score(y_test, y_pred)

0.9685106382978723

👆🏻 在`kernel = linear`時，模型準確率為 0.97

In [145]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[  9,   0,   0,   0,   0,   0,   0,   0],
       [  0,   1,   0,   0,   0,   0,   1,   0],
       [  0,   0,  31,   0,   0,   0,   0,   0],
       [  0,   0,   0, 132,   0,   2,   1,   0],
       [  0,   0,   0,   0,  82,   9,   0,   0],
       [  0,   0,   0,   1,   0, 849,   0,   0],
       [  0,   0,   0,   7,   0,   0,  17,   0],
       [  0,   0,  12,   0,   4,   0,   0,  17]], dtype=int64)

👆🏻 在`kernel = linear`的混淆矩陣

#### kernel = rbf

In [146]:
svm = SVC(kernel='rbf', probability=True)
svm.fit(X_train_std, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [147]:
y_pred = svm.predict(X_test_std)

In [148]:
accuracy_score(y_test, y_pred)

0.8493617021276596

👆🏻 在`kernel = rbf`時，模型準確率為 0.85

In [149]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[  8,   0,   0,   0,   0,   0,   0,   1],
       [  0,   0,   0,   1,   0,   0,   0,   1],
       [  0,   0,   2,   0,  15,   0,   0,  14],
       [  0,   0,   0,  93,   0,  42,   0,   0],
       [  0,   0,   0,   0,  41,  50,   0,   0],
       [  0,   0,   0,   2,   0, 848,   0,   0],
       [  0,   0,   0,  24,   0,   0,   0,   0],
       [  0,   0,   1,   0,  26,   0,   0,   6]], dtype=int64)

👆🏻 在`kernel = rbf`的混淆矩陣