# 利用台股VIX指數估計台指期的報酬
# 其中也探討VIX高低和台指期交易量

# 載入模組
## 套件需求: pandas，sklearn中的RandomForest分類、回歸、交叉驗證。

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

import pandas as pd

# 讀取檔案
## 資料來源於TEJ
## 台指期貨標的VIX指數 & 台指期

## 期間為:2017/6/19~2018/6/15 將資料切為測試與訓練資料，
## 測試資料為2017/6/19到2018/6/15。訓練資料為2018年6/15倒回去2017年6/19。
## 並修改訓練與測試資料欄位名稱為: VIX、VIX日期、VIX開盤價、VIX最高價、VIX最低價、VIX收盤價、台指期名稱、台指期日期、台指期報酬、台指期收盤、台指期成交量。

In [2]:
test = pd.read_csv("test.csv", error_bad_lines=False)
train = pd.read_csv("train.csv", error_bad_lines=False)
#submit = pd.read_csv('gender_submission.csv')

# 改欄位名

In [3]:
train.columns=['vix','vix_date','vix_open','vix_high','vix_low','vix_close','future','fu_date','fu_ret','fu_colse','fu_vol']


# 查看資料型態

In [4]:
train.head()

Unnamed: 0,vix,vix_date,vix_open,vix_high,vix_low,vix_close,future,fu_date,fu_ret,fu_colse,fu_vol
0,TXO_N5,2018/6/15,14.21,14.9,13.81,14.13,TX201806 ??? 2018/06,2018/6/15,0.6281,11055,145449
1,TXO_N5,2018/6/14,14.31,14.35,13.87,14.21,TX201807 ??? 2018/07,2018/6/15,0.6052,10808,24657
2,TXO_N5,2018/6/13,14.19,14.36,14.01,14.01,TX201806 ??? 2018/06,2018/6/14,-1.4445,10985,143124
3,TXO_N5,2018/6/12,14.49,14.55,14.15,14.42,TX201807 ??? 2018/07,2018/6/14,-1.3954,10742,11364
4,TXO_N5,2018/6/11,13.86,14.45,13.8,14.45,TX201806 ??? 2018/06,2018/6/13,0.0449,11146,114215


In [5]:
test.head()

Unnamed: 0,vix,vix_date,vix_open,vix_high,vix_low,vix_close,future,fu_date,fu_ret,fu_close,fu_volum
0,TXO_N5,20170619,10.32,11.02,9.96,11.02,TX201803 ??? 2018/03,20170619,1.0289,9819,168204
1,TXO_N5,20170620,9.97,10.04,9.71,9.86,TX201803 ??? 2018/03,20170620,1.049,9922,184342
2,TXO_N5,20170621,9.88,10.13,9.82,9.93,TX201803 ??? 2018/03,20170621,0.1613,9938,180584
3,TXO_N5,20170622,10.3,10.38,9.78,10.02,TX201803 ??? 2018/03,20170622,0.6239,10002,121899
4,TXO_N5,20170623,9.9,10.4,9.8,10.4,TX201803 ??? 2018/03,20170623,-0.12,9988,105120


## 查看資料型態，探段資料筆數以及型態
## 若有不同需要去遺失值等資料處理

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 936 entries, 0 to 935
Data columns (total 11 columns):
vix          455 non-null object
vix_date     455 non-null object
vix_open     455 non-null float64
vix_high     455 non-null float64
vix_low      455 non-null float64
vix_close    455 non-null float64
future       936 non-null object
fu_date      936 non-null object
fu_ret       927 non-null float64
fu_colse     936 non-null int64
fu_vol       936 non-null int64
dtypes: float64(5), int64(2), object(4)
memory usage: 80.5+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 11 columns):
vix          247 non-null object
vix_date     247 non-null int64
vix_open     247 non-null float64
vix_high     247 non-null float64
vix_low      247 non-null float64
vix_close    247 non-null float64
future       247 non-null object
fu_date      247 non-null int64
fu_ret       247 non-null float64
fu_close     247 non-null int64
fu_volum     247 non-null int64
dtypes: float64(5), int64(4), object(2)
memory usage: 21.3+ KB


## 查看資料分佈，以方便之後過濾極值

In [8]:
train.describe()

Unnamed: 0,vix_open,vix_high,vix_low,vix_close,fu_ret,fu_colse,fu_vol
count,455.0,455.0,455.0,455.0,927.0,936.0,936.0
mean,12.448879,12.922462,11.987055,12.365033,0.064679,10555.685897,37793.850427
std,3.3,4.318183,2.950731,3.198237,0.729217,279.185047,59756.884565
min,8.08,8.19,7.73,7.82,-4.9954,9819.0,17.0
25%,10.25,10.455,9.945,10.14,-0.31915,10335.75,279.75
50%,11.48,11.91,11.16,11.53,0.117,10529.5,1749.5
75%,13.715,14.015,13.27,13.805,0.4971,10761.5,89802.0
max,34.46,57.36,29.03,30.07,2.9976,11227.0,409245.0


In [9]:
test.describe()

Unnamed: 0,vix_date,vix_open,vix_high,vix_low,vix_close,fu_date,fu_ret,fu_close,fu_volum
count,247.0,247.0,247.0,247.0,247.0,247.0,247.0,247.0,247.0
mean,20175090.0,13.104737,13.622834,12.649717,13.116842,20175090.0,0.057362,10548.178138,143502.190283
std,4685.796,3.347928,4.255705,2.930674,3.245273,4685.796,0.774394,286.091933,39598.891933
min,20170620.0,8.44,8.86,7.82,7.82,20170620.0,-4.9954,9819.0,24665.0
25%,20170910.0,10.915,11.245,10.615,10.945,20170910.0,-0.3512,10331.0,119712.0
50%,20171210.0,12.09,12.53,11.78,12.15,20171210.0,0.1131,10580.0,139254.0
75%,20180320.0,14.35,14.78,13.785,14.165,20180320.0,0.5107,10758.5,166639.5
max,20180620.0,34.46,54.92,25.3,30.07,20180620.0,2.9828,11161.0,433346.0


# 計算技術指標
## 我們願使用KD、MACD、RSI技術指標的值當作分類依據，
## 因此需載numpy、talib，技術指標於talib運算後與之前欄位項目一同匯入t1，
## 並移除空值與遺漏值
## 其中利用技術指標是採用VIX指數來做為計算
## 想看看以往都是利用股價計算去做的技術指標在VIX終能否適用

In [10]:
import talib
import numpy
def talib2df(talib_output):
    if type(talib_output) == list:
        ret = pd.DataFrame(talib_output).transpose()
    else:
        ret = pd.Series(talib_output)
    ret.index = test['vix_close'].index
    return ret;

In [11]:
t1 = {
    'close':test.vix_close.dropna().astype(float),
    'open':test.vix_open.dropna().astype(float),
    'high':test.vix_high.dropna().astype(float),
    'low':test.vix_low.dropna().astype(float),
    'volume': test.fu_volum.dropna().astype(float)    
}

# 採用 KD/MACD/RSI指標

In [12]:
KD = talib2df(talib.abstract.STOCH(t1, fastk_period=9))

In [13]:
MACD = talib2df(talib.abstract.MACD(t1))

In [14]:
RSI = talib2df(talib.abstract.RSI(t1))

In [15]:
t1=pd.DataFrame(t1)
t1 = pd.concat([test,KD,MACD,RSI], axis=1)

In [16]:
t1.columns=['vix','vix_date','vix_open','vix_high','vix_low','vix_close','future','fu_date','fu_ret','fu_colse','fu_vol','k','d','dif12','dif26','macd','rsi']

In [17]:
t1.head(50)

Unnamed: 0,vix,vix_date,vix_open,vix_high,vix_low,vix_close,future,fu_date,fu_ret,fu_colse,fu_vol,k,d,dif12,dif26,macd,rsi
0,TXO_N5,20170619,10.32,11.02,9.96,11.02,TX201803 ??? 2018/03,20170619,1.0289,9819,168204,,,,,,
1,TXO_N5,20170620,9.97,10.04,9.71,9.86,TX201803 ??? 2018/03,20170620,1.049,9922,184342,,,,,,
2,TXO_N5,20170621,9.88,10.13,9.82,9.93,TX201803 ??? 2018/03,20170621,0.1613,9938,180584,,,,,,
3,TXO_N5,20170622,10.3,10.38,9.78,10.02,TX201803 ??? 2018/03,20170622,0.6239,10002,121899,,,,,,
4,TXO_N5,20170623,9.9,10.4,9.8,10.4,TX201803 ??? 2018/03,20170623,-0.12,9988,105120,,,,,,
5,TXO_N5,20170626,10.39,12.17,10.32,12.17,TX201803 ??? 2018/03,20170626,2.1526,10203,195349,,,,,,
6,TXO_N5,20170627,11.89,12.55,11.75,12.12,TX201803 ??? 2018/03,20170627,-0.1078,10192,121428,,,,,,
7,TXO_N5,20170628,12.48,12.6,12.25,12.52,TX201803 ??? 2018/03,20170628,-1.0302,10087,165922,,,,,,
8,TXO_N5,20170629,12.38,12.38,11.79,11.84,TX201803 ??? 2018/03,20170629,0.2677,10115,134893,,,,,,
9,TXO_N5,20170630,12.76,12.76,12.11,12.33,TX201712 ??? 2017/12,20170630,-0.4141,10103,129676,,,,,,


# 移除空值(由於技術指標計算後前幾天會有空值情況)
## 沒有移除會有錯誤產生

In [18]:
t1 = t1.query('vix_date > 20170803')

### 先載入sklearn的模型選擇、預先處理、指標與ensemble模型後我們開始進行分類。
### 我們將(交易量增加的)、(報酬率>0)、(波動率指數增加的)設為1; 因為想要用類別(不是連續)的變數，
### 因此需要載入虛擬變數(dummy variables)，最後測試並建立volume的數據。

In [19]:
import numpy as np

from sklearn import model_selection, ensemble, preprocessing, metrics

### 移除遺漏值、與設交易量增加&報酬大於零&VIX指數上漲  為 1

In [20]:
df=t1
df['pre_vol']=(df.fu_vol - df.fu_vol.shift(1)) > 0
df['pre_ret']=(df.fu_ret -  0 ) > 0
df['pre_vix_close']=(df.vix_close - df.vix_close.shift(1)) > 0

df=df.dropna()

### 創造 dummy variables

In [21]:
label_encoder = preprocessing.LabelEncoder()
encoded_label = label_encoder.fit_transform(df["future"])
encoded_label2 = label_encoder.fit_transform(df["vix"])
df['future']= encoded_label
df['vix'] = encoded_label2

### 測試還有建立的數據_VOLUME

In [23]:
selected_features = ['vix_date','vix_open','vix_high','vix_low','vix_close','fu_vol','k','d','macd','rsi','fu_ret','future']
pre_vol_X = df[selected_features]
pre_vol_y = df['pre_vol']
train_X, test_X, train_y, test_y = model_selection.train_test_split(pre_vol_X, pre_vol_y, test_size = 0.3)

### 接下來我們載入 random forest 模型並進行預測

In [24]:
forest = ensemble.RandomForestClassifier(n_estimators = 300)
forest_fit = forest.fit(train_X, train_y)

### 預測

In [25]:
test_y_predicted = forest.predict(test_X)
#test_y_predicted

### 績效

In [26]:
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
#print(accuracy)
fpr, tpr, thresholds = metrics.roc_curve(test_y, test_y_predicted)
auc = metrics.auc(fpr, tpr)
#print(auc)
print('準確率: {}'.format(auc))
print('AUC值: {}'.format(accuracy))

準確率: 0.6533333333333334
AUC值: 0.65625


## 預測期貨交易量

In [27]:
import numpy as np
today_X = df[selected_features]
today_y_predicted = forest.predict(today_X)
proba = forest.predict_proba(today_X)

In [28]:
print('隔日交易量: ' + format(np.where(today_y_predicted==True,'增','減')[0]))
print( '明日增加的機率: {}'.format(proba[0][1]))

隔日交易量: 減
明日增加的機率: 0.15666666666666668


## 建立預測報酬ret

In [30]:
selected_features_ret = ['vix_date','fu_date','vix_high','vix_low','fu_vol','k','d','macd','rsi','future']
pre_ret_X = df[selected_features_ret]
pre_ret_y = df['pre_ret']
train_X2, test_X2, train_y2, test_y2 = model_selection.train_test_split(pre_ret_X, pre_ret_y, test_size = 0.3)

In [31]:
forest = ensemble.RandomForestClassifier(n_estimators = 100)
forest_fit = forest.fit(train_X2, train_y2)

In [32]:
test_y2_predicted = forest.predict(test_X2)

In [33]:
accuracy = metrics.accuracy_score(test_y2, test_y2_predicted)
#print(accuracy)
fpr, tpr, thresholds = metrics.roc_curve(test_y2, test_y2_predicted)
auc = metrics.auc(fpr, tpr)
#print(auc)
print('準確率: {}'.format(auc))
print('AUC值: {}'.format(accuracy))

準確率: 0.4757085020242915
AUC值: 0.5


In [34]:
today_X2 = df[selected_features_ret]
today_y2_predicted = forest.predict(today_X2)
proba = forest.predict_proba(today_X2)
print('預期隔日報酬: ' + format(np.where(today_y_predicted==True,'正','負')[0]))
print( '明日賺的機率: {}'.format(proba[0][1]))

預期隔日報酬: 負
明日賺的機率: 0.43


## 建立預測VIX

In [35]:
#虛擬變數
label_encoder = preprocessing.LabelEncoder()
encoded_label = label_encoder.fit_transform(df["vix_close"])

In [36]:
selected_features_vix = ['vix_date','vix_open','vix_high','vix_low','vix_close','fu_vol','k','d','macd','rsi','fu_ret']
pre_vix_close_X = df[selected_features_vix]
pre_vix_close_y = df['pre_vix_close']
train_X3, test_X3, train_y3, test_y3 = model_selection.train_test_split(pre_vix_close_X, pre_vix_close_y, test_size = 0.3)

In [37]:
forest = ensemble.RandomForestClassifier(n_estimators = 300)
forest_fit = forest.fit(train_X3, train_y3)

In [38]:
test_y3_predicted = forest.predict(test_X3)
#test_y3_predicted

In [39]:
accuracy = metrics.accuracy_score(test_y3, test_y3_predicted)
#print(accuracy)
fpr, tpr, thresholds = metrics.roc_curve(test_y3, test_y3_predicted)
auc = metrics.auc(fpr, tpr)
#print(auc)
print('準確率: {}'.format(auc))
print('AUC值: {}'.format(accuracy))

準確率: 0.7598039215686274
AUC值: 0.765625


In [40]:
today_X3 = df[selected_features_vix]
today_y3_predicted = forest.predict(today_X3)
proba = forest.predict_proba(today_X3)
print('預期隔日VIX指數: ' + format(np.where(today_y_predicted==True,'增','減')[0]))
print( '震盪更大的機率: {}'.format(proba[0][1]))

預期隔日VIX指數: 減
震盪更大的機率: 0.12333333333333334


## 用上述分類來預測台指期的交易量增加or減少，及預測成功機率。 
## 最後再進行台指期報酬率的預測(賺錢or賠錢)，並顯示準確度與AUC值。

In [41]:
linreg = LinearRegression()
linreg_fit = linreg.fit(train_X3, train_y3)
cv = cross_val_score(linreg, train_X3, train_y3, cv=10)
print('Train Accuracy:', cv.mean())

cv = cross_val_score(linreg, test_X3, test_y3, cv=10)
print('Test Accuracy:', cv.mean())

test_y3_predicted = linreg.predict(test_X3)
#print('Predicted Closing Price: %.2f\n' % make_prediction(quotes_df, linreg))

Train Accuracy: 0.13445313024694755
Test Accuracy: 0.12334176365838365


In [43]:
test_y_predicted = linreg.predict(today_X3)
test_y_predicted

array([ 2.89628071e-01,  1.27662419e-01,  5.62733611e-01,  7.77800548e-01,
        1.09024574e+00,  7.22281644e-01,  8.10539896e-01,  1.82151803e-01,
        3.46176832e-01,  2.58240378e-01,  5.79615434e-01,  5.39513161e-01,
        2.10841539e-01,  3.76674350e-01,  1.79240831e-01,  3.70654381e-01,
        6.13448826e-01,  7.26384636e-01,  4.55616717e-01,  5.17373492e-01,
        3.12296301e-01,  5.50974043e-01,  4.88796596e-01,  6.79548999e-01,
        4.48624930e-01,  1.13234169e-01,  1.37994503e-01,  4.46843986e-05,
        3.13314691e-01,  1.44016660e-01,  1.70130033e-01,  3.45267114e-01,
        8.69491776e-01,  6.48574048e-01,  1.02533440e-01,  8.17274069e-01,
        8.03671724e-01,  7.66259351e-01,  1.43589242e-01,  2.20332542e-01,
        1.25290918e-01, -4.12230912e-03,  5.19765653e-02,  3.22743609e-01,
        3.31424929e-01,  4.31809089e-01,  1.86751210e-01,  2.67262133e-01,
        3.45252813e-01,  5.29444650e-01,  7.69654656e-01,  4.35289130e-01,
        5.25124868e-02,  

Predict Accuracy: 1.0
