## 前處理

In [1]:
import time
import numpy as np
import pandas as pd

EXECUTION_START_TIME = time.time()

df = pd.read_csv('data.csv')
df_test = pd.read_csv('test.csv')



In [2]:
# import
pd.options.mode.chained_assignment = None      # 解決會噴 warning 的狀況

from sklearn.impute import SimpleImputer       # 匯入填補缺失值的工具
from sklearn.preprocessing import LabelEncoder # 匯入 Label Encoder
import matplotlib.pyplot as plt
import seaborn as sns

### 處理缺失值
檢查資料庫是否有缺失值，由count欄位可以確定沒有。

In [3]:
data = df[['Open Price','High Price', 'Close Price','Low Price', 'Volume']]
test = df_test[['Open Price','High Price', 'Close Price','Low Price', 'Volume']]

print(data.describe())
print(test.describe())

        Open Price   High Price  Close Price    Low Price        Volume
count  2264.000000  2264.000000  2264.000000  2264.000000  2.264000e+03
mean   1656.142686  1664.427054  1656.767562  1647.425128  2.948755e+09
std     485.292193   484.808706   485.226120   485.576116  1.351675e+09
min     679.280000   695.270000   676.530000   666.790000  5.181584e+08
25%    1238.602500  1246.695000  1239.337500  1227.587500  2.048980e+09
50%    1649.135000  1656.145000  1650.405000  1639.600000  2.506637e+09
75%    2070.300000  2079.507500  2071.190000  2058.757500  3.373334e+09
max    2692.710000  2694.970000  2690.160000  2685.920000  9.120100e+09
        Open Price   High Price  Close Price    Low Price        Volume
count   252.000000   252.000000   252.000000   252.000000  2.520000e+02
mean   2747.910397  2762.747778  2746.030873  2730.143929  2.154050e+09
std      99.017479    92.991345   100.251272   106.490954  4.538626e+08
min    2363.120000  2410.340000  2351.100000  2346.580000  9.516

### 轉換日期資料
為了方便計算，為每個條目新增日期

In [4]:
data['Date'] = range(0,len(data.index))
test['Date'] = range(0,len(test.index))

### 轉換資料
目的是用前四筆資料之間的差值預測第五筆，因此要先將前四筆的資料彙整。

In [5]:
openPrice = []
closePrice = []
lowPrice = []
highPrice = []
volume = []

pre_open = 0
pre_close = 0
pre_low = 0
pre_high = 0
pre_volume = 0

for index, row in data.iterrows():
    openPrice.append(1 if row['Open Price'] - pre_open>0 else 0)
    closePrice.append(1 if row['Close Price'] - pre_close>0 else 0)
    lowPrice.append(1 if row['Low Price'] - pre_low>0 else 0)
    highPrice.append(1 if row['High Price'] - pre_high>0 else 0)
    volume.append(1 if row['Volume'] - pre_volume>0 else 0)
    
    pre_open = row['Open Price']
    pre_close = row['Close Price']
    pre_low = row['Low Price']
    pre_high = row['High Price']
    pre_volume = row['Volume'] 

In [6]:
open1 = []
open2 = []
open3 = []
openOut = []

close1 = []
close2 = []
close3 = []
closeOut = []

low1 = []
low2 = []
low3 = []
lowOut = []

high1 = []
high2 = []
high3 = []
highOut = []

volume1 = []
volume2 = []
volume3 = []
volumeOut = []


for index in range(1,len(data)-3):
    open1.append(openPrice[index])
    open2.append(openPrice[index+1])
    open3.append(openPrice[index+2])
    openOut.append(openPrice[index+3])
    
    close1.append(closePrice[index])
    close2.append(closePrice[index+1])
    close3.append(closePrice[index+2])
    closeOut.append(closePrice[index+3])
    
    low1.append(lowPrice[index])
    low2.append(lowPrice[index+1])
    low3.append(lowPrice[index+2])
    lowOut.append(lowPrice[index+3])
    
    high1.append(highPrice[index])
    high2.append(highPrice[index+1])
    high3.append(highPrice[index+2])
    highOut.append(highPrice[index+3])
    
    volume1.append(volume[index])
    volume2.append(volume[index+1])
    volume3.append(volume[index+2])
    volumeOut.append(volume[index+3])

In [7]:
train_x = pd.DataFrame({
    'open1': open1,
    'open2': open2,
    'open3': open3,
    'close1': close1,
    'close2': close2,
    'close3': close3,
#     'low1': low1,
#     'low2': low2,
#     'low3': low3,
#     'high1': high1,
#     'high2': high2,
#     'high3': high3,
    'volume1': volume1,
    'volume2': volume2,
    'volume3': volume3
})

train_y = pd.DataFrame({
    'open': openOut,
    'close': closeOut,
    'low': lowOut,
    'high': highOut,
    'volume': volumeOut
})

接著用一樣的方法，對測試資料做一樣的整理。

In [8]:
openPrice = []
closePrice = []
lowPrice = []
highPrice = []
volume = []

pre_open = 0
pre_close = 0
pre_low = 0
pre_high = 0
pre_volume = 0

for index, row in test.iterrows():
    openPrice.append(1 if row['Open Price'] - pre_open>0 else 0)
    closePrice.append(1 if row['Close Price'] - pre_close>0 else 0)
    lowPrice.append(1 if row['Low Price'] - pre_low>0 else 0)
    highPrice.append(1 if row['High Price'] - pre_high>0 else 0)
    volume.append(1 if row['Volume'] - pre_volume>0 else 0)
    
    pre_open = row['Open Price']
    pre_close = row['Close Price']
    pre_low = row['Low Price']
    pre_high = row['High Price']
    pre_volume = row['Volume'] 

In [9]:
open1 = []
open2 = []
open3 = []
openOut = []

close1 = []
close2 = []
close3 = []
closeOut = []

low1 = []
low2 = []
low3 = []
lowOut = []

high1 = []
high2 = []
high3 = []
highOut = []

volume1 = []
volume2 = []
volume3 = []
volumeOut = []


for index in range(1,len(test)-3):
    open1.append(openPrice[index])
    open2.append(openPrice[index+1])
    open3.append(openPrice[index+2])
    openOut.append(openPrice[index+3])
    
    close1.append(closePrice[index])
    close2.append(closePrice[index+1])
    close3.append(closePrice[index+2])
    closeOut.append(closePrice[index+3])
    
    low1.append(lowPrice[index])
    low2.append(lowPrice[index+1])
    low3.append(lowPrice[index+2])
    lowOut.append(lowPrice[index+3])
    
    high1.append(highPrice[index])
    high2.append(highPrice[index+1])
    high3.append(highPrice[index+2])
    highOut.append(highPrice[index+3])
    
    volume1.append(volume[index])
    volume2.append(volume[index+1])
    volume3.append(volume[index+2])
    volumeOut.append(volume[index+3])

In [10]:
test_x = pd.DataFrame({
    'open1': open1,
    'open2': open2,
    'open3': open3,
    'close1': close1,
    'close2': close2,
    'close3': close3,
#     'low1': low1,
#     'low2': low2,
#     'low3': low3,
#     'high1': high1,
#     'high2': high2,
#     'high3': high3,
    'volume1': volume1,
    'volume2': volume2,
    'volume3': volume3
})

test_y = pd.DataFrame({
    'open': openOut,
    'close': closeOut,
    'low': lowOut,
    'high': highOut,
    'volume': volumeOut
})

## 模型訓練

匯入需要的套件

In [11]:
from sklearn.metrics import accuracy_score            # 匯入準確度計算工具
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

### 線性迴歸

In [12]:
model = LinearRegression()
model.fit(train_x,train_y['close'])

predict = model.predict(test_x)

tp = 0
tn = 0
fp = 0
fn = 0

for index in range(0,len(predict)) :
    if predict[index]>=0.54:
        if train_y['close'][index]==1:
            tp = tp + 1
        else:
            fp = fp + 1
    else:
        if train_y['close'][index]==1:
            fn = fn + 1
        else:
            tn = tn + 1
print(tp)
print(fp)
print(tn)
print(fn)

print("Recall:{}".format(tp/(tp+fn)))
print("Position:{}".format(tp/(tp+fp)))
print("Accuracy:{}".format((tp+tn)/(tp+fp+tn+fn)))

88
57
53
50
Recall:0.6376811594202898
Position:0.6068965517241379
Accuracy:0.5685483870967742


### NN

In [13]:
model = MLPClassifier(solver='adam', random_state=1010,max_iter=1000)
model.fit(train_x,train_y['close'])

predict = model.predict(test_x)

tp = 0
tn = 0
fp = 0
fn = 0

for index in range(0,len(predict)) :
    if predict[index]==1:
        if train_y['close'][index]==1:
            tp = tp + 1
        else:
            fp = fp + 1
    else:
        if train_y['close'][index]==1:
            fn = fn + 1
        else:
            tn = tn + 1
print(tp)
print(fp)
print(tn)
print(fn)

print("Recall:{}".format(tp/(tp+fn)))
print("Position:{}".format(tp/(tp+fp)))
print("Accuracy:{}".format((tp+tn)/(tp+fp+tn+fn)))

109
72
38
29
Recall:0.7898550724637681
Position:0.6022099447513812
Accuracy:0.592741935483871


### Ensemble

In [14]:
model = RandomForestClassifier(random_state=1012,n_estimators=500)
model.fit(train_x,train_y['close'])

predict = model.predict(test_x)

tp = 0
tn = 0
fp = 0
fn = 0

for index in range(0,len(predict)) :
    if predict[index]==1:
        if train_y['close'][index]==1:
            tp = tp + 1
        else:
            fp = fp + 1
    else:
        if train_y['close'][index]==1:
            fn = fn + 1
        else:
            tn = tn + 1
print(tp)
print(fp)
print(tn)
print(fn)

print("Recall:{}".format(tp/(tp+fn)))
print("Position:{}".format(tp/(tp+fp)))
print("Accuracy:{}".format((tp+tn)/(tp+fp+tn+fn)))

104
72
38
34
Recall:0.7536231884057971
Position:0.5909090909090909
Accuracy:0.5725806451612904


In [15]:
# 請勿更動此區塊程式碼

EXECUTION_END_TIME = time.time() # 計算執行時間
print('total execution time: {}'.format(EXECUTION_END_TIME - EXECUTION_START_TIME))

total execution time: 5.140761852264404
