In [1]:
# 請勿更動此區塊程式碼

import time
import numpy as np
import pandas as pd

EXECUTION_START_TIME = time.time()

df = pd.read_csv('data.csv')
df_test = pd.read_csv('test.csv')



## 資料分析與前處理

In [2]:
# import
pd.options.mode.chained_assignment = None      # 解決會噴 warning 的狀況

from sklearn.impute import SimpleImputer       # 匯入填補缺失值的工具
from sklearn.preprocessing import LabelEncoder # 匯入 Label Encoder
import matplotlib.pyplot as plt
import seaborn as sns

### 處理缺失值
檢查資料庫是否有缺失值，由count欄位可以確定沒有。

In [4]:
data = df[['Open Price','High Price', 'Close Price','Low Price', 'Volume']]
test = df_test[['Open Price','High Price', 'Close Price','Low Price', 'Volume']]

data.describe()
test.describe()

Unnamed: 0,Open Price,High Price,Close Price,Low Price,Volume
count,252.0,252.0,252.0,252.0,252.0
mean,2747.910397,2762.747778,2746.030873,2730.143929,2154050000.0
std,99.017479,92.991345,100.251272,106.490954,453862600.0
min,2363.12,2410.34,2351.1,2346.58,951652300.0
25%,2689.465,2705.8475,2690.5125,2663.6775,1879841000.0
50%,2742.17,2755.575,2741.92,2725.29,2063820000.0
75%,2811.43,2824.8025,2814.3225,2800.63,2341078000.0
max,2936.76,2940.91,2930.75,2927.11,4560164000.0


### 轉換日期資料
為了方便計算，為每個條目新增日期

In [None]:
data['Date'] = range(0,len(data.index))
test['Date'] = range(0,len(test.index))

### 轉換資料
目的是用前四筆資料之間的差值預測第五筆，因此要先將前四筆的資料彙整。

In [None]:
openPrice = []
closePrice = []
lowPrice = []
highPrice = []
volume = []

pre_open = 0
pre_close = 0
pre_low = 0
pre_high = 0
pre_volume = 0

for index, row in data.iterrows():
    openPrice.append(row['Open Price'] - pre_open)
    closePrice.append(row['Close Price'] - pre_close)
    lowPrice.append(row['Low Price'] - pre_low)
    highPrice.append(row['High Price'] - pre_high)
    volume.append(row['Volume Price'] - pre_volume)
    
    pre_open = row['Open Price']
    pre_close = row['Close Price']
    pre_low = row['Low Price']
    pre_high = row['High Price']
    pre_volume = row['Volume Price'] 


### 轉換類別資料
為了使得NN可以運作，新增漲或跌的label

In [None]:
trand = []
lastPrice = 0.0
for price in train_y['Close Price'] :
    if price > lastPrice :
        trand.append(1)
    elif price == lastPrice :
        trand.append(0)
    else :
        trand.append(-1)
    lastPrice = price
    
nn_train_y = pd.DataFrame({
    'trand':trand
})

In [None]:
trand = []
lastPrice = 0.0
for price in test_y['Close Price'] :
    if price > lastPrice :
        trand.append(1)
    elif price == lastPrice :
        trand.append(0)
    else :
        trand.append(-1)
    lastPrice = price
    
nn_test_y = pd.DataFrame({
    'trand':trand
})

## 模型訓練

匯入需要的套件

In [None]:
from sklearn.model_selection import KFold             # 匯入 K 次交叉驗證工具
from sklearn.tree import DecisionTreeClassifier       # 匯入決策樹模型
from sklearn.metrics import accuracy_score            # 匯入準確度計算工具
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier

### 線性回歸

In [None]:
model = LinearRegression()
model.fit(np.array(train_x['Date']).reshape(-1, 1),train_y)

predict = model.predict(np.array(test_x['Date']).reshape(-1, 1))
plt.plot(test_x['Date'], predict, "r")
plt.plot(test_x['Date'], test_y, "b")
plt.show()

predict = model.predict(np.array(train_x['Date']).reshape(-1, 1))
plt.plot(train_x['Date'], predict, "r")
plt.plot(train_x['Date'], train_y, "b")
plt.show()

### NN
由於NN是用來分類，所以要先把預測內容轉換成類別（漲跌）

In [None]:
model = MLPClassifier(solver='lbfgs', random_state=1010)
model.fit(np.array(train_x['Date']).reshape(-1, 1),nn_train_y)

predict = model.predict(np.array(train_x['Date']).reshape(-1, 1))

tp = 0
tn = 0
fp = 0
fn = 0

pd.DataFrame(predict).hist()
nn_train_y['trand'].hist()

for index in range(0,len(predict)) :
    if predict[index]==1:
        if nn_train_y['trand'][index]==1:
            tp = tp + 1
        else:
            fp = fp + 1
    else:
        if nn_train_y['trand'][index]==1:
            fn = fn + 1
        else:
            tn = tn + 1
print(tp)
print(fp)
print(tn)
print(fn)

# print("Recall:{}".format(tp/(tp+fn)))
# print("Position:{}".format(tp/(tp+fp)))

### 

In [None]:
# 請勿更動此區塊程式碼

EXECUTION_END_TIME = time.time() # 計算執行時間
print('total execution time: {}'.format(EXECUTION_END_TIME - EXECUTION_START_TIME))