In [1]:
import time
import numpy as np
import pandas as pd

# Capture the wall-clock start first: the name `time` is rebound to a DataFrame
# just below, so the `time` module is not reachable under that name afterwards.
EXECUTION_START_TIME = time.time()

# Load the two CSV inputs (the reads are independent of each other).
kospi, time = (
    pd.read_csv(csv_path)
    for csv_path in ('data/KOSPI.csv', 'data/Time.csv')
)



In [2]:
# Imports and global pandas configuration
pd.options.mode.chained_assignment = None      # Silence pandas' chained-assignment warning.
                                               # NOTE(review): this hides the warning rather than fixing
                                               # the writes that trigger it; prefer `.loc` or
                                               # whole-column assignment.

from sklearn.impute import SimpleImputer       # Tool for imputing missing values
from sklearn.preprocessing import LabelEncoder # Label encoder
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [3]:
# Summarise the still-unparsed `date` column (count / unique / top / freq).
date_summary = time['date'].describe()
print(date_summary)

count            133
unique           133
top       2020-03-19
freq               1
Name: date, dtype: object


## 轉換日期資料
把日期資料型態轉換成 pandas 的 Timestamp(datetime64)

In [4]:
# Each frame stores its dates in a different textual layout; name the layouts
# once and parse both columns into datetime values.
TIME_DATE_FORMAT = '%Y-%m-%d'      # e.g. 2020-03-19
KOSPI_DATE_FORMAT = '%b %d, %Y'    # abbreviated month name, day, four-digit year

time["date"] = pd.to_datetime(time["date"], format=TIME_DATE_FORMAT)
kospi["Date"] = pd.to_datetime(kospi["Date"], format=KOSPI_DATE_FORMAT)

In [5]:
# First row as currently ordered (before re-ordering).
print(kospi["Price"][0])
print(kospi["Date"][0])

# Put the rows in chronological order (oldest first) and renumber them 0..n-1.
# Sorting on the parsed `Date` column, instead of blindly reversing the frame,
# keeps this cell idempotent: re-running it no longer flips the order back, and
# it does not depend on the CSV being stored newest-first.
# A stable sort keeps the relative order of any rows that share a date.
kospi = kospi.sort_values("Date", kind="mergesort").reset_index(drop=True)

# First row after re-ordering: the earliest date.
print(kospi["Price"][0])
print(kospi["Date"][0])

2,181.87
2020-06-05 00:00:00
2,175.17
2020-01-02 00:00:00


## 轉換數值資料
把漲跌幅(Change %)字串轉換成二元類別:開頭為「-」(下跌)記為 0,其餘記為 1

In [6]:
# Encode the daily change as a binary label: 0 when the raw "Change %" text
# starts with '-', 1 otherwise.
#
# The whole column is replaced in one assignment instead of writing element by
# element through `kospi['Change %'][i] = ...`: that chained form only works
# while pandas' chained-assignment warning is silenced and does not update the
# frame at all under Copy-on-Write.
change_raw = kospi['Change %']

# Guard so the cell can be re-run: once the column holds numeric labels there
# is no text left to inspect.
if not pd.api.types.is_numeric_dtype(change_raw):
    if change_raw.isna().any():
        # Fail loudly rather than silently labelling a missing value.
        raise ValueError("'Change %' contains missing values; cannot derive labels")
    kospi['Change %'] = np.where(change_raw.str.startswith('-'), 0, 1)

## 整理確診資料
將累積確診資料換算成每日確診人數

In [7]:
# Turn the cumulative `confirmed` series into per-row increments.
#
# `diff()` yields row[i] - row[i-1]; the first row has no predecessor, so it
# keeps its own cumulative value (same convention as the original loop).
# Assigning the finished column in one step avoids chained element writes
# (`time["diff confirmed"][i] = ...`), which are a no-op under Copy-on-Write.
confirmed_total = time["confirmed"]

daily_confirmed = confirmed_total.diff()
daily_confirmed.iloc[:1] = confirmed_total.iloc[:1].values  # [:1] is also safe on an empty frame

# diff() promotes to float; cast back to the dtype of the source column.
time["diff confirmed"] = daily_confirmed.astype(confirmed_total.dtype)

## 彙整資料
把確診和kospi資料合併

In [8]:
# Build the lagged features for every KOSPI row:
#   comfirmed_k : daily case count k calendar days before the row's date
#   groth_k     : up/down label of the k-th previous KOSPI row
# for k = 1, 2, 3.  (The column names, typos included, are kept because later
# cells select them by name.)
#
# The features are computed column-wise instead of with per-element chained
# writes (`kospi[col][i] = ...`).  Those writes depend on the chained-assignment
# warning being silenced, store a length-1 Series into a scalar slot, and do
# nothing under Copy-on-Write.
#
# Rows for which any lag is unavailable end up with NaN and are removed by the
# `dropna()` below: dates whose 3-day look-back falls before the first case
# record or after the last one, and the first three KOSPI rows.

# Daily case counts indexed by date, used as a lookup table.
# Assumes the dates in `time` are unique (Series.map requires a unique index).
cases_by_date = time.set_index("date")["diff confirmed"]
change_label = kospi["Change %"]

# Case-count lags first, then market lags, to keep the original column order.
for lag in (1, 2, 3):
    lagged_date = kospi["Date"] - pd.Timedelta(days=lag)
    # Dates without a case record map to NaN.
    kospi["comfirmed_{}".format(lag)] = lagged_date.map(cases_by_date).astype(float)

for lag in (1, 2, 3):
    # shift(lag) moves the labels down by `lag` rows, leaving NaN at the top.
    kospi["groth_{}".format(lag)] = change_label.shift(lag).astype(float)


# Keep only fully populated rows and renumber them 0..n-1.
kospi = kospi.dropna()
kospi = kospi.reset_index(drop=True)

## 選取資料

In [9]:
# Baseline experiment: features are the three lagged up/down labels, the target
# is the current row's label; the dates are kept alongside as a one-column frame.
LAGGED_LABEL_COLUMNS = ["groth_1", "groth_2", "groth_3"]

train_x = kospi.loc[:, LAGGED_LABEL_COLUMNS]
train_y = kospi.loc[:, "Change %"]
date = kospi.loc[:, ["Date"]]

In [10]:
# Descriptive statistics of the selected feature columns.
feature_summary = train_x.describe()
print(feature_summary)

         groth_1    groth_2    groth_3
count  87.000000  87.000000  87.000000
mean    0.517241   0.505747   0.517241
std     0.502599   0.502865   0.502599
min     0.000000   0.000000   0.000000
25%     0.000000   0.000000   0.000000
50%     1.000000   1.000000   1.000000
75%     1.000000   1.000000   1.000000
max     1.000000   1.000000   1.000000


In [11]:
# Cast the labels to an integer dtype, then display them as the cell's output.
train_y = train_y.astype(int)
train_y

0     0
1     0
2     1
3     0
4     0
     ..
82    1
83    1
84    0
85    1
86    1
Name: Change %, Length: 87, dtype: int64

## 只有股票資料 vs 股票資料加確診人數
### 只有股票資料

In [12]:
## Stock data only
### Model training: decision tree, 5-fold cross-validation

from sklearn.model_selection import KFold, cross_validate   # K-fold splitter and CV driver
from sklearn.tree import DecisionTreeClassifier              # Decision-tree model
from sklearn.metrics import accuracy_score                   # Accuracy metric

# Hyper-parameters defined once so the cross-validated model and the final
# model below cannot drift apart.
TREE_PARAMS = dict(random_state=1012,
                   criterion='gini',
                   max_depth=3,
                   max_leaf_nodes=2 ** 5)

kf = KFold(n_splits=5,                                # Number of folds
           random_state=1012,
           shuffle=True)
# NOTE(review): the rows are in date order and the folds are shuffled, so a
# fold can be trained on later days than it is validated on. Consider a
# time-ordered split if that matters for the conclusion.

# Fit a fresh clone of the estimator on each fold and score it on both the
# training part and the held-out part of that fold.
cv_scores = cross_validate(DecisionTreeClassifier(**TREE_PARAMS),
                           train_x, train_y,
                           cv=kf,
                           scoring='accuracy',
                           return_train_score=True)

train_acc_list = list(cv_scores['train_score'])       # Per-fold training accuracy
valid_acc_list = list(cv_scores['test_score'])        # Per-fold validation accuracy

cv_report = (
    'average train accuracy: {}\n' +
    '    min train accuracy: {}\n' +
    '    max train accuracy: {}\n' +
    'average valid accuracy: {}\n' +
    '    min valid accuracy: {}\n' +
    '    max valid accuracy: {}').format(
    np.mean(train_acc_list),
    np.min(train_acc_list),
    np.max(train_acc_list),
    np.mean(valid_acc_list),
    np.min(valid_acc_list),
    np.max(valid_acc_list)
)
print(cv_report)

# Final model fitted on all rows. The accuracy printed below is measured on
# the same rows it was fitted on, i.e. it is a training accuracy.
model = DecisionTreeClassifier(**TREE_PARAMS)
model.fit(train_x, train_y)

pred_y = model.predict(train_x)
acc = accuracy_score(train_y, pred_y)

print('accuracy: {}'.format(acc))

average train accuracy: 0.6091925465838509
    min train accuracy: 0.5714285714285714
    max train accuracy: 0.6428571428571429
average valid accuracy: 0.4816993464052287
    min valid accuracy: 0.4117647058823529
    max valid accuracy: 0.5555555555555556
accuracy: 0.5977011494252874


In [13]:
# Model training: AdaBoost ensemble, 5-fold cross-validation

from sklearn import ensemble
from sklearn.model_selection import cross_validate    # CV driver

# Hyper-parameters defined once so the cross-validated model and the final
# model below cannot drift apart.
ADABOOST_PARAMS = dict(random_state=1012, n_estimators=10)

kf = KFold(n_splits=5,                                # Number of folds
           random_state=1012,
           shuffle=True)
# NOTE(review): the rows are in date order and the folds are shuffled, so a
# fold can be trained on later days than it is validated on. Consider a
# time-ordered split if that matters for the conclusion.

# Fit a fresh clone of the estimator on each fold and score it on both the
# training part and the held-out part of that fold.
cv_scores = cross_validate(ensemble.AdaBoostClassifier(**ADABOOST_PARAMS),
                           train_x, train_y,
                           cv=kf,
                           scoring='accuracy',
                           return_train_score=True)

train_acc_list = list(cv_scores['train_score'])       # Per-fold training accuracy
valid_acc_list = list(cv_scores['test_score'])        # Per-fold validation accuracy

cv_report = (
    'average train accuracy: {}\n' +
    '    min train accuracy: {}\n' +
    '    max train accuracy: {}\n' +
    'average valid accuracy: {}\n' +
    '    min valid accuracy: {}\n' +
    '    max valid accuracy: {}').format(
    np.mean(train_acc_list),
    np.min(train_acc_list),
    np.max(train_acc_list),
    np.mean(valid_acc_list),
    np.min(valid_acc_list),
    np.max(valid_acc_list)
)
print(cv_report)

# Final model fitted on all rows. The accuracy printed below is measured on
# the same rows it was fitted on, i.e. it is a training accuracy.
model = ensemble.AdaBoostClassifier(**ADABOOST_PARAMS)
model.fit(train_x, train_y)

pred_y = model.predict(train_x)
acc = accuracy_score(train_y, pred_y)

print('accuracy: {}'.format(acc))

average train accuracy: 0.5603726708074535
    min train accuracy: 0.5285714285714286
    max train accuracy: 0.5857142857142857
average valid accuracy: 0.4503267973856209
    min valid accuracy: 0.3333333333333333
    max valid accuracy: 0.6470588235294118
accuracy: 0.5287356321839081


### 股票資料加確診人數

In [14]:
# Second experiment: the three lagged daily case counts are added in front of
# the three lagged up/down labels. The frame is displayed as the cell's output.
CASES_AND_LABEL_COLUMNS = [
    "comfirmed_1", "comfirmed_2", "comfirmed_3",
    "groth_1", "groth_2", "groth_3",
]

train_x = kospi.loc[:, CASES_AND_LABEL_COLUMNS]
train_x

Unnamed: 0,comfirmed_1,comfirmed_2,comfirmed_3,groth_1,groth_2,groth_3
0,0.0,0.0,1.0,1.0,0.0,1.0
1,1.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,1.0
3,0.0,0.0,1.0,1.0,0.0,0.0
4,2.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...
82,16.0,25.0,23.0,1.0,0.0,1.0
83,19.0,16.0,25.0,1.0,1.0,0.0
84,40.0,19.0,16.0,1.0,1.0,1.0
85,79.0,40.0,19.0,0.0,1.0,1.0


In [15]:
# Display the labels again for reference.
train_y

0     0
1     0
2     1
3     0
4     0
     ..
82    1
83    1
84    0
85    1
86    1
Name: Change %, Length: 87, dtype: int64

In [16]:
### Model training: decision tree, 5-fold cross-validation

from sklearn.model_selection import KFold, cross_validate   # K-fold splitter and CV driver
from sklearn.tree import DecisionTreeClassifier              # Decision-tree model
from sklearn.metrics import accuracy_score                   # Accuracy metric

# Hyper-parameters defined once so the cross-validated model and the final
# model below cannot drift apart.
TREE_PARAMS = dict(random_state=1012,
                   criterion='gini',
                   max_depth=3,
                   max_leaf_nodes=2 ** 5)

kf = KFold(n_splits=5,                                # Number of folds
           random_state=1012,
           shuffle=True)
# NOTE(review): the rows are in date order and the folds are shuffled, so a
# fold can be trained on later days than it is validated on. Consider a
# time-ordered split if that matters for the conclusion.

# Fit a fresh clone of the estimator on each fold and score it on both the
# training part and the held-out part of that fold.
cv_scores = cross_validate(DecisionTreeClassifier(**TREE_PARAMS),
                           train_x, train_y,
                           cv=kf,
                           scoring='accuracy',
                           return_train_score=True)

train_acc_list = list(cv_scores['train_score'])       # Per-fold training accuracy
valid_acc_list = list(cv_scores['test_score'])        # Per-fold validation accuracy

cv_report = (
    'average train accuracy: {}\n' +
    '    min train accuracy: {}\n' +
    '    max train accuracy: {}\n' +
    'average valid accuracy: {}\n' +
    '    min valid accuracy: {}\n' +
    '    max valid accuracy: {}').format(
    np.mean(train_acc_list),
    np.min(train_acc_list),
    np.max(train_acc_list),
    np.mean(valid_acc_list),
    np.min(valid_acc_list),
    np.max(valid_acc_list)
)
print(cv_report)

# Final model fitted on all rows. The accuracy printed below is measured on
# the same rows it was fitted on, i.e. it is a training accuracy.
model = DecisionTreeClassifier(**TREE_PARAMS)
model.fit(train_x, train_y)

pred_y = model.predict(train_x)
acc = accuracy_score(train_y, pred_y)

print('accuracy: {}'.format(acc))

average train accuracy: 0.6868322981366459
    min train accuracy: 0.6571428571428571
    max train accuracy: 0.7246376811594203
average valid accuracy: 0.5045751633986928
    min valid accuracy: 0.4117647058823529
    max valid accuracy: 0.6111111111111112
accuracy: 0.6781609195402298


In [22]:
## Model training: AdaBoost ensemble, 5-fold cross-validation

from sklearn import ensemble
from sklearn.model_selection import cross_validate    # CV driver

# Hyper-parameters defined once so the cross-validated model and the final
# model below cannot drift apart.
ADABOOST_PARAMS = dict(random_state=1012, n_estimators=10)

kf = KFold(n_splits=5,                                # Number of folds
           random_state=1012,
           shuffle=True)
# NOTE(review): the rows are in date order and the folds are shuffled, so a
# fold can be trained on later days than it is validated on. Consider a
# time-ordered split if that matters for the conclusion.

# Fit a fresh clone of the estimator on each fold and score it on both the
# training part and the held-out part of that fold.
cv_scores = cross_validate(ensemble.AdaBoostClassifier(**ADABOOST_PARAMS),
                           train_x, train_y,
                           cv=kf,
                           scoring='accuracy',
                           return_train_score=True)

train_acc_list = list(cv_scores['train_score'])       # Per-fold training accuracy
valid_acc_list = list(cv_scores['test_score'])        # Per-fold validation accuracy

cv_report = (
    'average train accuracy: {}\n' +
    '    min train accuracy: {}\n' +
    '    max train accuracy: {}\n' +
    'average valid accuracy: {}\n' +
    '    min valid accuracy: {}\n' +
    '    max valid accuracy: {}').format(
    np.mean(train_acc_list),
    np.min(train_acc_list),
    np.max(train_acc_list),
    np.mean(valid_acc_list),
    np.min(valid_acc_list),
    np.max(valid_acc_list)
)
print(cv_report)

# Final model fitted on all rows. The accuracy printed below is measured on
# the same rows it was fitted on, i.e. it is a training accuracy.
model = ensemble.AdaBoostClassifier(**ADABOOST_PARAMS)
model.fit(train_x, train_y)

pred_y = model.predict(train_x)
acc = accuracy_score(train_y, pred_y)

print('accuracy: {}'.format(acc))

average train accuracy: 0.7699792960662526
    min train accuracy: 0.7246376811594203
    max train accuracy: 0.8
average valid accuracy: 0.5281045751633987
    min valid accuracy: 0.47058823529411764
    max valid accuracy: 0.5555555555555556
accuracy: 0.7241379310344828
