In [1]:
import time
import numpy as np
import pandas as pd

# Wall-clock timestamp taken before any data is loaded.
# This has to run before the read_csv line below, which rebinds `time`.
EXECUTION_START_TIME = time.time()

# NOTE(review): from here on `time` refers to a DataFrame, not the stdlib
# module, so `time.time()` can no longer be called later in the notebook.
time = pd.read_csv('data/Time.csv')
kospi = pd.read_csv('data/KOSPI.csv')



In [2]:
# import
pd.options.mode.chained_assignment = None      # turn off pandas' chained-assignment warning

from sklearn.impute import SimpleImputer       # imputer for filling in missing values
from sklearn.preprocessing import LabelEncoder # label encoder
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [3]:
# Summarise the raw date column (count / unique / top / freq) before it is parsed.
date_summary = time['date'].describe()
print(date_summary)

count            133
unique           133
top       2020-04-20
freq               1
Name: date, dtype: object


## 轉換日期資料
把日期欄位從字串轉換成 pandas 的 Timestamp(datetime64)型態

In [4]:
# Parse the date strings into pandas datetime values so the two tables can be
# compared by date. Time.csv uses year-month-day; the KOSPI format string
# matches abbreviated-month dates such as "Jun 05, 2020".
time = time.assign(date=pd.to_datetime(time["date"], format='%Y-%m-%d'))
kospi = kospi.assign(Date=pd.to_datetime(kospi["Date"], format='%b %d, %Y'))

In [5]:
# First row before reordering (the file is loaded newest-first).
print(kospi.loc[0, "Price"])
print(kospi.loc[0, "Date"])

# Flip the rows into chronological order and renumber them from 0.
kospi = kospi.iloc[::-1].reset_index(drop=True)

# First row after reordering: now the oldest trading day.
print(kospi.loc[0, "Price"])
print(kospi.loc[0, "Date"])

2,181.87
2020-06-05 00:00:00
2,175.17
2020-01-02 00:00:00


## 轉換數值資料
把漲跌幅(Change %)轉換成二元類別標籤:下跌(字串以 '-' 開頭)為 0,其餘為 1

In [6]:
# Encode the daily direction as a binary label: 0 when the "Change %" text
# starts with '-' (a down day), 1 otherwise (flat or up).
#
# This is a single whole-column assignment. The previous element-wise form
# (`kospi['Change %'][i] = ...`) is chained indexing: pandas does not guarantee
# that it writes back to the DataFrame, and it does nothing under Copy-on-Write.
kospi['Change %'] = np.where(kospi['Change %'].str.startswith('-'), 0, 1)

## 整理確診資料
將累積確診資料換算成每日確診人數

In [7]:
# Turn the cumulative "confirmed" count into new cases per day.
#
# Each row has the previous row's value subtracted from it. `shift(1)` moves
# the column down by one row and `fill_value=0` stands in for "the row before
# the first", so the first day keeps its own cumulative value and every later
# day gets the increase over the day before.
#
# One column assignment replaces the per-row chained writes
# (`time["diff confirmed"][i] = ...`), which are not guaranteed to reach the
# DataFrame and do nothing under pandas Copy-on-Write.
time["diff confirmed"] = time["confirmed"] - time["confirmed"].shift(1, fill_value=0)

## 彙整資料
把每個交易日前 1~3 天的每日確診人數併入 kospi 資料,作為特徵欄位

In [8]:
# For every trading day, attach the number of new confirmed cases reported
# one, two and three calendar days earlier as columns "comfirmed_1..3"
# (the spelling is kept because later cells select these exact names).
#
# The values are looked up through a date-indexed Series instead of a row loop:
#   * no chained assignment (`kospi[col][i] = ...`), which is not guaranteed to
#     write back and does nothing under pandas Copy-on-Write;
#   * no hard-coded last row label (previously `time["date"][132]`);
#   * every cell receives a scalar rather than a one-element Series.
# Trading days whose lagged date is not present in `time` get NaN and are
# removed by dropna() below, which is what happened to the rows the loop skipped.
# Requires the dates in `time` to be unique (needed for the index lookup).
daily_confirmed = time.set_index("date")["diff confirmed"]

for lag_days in (1, 2, 3):
    lagged_dates = kospi["Date"] - pd.Timedelta(days=lag_days)
    kospi["comfirmed_{}".format(lag_days)] = lagged_dates.map(daily_confirmed).astype(float)

# dropna() inspects every column, so a row with a gap anywhere is dropped too.
kospi = kospi.dropna().reset_index(drop=True)

## 選取資料

In [9]:
# Feature matrix: the three lagged confirmed-case columns.
train_x = kospi.loc[:, ["comfirmed_1", "comfirmed_2", "comfirmed_3"]]
# Target vector: the "Change %" column.
train_y = kospi.loc[:, "Change %"]
# Trading dates, kept as a one-column DataFrame.
date = kospi.loc[:, ["Date"]]

In [10]:
# Print the feature frame's row count, column dtypes and non-null counts.
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   comfirmed_1  87 non-null     float64
 1   comfirmed_2  87 non-null     float64
 2   comfirmed_3  87 non-null     float64
dtypes: float64(3)
memory usage: 2.2 KB


In [11]:
# Cast the labels to an integer dtype, then echo them as the cell output.
train_y = train_y.astype(int)
train_y

0     0
1     0
2     1
3     0
4     0
     ..
82    1
83    1
84    0
85    1
86    1
Name: Change %, Length: 87, dtype: int64

## 模型訓練

In [12]:
### Model training: decision tree

from sklearn.model_selection import KFold             # K-fold cross-validation splitter
from sklearn.tree import DecisionTreeClassifier       # decision tree model
from sklearn.metrics import accuracy_score            # accuracy metric

# One set of hyper-parameters, shared by the per-fold models and the final model.
tree_params = dict(random_state=1012,
                   criterion='gini',
                   max_depth=4,
                   max_leaf_nodes=2 ** 4)

# Five shuffled folds; the fixed seed keeps the split reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=1012)

train_acc_list = []                                   # accuracy on each fold's training part
valid_acc_list = []                                   # accuracy on each fold's held-out part

for train_index, valid_index in kf.split(train_x):
    # Positional row selection for this fold.
    train_x_split, valid_x_split = train_x.iloc[train_index], train_x.iloc[valid_index]
    train_y_split, valid_y_split = train_y.iloc[train_index], train_y.iloc[valid_index]

    model = DecisionTreeClassifier(**tree_params)
    model.fit(train_x_split, train_y_split)

    # Score the fitted tree on the rows it saw and on the rows it did not.
    train_acc_list.append(accuracy_score(train_y_split, model.predict(train_x_split)))
    valid_acc_list.append(accuracy_score(valid_y_split, model.predict(valid_x_split)))

# Mean / min / max over the folds: training scores first, then validation.
fold_stats = [summarise(scores)
              for scores in (train_acc_list, valid_acc_list)
              for summarise in (np.mean, np.min, np.max)]
report_template = (
    'average train accuracy: {}\n'
    '    min train accuracy: {}\n'
    '    max train accuracy: {}\n'
    'average valid accuracy: {}\n'
    '    min valid accuracy: {}\n'
    '    max valid accuracy: {}')
print(report_template.format(*fold_stats))

# Refit on every row and score on those same rows, so the figure printed
# below is a training accuracy rather than a held-out estimate.
model = DecisionTreeClassifier(**tree_params)
model.fit(train_x, train_y)

pred_y = model.predict(train_x)
acc = accuracy_score(train_y, pred_y)

print('accuracy: {}'.format(acc))

average train accuracy: 0.7356107660455485
    min train accuracy: 0.6811594202898551
    max train accuracy: 0.8
average valid accuracy: 0.5163398692810457
    min valid accuracy: 0.4117647058823529
    max valid accuracy: 0.6111111111111112
accuracy: 0.7126436781609196


In [13]:
# Model training: AdaBoost ensemble

from sklearn import ensemble

# One set of hyper-parameters, shared by the per-fold models and the final model.
ada_params = dict(random_state=1012, n_estimators=10)

# Five shuffled folds; the fixed seed keeps the split reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=1012)

train_acc_list = []                                   # accuracy on each fold's training part
valid_acc_list = []                                   # accuracy on each fold's held-out part

for train_index, valid_index in kf.split(train_x):
    # Positional row selection for this fold.
    train_x_split, valid_x_split = train_x.iloc[train_index], train_x.iloc[valid_index]
    train_y_split, valid_y_split = train_y.iloc[train_index], train_y.iloc[valid_index]

    model = ensemble.AdaBoostClassifier(**ada_params)
    model.fit(train_x_split, train_y_split)           # fit the AdaBoost classifier on this fold

    # Score the fitted model on the rows it saw and on the rows it did not.
    train_acc_list.append(accuracy_score(train_y_split, model.predict(train_x_split)))
    valid_acc_list.append(accuracy_score(valid_y_split, model.predict(valid_x_split)))

# Mean / min / max over the folds: training scores first, then validation.
fold_stats = [summarise(scores)
              for scores in (train_acc_list, valid_acc_list)
              for summarise in (np.mean, np.min, np.max)]
report_template = (
    'average train accuracy: {}\n'
    '    min train accuracy: {}\n'
    '    max train accuracy: {}\n'
    'average valid accuracy: {}\n'
    '    min valid accuracy: {}\n'
    '    max valid accuracy: {}')
print(report_template.format(*fold_stats))

# Refit on every row and score on those same rows, so the figure printed
# below is a training accuracy rather than a held-out estimate.
model = ensemble.AdaBoostClassifier(**ada_params)
model.fit(train_x, train_y)

pred_y = model.predict(train_x)
acc = accuracy_score(train_y, pred_y)

print('accuracy: {}'.format(acc))

average train accuracy: 0.7614492753623189
    min train accuracy: 0.7246376811594203
    max train accuracy: 0.8
average valid accuracy: 0.5294117647058824
    min valid accuracy: 0.3888888888888889
    max valid accuracy: 0.6470588235294118
accuracy: 0.7701149425287356


In [14]:
# Model training: support vector classifier

from sklearn.svm import SVC                       # support vector machine classifier

# Five shuffled folds; the fixed seed keeps the split reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=1012)

train_acc_list = []                                   # accuracy on each fold's training part
valid_acc_list = []                                   # accuracy on each fold's held-out part
c = 20                                                # value passed as SVC's C parameter

# One set of hyper-parameters, shared by the per-fold models and the final model.
svc_params = dict(random_state=1012, C=c)

for train_index, valid_index in kf.split(train_x):
    # Positional row selection for this fold.
    train_x_split, valid_x_split = train_x.iloc[train_index], train_x.iloc[valid_index]
    train_y_split, valid_y_split = train_y.iloc[train_index], train_y.iloc[valid_index]

    model = SVC(**svc_params)
    model.fit(train_x_split, train_y_split)           # fit the SVC on this fold

    # Score the fitted model on the rows it saw and on the rows it did not.
    train_acc_list.append(accuracy_score(train_y_split, model.predict(train_x_split)))
    valid_acc_list.append(accuracy_score(valid_y_split, model.predict(valid_x_split)))

# Mean / min / max over the folds: training scores first, then validation.
fold_stats = [summarise(scores)
              for scores in (train_acc_list, valid_acc_list)
              for summarise in (np.mean, np.min, np.max)]
report_template = (
    'average train accuracy: {}\n'
    '    min train accuracy: {}\n'
    '    max train accuracy: {}\n'
    'average valid accuracy: {}\n'
    '    min valid accuracy: {}\n'
    '    max valid accuracy: {}')
print(report_template.format(*fold_stats))

# Refit on every row and score on those same rows, so the figure printed
# below is a training accuracy rather than a held-out estimate.
model = SVC(**svc_params)
model.fit(train_x, train_y)

pred_y = model.predict(train_x)
acc = accuracy_score(train_y, pred_y)

print('accuracy: {}'.format(acc))

average train accuracy: 0.6380538302277433
    min train accuracy: 0.6
    max train accuracy: 0.6666666666666666
average valid accuracy: 0.4594771241830065
    min valid accuracy: 0.4117647058823529
    max valid accuracy: 0.5294117647058824
accuracy: 0.6091954022988506


In [15]:
# Model training: logistic regression

from sklearn.linear_model import LogisticRegression

# One set of hyper-parameters, shared by the per-fold models and the final model.
logistic_params = dict(random_state=1012)

# Five shuffled folds; the fixed seed keeps the split reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=1012)

train_acc_list = []                                   # accuracy on each fold's training part
valid_acc_list = []                                   # accuracy on each fold's held-out part

for train_index, valid_index in kf.split(train_x):
    # Positional row selection for this fold.
    train_x_split, valid_x_split = train_x.iloc[train_index], train_x.iloc[valid_index]
    train_y_split, valid_y_split = train_y.iloc[train_index], train_y.iloc[valid_index]

    model = LogisticRegression(**logistic_params)
    model.fit(train_x_split, train_y_split)           # fit the logistic regression on this fold

    # Score the fitted model on the rows it saw and on the rows it did not.
    train_acc_list.append(accuracy_score(train_y_split, model.predict(train_x_split)))
    valid_acc_list.append(accuracy_score(valid_y_split, model.predict(valid_x_split)))

# Mean / min / max over the folds: training scores first, then validation.
fold_stats = [summarise(scores)
              for scores in (train_acc_list, valid_acc_list)
              for summarise in (np.mean, np.min, np.max)]
report_template = (
    'average train accuracy: {}\n'
    '    min train accuracy: {}\n'
    '    max train accuracy: {}\n'
    'average valid accuracy: {}\n'
    '    min valid accuracy: {}\n'
    '    max valid accuracy: {}')
print(report_template.format(*fold_stats))

# Refit on every row and score on those same rows, so the figure printed
# below is a training accuracy rather than a held-out estimate.
model = LogisticRegression(**logistic_params)
model.fit(train_x, train_y)

pred_y = model.predict(train_x)
acc = accuracy_score(train_y, pred_y)

print('accuracy: {}'.format(acc))

average train accuracy: 0.58351966873706
    min train accuracy: 0.5285714285714286
    max train accuracy: 0.6376811594202898
average valid accuracy: 0.4470588235294118
    min valid accuracy: 0.35294117647058826
    max valid accuracy: 0.5
accuracy: 0.5747126436781609
