In [1]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
from collections import OrderedDict
import datetime
import time
from pathlib import Path
from functional import seq
from IPython.display import HTML
import matplotlib.pyplot as plt
from etf_tools import kd_rsv, ez_plot, candle_stick, rsi, ema, macd, atr_std, vr_obv, trend



# Apply the 'ggplot' style sheet to every figure created in this notebook.
plt.style.use('ggplot')
# Switch matplotlib's interactive mode off.
plt.ioff()

In [2]:
# Folder the raw CSV files are read from, and folder used for generated output.
src_dir = '../data/raw/groupbycode/all'
dest_dir = '../data/raw/groupbycode/trainingset/'

# Column name -> dtype, in column order. The keys double as the column names
# handed to read_csv, so the order here must match the files.
col_dtypes = OrderedDict([
    ('code', str),
    ('date', str),
    ('name', str),
    ('open', float),
    ('high', float),
    ('low', float),
    ('close', float),
    ('volume', int),
    ('weekday', int),
])

# skiprows=1 drops the first line of the file (presumably its own header row —
# TODO confirm) so that the names above are used instead.
etf0050 = pd.read_csv(
    os.path.join(src_dir, '0050.csv'),
    names=col_dtypes.keys(),
    dtype=col_dtypes,
    skiprows=1,
)
display(etf0050.shape, etf0050.head(), etf0050.tail())

(1322, 9)

Unnamed: 0,code,date,name,open,high,low,close,volume,weekday
0,50,20130102,元大台灣50,54.0,54.65,53.9,54.4,16487,3
1,50,20130103,元大台灣50,54.9,55.05,54.65,54.85,29020,4
2,50,20130104,元大台灣50,54.85,54.85,54.4,54.5,9837,5
3,50,20130107,元大台灣50,54.55,54.55,53.9,54.25,8910,1
4,50,20130108,元大台灣50,54.0,54.2,53.65,53.9,12507,2


Unnamed: 0,code,date,name,open,high,low,close,volume,weekday
1317,50,20180521,元大台灣50,81.2,82.1,81.2,82.05,3889,1
1318,50,20180522,元大台灣50,81.8,82.3,81.75,81.75,1837,2
1319,50,20180523,元大台灣50,81.9,82.0,81.3,81.35,2124,3
1320,50,20180524,元大台灣50,81.35,81.7,81.25,81.55,1135,4
1321,50,20180525,元大台灣50,81.55,82.05,81.4,81.85,3115,5


In [3]:
def gen_all_features(df):
    """Compute all indicator features for one price frame and join them column-wise.

    ``df`` must expose ``open``, ``high``, ``low``, ``close`` and ``volume``
    columns. Each indicator comes from an ``etf_tools`` helper; what each helper
    returns is not visible here, only that the results can be concatenated
    along ``axis=1``.

    The returned frame lists the pieces in this order: 12-day EMA, 26-day EMA,
    KD/RSV, RSI, candle stick, MACD, ATR/std, VR/OBV, trend.
    """
    close, high, low = df.close, df.high, df.low
    # The ATR/std window; the same number of leading closes is averaged to
    # produce the fill value handed to atr_std.
    atr_window = 20

    kd_part = kd_rsv(close, high=high, low=low)
    candle_part = candle_stick(open_price=df.open, close_price=close, high=high, low=low)
    rsi_part = rsi(close)
    ema_12 = ema(close, n_days=12)
    ema_26 = ema(close, n_days=26)

    macd_part = macd(close)
    atr_part = atr_std(close, high=high, low=low, n_days=atr_window,
                       fillna=close[:atr_window].mean())
    volume_part = vr_obv(close, df.volume)
    trend_part = trend(close)

    ordered_parts = [
        ema_12, ema_26, kd_part,
        rsi_part, candle_part,
        macd_part, atr_part,
        volume_part, trend_part,
    ]
    return pd.concat(ordered_parts, axis=1)
    
    
def gen_y(price, n_days=5):
    """Build look-ahead targets: for every day, the prices of the next ``n_days`` days.

    Row ``i`` holds ``price[i + 1] .. price[i + n_days]`` (by position) in the
    columns ``y_1 .. y_{n_days}``. Positions that would run past the end of the
    series are NaN. The last day has no future price at all and therefore gets
    no row: the result has ``max(len(price) - 1, 0)`` rows on a fresh 0-based
    index.

    The columns are always exactly ``y_1 .. y_{n_days}``, even when the series
    is shorter than ``n_days + 1`` or empty, so every generated training set
    shares the same schema. The table is filled one column at a time with
    numpy slices instead of building one Series per row.

    Parameters
    ----------
    price : pandas.Series or array-like of numbers
        Prices in chronological order.
    n_days : int, default 5
        How many future days to expose as target columns.

    Returns
    -------
    pandas.DataFrame
        float64 frame of shape ``(max(len(price) - 1, 0), n_days)``.
    """
    values = np.asarray(price, dtype=float)
    n_rows = max(len(values) - 1, 0)
    targets = np.full((n_rows, n_days), np.nan)

    for step in range(1, n_days + 1):
        # Only the first `n_valid` rows still have a price `step` days ahead.
        n_valid = len(values) - step
        if n_valid <= 0:
            break
        targets[:n_valid, step - 1] = values[step:]

    return pd.DataFrame(targets, columns=[f'y_{step}' for step in range(1, n_days + 1)])
    

    
def gen_trainingset(df, n_days=5):
    """Return ``df`` extended with its indicator features and look-ahead targets.

    The raw columns come first, then everything from ``gen_all_features(df)``,
    then the ``gen_y`` targets computed from ``df.close`` for ``n_days`` days.
    The three pieces are joined column-wise, so they are aligned on their index.
    """
    pieces = [
        df,
        gen_all_features(df),
        gen_y(df.close, n_days=n_days),
    ]
    return pd.concat(pieces, axis=1)


In [8]:
# Build the feature frame for 0050, then inspect its last rows, its shape and
# the dtype of every feature column.
features = gen_all_features(etf0050)
display(features.tail())
print(features.shape)
print(features.dtypes)

Unnamed: 0,ema12,ema26,rsv,k,d,upward,downward,rs,rsi,kbody,...,osc,price_std_20,atr_20,atr_std_20,vr,obv,obv_ma12,obv_ma12_diff,trend_up,trend_down
1317,81.132781,81.074629,0.788462,0.712629,0.734449,0.435714,0.235714,0.648936,0.393548,0.85,...,0.276527,1.16674,0.840832,-1.072232,2.110442,1557441,1535982.0,21459.0,0,0
1318,81.227737,81.124657,0.604651,0.676996,0.715489,0.3,0.278571,0.518519,0.341463,-0.05,...,0.257165,1.206067,0.820758,-1.180998,1.988491,1555604,1540334.0,15270.333333,0,0
1319,81.246547,81.141349,0.285714,0.547873,0.660176,0.171429,0.335714,0.338028,0.252632,-0.55,...,0.207426,1.206264,0.804008,-1.206516,1.854914,1553480,1544187.0,9293.166667,0,0
1320,81.293232,81.171619,0.4,0.499075,0.607013,0.2,0.214286,0.482759,0.325581,0.2,...,0.179073,1.194053,0.787835,-1.206354,1.808462,1554615,1547689.0,6926.416667,0,0
1321,81.378889,81.22187,0.645161,0.547283,0.587302,0.228571,0.214286,0.516129,0.340426,0.3,...,0.171583,1.165953,0.772875,-1.172595,1.805658,1557730,1551170.0,6560.0,0,0


(1322, 27)
ema12            float64
ema26            float64
rsv              float64
k                float64
d                float64
upward           float64
downward         float64
rs               float64
rsi              float64
kbody            float64
kbody_top        float64
kbody_bottom     float64
up_shadow        float64
low_shadow       float64
open_gap         float64
dif              float64
macd             float64
osc              float64
price_std_20     float64
atr_20           float64
atr_std_20       float64
vr               float64
obv                int64
obv_ma12         float64
obv_ma12_diff    float64
trend_up           int64
trend_down         int64
dtype: object


In [5]:
# Preview the last ten rows of the 7-day look-ahead targets. Rows close to the
# end of the series have fewer future prices, so their trailing columns are NaN.
y = gen_y(etf0050.close, n_days=7)
display(y.tail(n=10))

Unnamed: 0,y_1,y_2,y_3,y_4,y_5,y_6,y_7
1311,82.5,81.65,81.75,81.2,80.95,82.05,81.75
1312,81.65,81.75,81.2,80.95,82.05,81.75,81.35
1313,81.75,81.2,80.95,82.05,81.75,81.35,81.55
1314,81.2,80.95,82.05,81.75,81.35,81.55,81.85
1315,80.95,82.05,81.75,81.35,81.55,81.85,
1316,82.05,81.75,81.35,81.55,81.85,,
1317,81.75,81.35,81.55,81.85,,,
1318,81.35,81.55,81.85,,,,
1319,81.55,81.85,,,,,
1320,81.85,,,,,,


In [6]:
# Collect every source CSV. Path.glob makes no promise about ordering, so sort
# the paths to get the same processing order (and the same subset when only the
# first few files are taken) on every run.
path_seq = seq(sorted(Path(src_dir).glob('*.csv')))
print(path_seq.len())

1710


In [7]:
# sub_path_seq = path_seq.take(10)

In [7]:
%%time
tic = time.time()

(path_seq.map(lambda p: pd.read_csv(p, names=col_dtypes.keys(), dtype=col_dtypes, skiprows=1))
             .map(lambda df: gen_trainingset(df))
             .for_each(lambda df: df.to_csv(os.path.join(dest_dir, f'{df.code[0]}.csv'), index=False)))
#              .for_each(lambda df: df.to_pickle(os.path.join(dest_dir, f'{df.code[0]}.p'))))

toc = time.time()
print(f'{toc - tic:.3f} sec.')

582.102 sec.
CPU times: user 9min 21s, sys: 1.3 s, total: 9min 22s
Wall time: 9min 42s


# The cells below are messy scratch work (以下很髒)

In [66]:
# Scratch: render a float array as a single comma-separated string.
arr = np.array([16487, 29020, 9837, 8910], dtype=float)

', '.join(arr.astype(str))
# pd.DataFrame(arr.reshape((1, -1)), columns=[f'y_{i + 1}' for i in range(4)])

'16487.0, 29020.0, 9837.0, 8910.0'

In [71]:
# NOTE(review): DataFrame.to_pickle needs a target path as its first argument,
# so this call raises TypeError as written. The output recorded under this cell
# is a table, which suggests the cell was edited after it last ran — TODO
# confirm the intended call.
etf0050.to_pickle()

Unnamed: 0,code,date,name,open,high,low,close,volume
50,20130102,元大台灣50,54.0,54.65,53.9,54.4,16487.0,3
50,20130103,元大台灣50,54.9,55.05,54.65,54.85,29020.0,4
50,20130104,元大台灣50,54.85,54.85,54.4,54.5,9837.0,5
50,20130107,元大台灣50,54.55,54.55,53.9,54.25,8910.0,1
50,20130108,元大台灣50,54.0,54.2,53.65,53.9,12507.0,2


In [90]:
# Scratch version of the look-ahead target construction, run on 0050 only.
n_days = 5

close_price = etf0050.close.reset_index(drop=True)
data_len = len(close_price)

# One Series per day that still has at least one following day: the next (up to)
# n_days closes, labelled y_1, y_2, ...
future_windows = (close_price[start : start + n_days] for start in range(1, data_len))
y = [
    pd.Series(window.values, index=[f'y_{k}' for k in range(1, len(window) + 1)])
    for window in future_windows
]
zz = pd.DataFrame(y)

In [94]:
# Rebuild the target frame from the per-row Series list `y` (repeats the last
# line of the previous cell).
zz = pd.DataFrame(y)

In [95]:
# Eyeball check at the start of the data: y_k on row i should equal `close` on
# row i + k.
display(etf0050.head(10))
zz.head()

Unnamed: 0,code,date,name,open,high,low,close,volume,weekday
0,50,20130102,元大台灣50,54.0,54.65,53.9,54.4,16487,3
1,50,20130103,元大台灣50,54.9,55.05,54.65,54.85,29020,4
2,50,20130104,元大台灣50,54.85,54.85,54.4,54.5,9837,5
3,50,20130107,元大台灣50,54.55,54.55,53.9,54.25,8910,1
4,50,20130108,元大台灣50,54.0,54.2,53.65,53.9,12507,2
5,50,20130109,元大台灣50,53.75,54.3,53.75,54.1,7529,3
6,50,20130110,元大台灣50,54.3,54.65,54.15,54.5,13953,4
7,50,20130111,元大台灣50,54.7,54.8,54.35,54.45,11837,5
8,50,20130114,元大台灣50,54.0,54.5,53.8,54.5,7282,1
9,50,20130115,元大台灣50,54.2,54.45,53.9,54.0,6609,2


Unnamed: 0,y_1,y_2,y_3,y_4,y_5
0,54.85,54.5,54.25,53.9,54.1
1,54.5,54.25,53.9,54.1,54.5
2,54.25,53.9,54.1,54.5,54.45
3,53.9,54.1,54.5,54.45,54.5
4,54.1,54.5,54.45,54.5,54.0


In [96]:
# Same eyeball check at the end of the data, where the targets run out and the
# trailing y columns become NaN.
display(etf0050.tail(10))
zz.tail()

Unnamed: 0,code,date,name,open,high,low,close,volume,weekday
1276,50,20180320,元大台灣50,83.2,83.8,83.2,83.8,2895,2
1277,50,20180321,元大台灣50,83.8,84.1,83.6,83.85,4891,3
1278,50,20180322,元大台灣50,84.0,84.6,83.45,83.55,3429,4
1279,50,20180323,元大台灣50,81.85,82.4,81.8,82.1,8259,5
1280,50,20180326,元大台灣50,81.85,82.2,81.6,82.2,4369,1
1281,50,20180327,元大台灣50,83.0,83.4,82.95,83.4,3277,2
1282,50,20180328,元大台灣50,82.9,82.9,82.2,82.25,4161,3
1283,50,20180329,元大台灣50,82.25,82.35,81.8,82.1,4099,4
1284,50,20180330,元大台灣50,82.65,83.05,82.65,82.85,4994,5
1285,50,20180331,元大台灣50,82.85,83.05,82.75,82.95,878,6


Unnamed: 0,y_1,y_2,y_3,y_4,y_5
1280,83.4,82.25,82.1,82.85,82.95
1281,82.25,82.1,82.85,82.95,
1282,82.1,82.85,82.95,,
1283,82.85,82.95,,,
1284,82.95,,,,


In [97]:
# Attach the targets to the raw frame column-wise; pd.concat aligns the two
# frames on their index.
x = pd.concat([etf0050, zz], axis=1)

In [98]:
# Inspect both ends of the combined frame.
display(x.head())
display(x.tail())

Unnamed: 0,code,date,name,open,high,low,close,volume,weekday,y_1,y_2,y_3,y_4,y_5
0,50,20130102,元大台灣50,54.0,54.65,53.9,54.4,16487,3,54.85,54.5,54.25,53.9,54.1
1,50,20130103,元大台灣50,54.9,55.05,54.65,54.85,29020,4,54.5,54.25,53.9,54.1,54.5
2,50,20130104,元大台灣50,54.85,54.85,54.4,54.5,9837,5,54.25,53.9,54.1,54.5,54.45
3,50,20130107,元大台灣50,54.55,54.55,53.9,54.25,8910,1,53.9,54.1,54.5,54.45,54.5
4,50,20130108,元大台灣50,54.0,54.2,53.65,53.9,12507,2,54.1,54.5,54.45,54.5,54.0


Unnamed: 0,code,date,name,open,high,low,close,volume,weekday,y_1,y_2,y_3,y_4,y_5
1281,50,20180327,元大台灣50,83.0,83.4,82.95,83.4,3277,2,82.25,82.1,82.85,82.95,
1282,50,20180328,元大台灣50,82.9,82.9,82.2,82.25,4161,3,82.1,82.85,82.95,,
1283,50,20180329,元大台灣50,82.25,82.35,81.8,82.1,4099,4,82.85,82.95,,,
1284,50,20180330,元大台灣50,82.65,83.05,82.65,82.85,4994,5,82.95,,,,
1285,50,20180331,元大台灣50,82.85,83.05,82.75,82.95,878,6,,,,,


In [149]:
# Read one generated file back as a sanity check. `code` and `date` are kept as
# strings; left to type inference they would be parsed as integers and a code
# such as '0050' would lose its leading zeros.
temp = pd.read_csv(os.path.join(dest_dir, '0050.csv'), dtype={'code': str, 'date': str})

In [152]:
# Show the last rows of the re-loaded training set.
temp.tail()

Unnamed: 0,code,date,name,open,high,low,close,volume,weekday,ema12,...,low_shadow,open_gap,dif,macd,osc,y_1,y_2,y_3,y_4,y_5
1281,50,20180327,元大台灣50,83.0,83.4,82.95,83.4,3277,2,83.175484,...,0.05,0.8,0.064911,0.101646,-0.036735,82.25,82.1,82.85,82.95,
1282,50,20180328,元大台灣50,82.9,82.9,82.2,82.25,4161,3,83.033102,...,0.05,-0.5,-0.013725,0.078572,-0.092296,82.1,82.85,82.95,,
1283,50,20180329,元大台灣50,82.25,82.35,81.8,82.1,4099,4,82.889548,...,0.3,0.0,-0.087143,0.045429,-0.132572,82.85,82.95,,,
1284,50,20180330,元大台灣50,82.65,83.05,82.65,82.85,4994,5,82.883463,...,0.0,0.55,-0.083843,0.019574,-0.103418,82.95,,,,
1285,50,20180331,元大台灣50,82.85,83.05,82.75,82.95,878,6,82.8937,...,0.1,0.0,-0.072325,0.001195,-0.073519,,,,,
