In [1]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
from collections import OrderedDict
import datetime
import time
import csv
import matplotlib.pyplot as plt
from etf_tools import kd_rsv, ez_plot, candle_stick, rsi, ema, macd

# Apply matplotlib's built-in 'ggplot' style sheet to subsequently drawn figures.
plt.style.use('ggplot')
# Switch pyplot's interactive mode off.
plt.ioff()

In [2]:
# Folder the raw CSV files are read from, and the folder (below it) that the
# per-code CSV files are written to.
src_dir = '../data/raw/'
dest_dir = os.path.join(src_dir, 'groupbycode/all')

# Names of the raw input files.
csv_files = ['tetfp.csv', 'tsharep.csv']

# Column name -> dtype, in file column order; used both as the column labels
# and as the dtype mapping when the CSV files are parsed.
col_dtypes = OrderedDict([
    ('code', str),
    ('date', str),
    ('name', str),
    ('open', float),
    ('high', float),
    ('low', float),
    ('close', float),
    ('volume', int),
])


In [3]:

# Read both raw tables with the shared column schema. skiprows=1 skips the
# first line of each file (presumably the file's own header row -- confirm),
# so the keys of col_dtypes are used as the column labels.
tetfp, tsharep = (
    pd.read_csv(
        os.path.join(src_dir, raw_name),
        names=col_dtypes.keys(),
        dtype=col_dtypes,
        skiprows=1,
    )
    for raw_name in ('tetfp.csv', 'tsharep.csv')
)

# Sanity check: size and first rows of tetfp, size and last rows of tsharep.
print(tetfp.shape)
display(tetfp.head(5))
print(tsharep.shape)
display(tsharep.tail(5))

(19395, 8)


Unnamed: 0,code,date,name,open,high,low,close,volume
0,50,20130102,元大台灣50,54.0,54.65,53.9,54.4,16487
1,50,20130103,元大台灣50,54.9,55.05,54.65,54.85,29020
2,50,20130104,元大台灣50,54.85,54.85,54.4,54.5,9837
3,50,20130107,元大台灣50,54.55,54.55,53.9,54.25,8910
4,50,20130108,元大台灣50,54.0,54.2,53.65,53.9,12507


(2042069, 8)


Unnamed: 0,code,date,name,open,high,low,close,volume
2042064,9962,20180521,有益,11.1,11.1,11.0,11.05,89
2042065,9962,20180522,有益,11.15,11.15,11.0,11.05,45
2042066,9962,20180523,有益,11.05,11.05,11.0,11.0,29
2042067,9962,20180524,有益,11.05,11.05,11.0,11.05,25
2042068,9962,20180525,有益,11.05,11.05,11.0,11.05,21


In [4]:
# Save the distinct codes and the distinct dates found in tetfp, one value per
# line, into the files ETF_CODE and ETF_DATE in the working directory.
# header=False is passed explicitly: the default for Series.to_csv changed
# from False to True in pandas 0.24, and without it newer versions emit an
# extra "0" header line as the first record of each file.
pd.Series(tetfp.code.unique()).to_csv('ETF_CODE', index=False, header=False)
pd.Series(tetfp.date.unique()).to_csv('ETF_DATE', index=False, header=False)

In [5]:
# Stack the two tables row-wise (tetfp first) and renumber the rows 0..n-1 so
# the combined frame has a single continuous index.
stock_all = pd.concat([tetfp, tsharep], axis=0, ignore_index=True)

In [6]:
# Confirm the combined table's size, then look at its last five rows.
print(stock_all.shape)
display(stock_all.tail(5))

(2061464, 8)


Unnamed: 0,code,date,name,open,high,low,close,volume
2061459,9962,20180521,有益,11.1,11.1,11.0,11.05,89
2061460,9962,20180522,有益,11.15,11.15,11.0,11.05,45
2061461,9962,20180523,有益,11.05,11.05,11.0,11.0,29
2061462,9962,20180524,有益,11.05,11.05,11.0,11.05,25
2061463,9962,20180525,有益,11.05,11.05,11.0,11.05,21


In [7]:
# Weekday number for every row of stock_all: 1 = Monday ... 7 = Sunday (the
# 0-based weekday shifted by one). The 'YYYYMMDD' strings are parsed in one
# vectorized to_datetime call instead of one datetime.strptime call per row.
# The final cast keeps the result int64 on pandas versions where .dt.weekday
# yields a narrower integer dtype. The series keeps the name 'date' and the
# index of stock_all.
weekday = (
    pd.to_datetime(stock_all.date, format='%Y%m%d')
    .dt.weekday
    .add(1)
    .astype('int64')
)

In [8]:
# Inspect the derived series: its length, then its first and last five entries.
print(weekday.shape)
display(weekday.head(5), weekday.tail(5))

(2061464,)


0    3
1    4
2    5
3    1
4    2
Name: date, dtype: int64

2061459    1
2061460    2
2061461    3
2061462    4
2061463    5
Name: date, dtype: int64

In [9]:
# Label the series 'weekday' (it was still carrying the name 'date' from the
# column it was computed from).
weekday = weekday.rename('weekday')

In [10]:
# Attach the weekday series to the price table as one more column (labelled
# with the series' name), then show the last five rows of the result.
stock_all_add_weekday = pd.concat((stock_all, weekday), axis='columns')
display(stock_all_add_weekday.tail(5))

Unnamed: 0,code,date,name,open,high,low,close,volume,weekday
2061459,9962,20180521,有益,11.1,11.1,11.0,11.05,89,1
2061460,9962,20180522,有益,11.15,11.15,11.0,11.05,45,2
2061461,9962,20180523,有益,11.05,11.05,11.0,11.0,29,3
2061462,9962,20180524,有益,11.05,11.05,11.0,11.05,25,4
2061463,9962,20180525,有益,11.05,11.05,11.0,11.05,21,5


In [11]:
%%time
tic = time.time()

_ = stock_all_add_weekday.groupby(by='code').apply(
    lambda df: df.sort_values('date').to_csv(os.path.join(dest_dir, f'{df.name}.csv'),
                                             index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
)

toc = time.time()

print(f'{toc - tic:.3f} sec.')

15.766 sec.
CPU times: user 10.5 s, sys: 334 ms, total: 10.9 s
Wall time: 15.8 s
