In [1]:
# library import-------------------------------------------
import os
import time
from pathlib import Path

import FinanceDataReader as fdr
import numpy as np
import pandas as pd
import pandas_ta as ta
from pykrx import stock
from tqdm import tqdm
import parmap

from tsai.all import *

In [2]:

# --- model input / target settings ---------------------------------------
windowLength = 50        # number of consecutive rows (days) fed to the model
predictDayAbove = 1      # horizon in rows: predict the pct change this many days ahead

# --- date boundaries (ISO strings, used for label-based slicing) -----------
trainDateStart = '2014-01-01'
trainDateStop = '2022-01-01'
testDateStop = '2022-07-01'

# --- ticker universe: KOSPI listing as of testDateStop ---------------------
KospiTickers = stock.get_market_ticker_list(testDateStop, market="KOSPI")
tickers = KospiTickers

# --- output folders for the two preprocessing stages -----------------------
# The folder names embed the settings so runs with different settings do not collide.
preprocess1Path = Path(f"preprocess1_date{testDateStop}")
preprocess2Path = Path(f"preprocess2_date{testDateStop}_day{predictDayAbove}")
for outDir in (preprocess1Path, preprocess2Path):
    os.makedirs(outDir, exist_ok=True)



In [3]:
# Stage 1: per-ticker date cut and quality filter.
# Each ticker's OHLCV csv is read, unusable rows are removed, the date range is
# restricted, and tickers that pass every check are written to preprocess1Path.

for code in tqdm(tickers):
    saveFileName = Path('./data/pykrx/') / f"{code}.csv"   # alternative source: Path('./data/fdr/') / f"{code}.csv"
    if not saveFileName.exists():
        continue

    # load OHLCV and drop every row where the volume or any price is zero
    df = pd.read_csv(saveFileName, index_col=0)
    for column in ('Volume', 'Open', 'High', 'Low', 'Close'):
        df = df[df[column] != 0]

    # keep only [trainDateStart, testDateStop]; slicing happens on the raw index labels
    df = df.loc[trainDateStart:].loc[:testDateStop]
    df = df[['Close', 'Open', 'High', 'Low', 'Volume']]
    df.index = pd.to_datetime(df.index)

    # require at least 300 remaining rows
    if len(df) < 300:
        continue

    # skip tickers whose close moved more than 30% in a single step (too volatile)
    dailyChange = df.Close.pct_change(1)
    if np.max(dailyChange) > 0.30 or np.min(dailyChange) < -0.30:
        continue

    # Optional filters, currently disabled:
    # if (df.Volume * df.Close)[-50:].mean() <400000000: continue   # keep only high traded value (>= 400M KRW)
    # if df.Close[-50:].mean() <1000:continue   # skip penny stocks

    df.to_csv(preprocess1Path / f"{code}.csv")
    
    

100%|█████████████████████████████████████████| 943/943 [00:11<00:00, 79.19it/s]


In [4]:
# Stage 2: build one sliding-window dataset across all tickers and save it as .npy.
#
# For every ticker that passed stage 1:
#   1. append technical-indicator columns (pandas_ta) and the pct-change 'labels' column,
#   2. cut the frame into windows of `windowLength` rows with SlidingWindow,
#   3. record which global sample indices fall in the train vs. test period,
#   4. keep the trailing window(s) starting at `dfLen` as "unlabel" samples
#      (they are requested with get_y=[], i.e. without a target).
#
# Result: sampleListDict maps each key in sampleListKey to an array/list, and each
# entry is written to preprocess2Path as pykrx_1_<key>.npy.


sampleListKey=["X","y","trainIndex","testIndex","ticker","yDate", "unlabelX","unlabelTicker","unlabelDate"]
sampleListDict={key:[] for key in sampleListKey}
# Array-valued entries are collected per ticker and concatenated ONCE after the loop.
# Growing an ndarray with np.append per ticker recopies everything each time, and the
# old `array != []` emptiness test raises on NumPy >= 1.25 (shape-mismatch comparison).
arrayChunks={key:[] for key in ("X","y","unlabelX")}
baseIndex=0   # global index of the first sample of the current ticker


for code in tqdm(tickers):
    file=preprocess1Path/(code+".csv")
    if not file.exists():
        continue
    df = pd.read_csv(file,index_col=0)

    # technical indicators; calls without `length` use the library's default parameters
    df.ta.sma( append=True, length=5)
    df.ta.sma( append=True, length=15)
    df.ta.sma( append=True, length=60)
    df.ta.bbands( append=True)
    df.ta.macd( append=True)
    df.ta.kdj( append=True)
    df.ta.rsi( append=True)
    df.ta.cci( append=True)
    df.ta.ema( append=True)
    df.ta.roc( append=True)
    df.ta.stochrsi( append=True)
    df.ta.atr( append=True)
    df.ta.ema( append=True, length=20)

    # label = pct change of Close over `predictDayAbove` rows; then drop every row that
    # contains a NaN and cast everything to float32
    df['labels'] = df.Close.pct_change(predictDayAbove)
    df = df.dropna()
    df = df.astype(np.float32)
    df.index=pd.to_datetime(df.index)
    # Cut once at testDateStop so the window count below and the windows produced by
    # SlidingWindow are computed from the same rows (no-op if the csv is already cut).
    df = df.loc[:testDateStop]

    # number of labelled windows this ticker contributes
    dfLen=len(df)-windowLength-predictDayAbove+1
    if dfLen>0:
        # windows counted on the rows up to trainDateStop go to train, the rest to test
        dfTrainLen=max(0,len(df.loc[:trainDateStop])-windowLength-predictDayAbove+1)
        trainIndex=np.arange(dfTrainLen)+baseIndex
        testIndex=np.arange(dfTrainLen,dfLen)+baseIndex
        baseIndex+=dfLen

        # labelled windows: all columns as features, 'labels' at the horizon as target
        X, y = SlidingWindow(windowLength, stride=1, horizon=[predictDayAbove], get_x=df.columns, get_y='labels')(df)
        # unlabelled windows: same features, no target, starting right after the labelled ones
        unlabelX, _ = SlidingWindow(windowLength, stride=1, horizon=[predictDayAbove], get_x=df.columns, get_y=[],start=dfLen)(df)

        arrayChunks["X"].append(X)
        arrayChunks["y"].append(y)
        arrayChunks["unlabelX"].append(unlabelX)

        sampleListDict["trainIndex"]+=trainIndex.tolist()
        sampleListDict["testIndex"]+=testIndex.tolist()
        sampleListDict["ticker"]+=[code]*len(y)
        # the last len(y) index dates are taken as the dates of the targets
        sampleListDict["yDate"]+=df.index.strftime("%Y-%m-%d").tolist()[-len(y):]

        sampleListDict["unlabelTicker"]+=[code]*predictDayAbove
        # dates following the last available row; strftime replaces the deprecated
        # DatetimeIndex.format(formatter=...) and matches the yDate formatting above.
        # NOTE(review): date_range steps in calendar days, so these may fall on
        # non-trading days - confirm this is intended.
        sampleListDict["unlabelDate"]+=pd.date_range(df.index[-1], periods=predictDayAbove+1).strftime('%Y-%m-%d').tolist()[1:]


# merge the per-ticker chunks; an entry stays [] when no ticker produced any window
for key, chunks in arrayChunks.items():
    if chunks:
        sampleListDict[key]=np.concatenate(chunks,axis=0)

# save npy
for key in sampleListDict:
    np.save(preprocess2Path/f'pykrx_1_{key}.npy', sampleListDict[key])
    
    

100%|█████████████████████████████████████████| 943/943 [14:39<00:00,  1.07it/s]
