### 업비트 API를 활용한 코인가격변동 예측 모형 개발
* 작성일 : 2022-04-08
* 작성자 : 윤성준
* 버전 : 0.1
* 변경이력 

## 목차
1. 데이터 수집
2. 데이터 정제
3. 항목 생성
4. 요건 정의 (dev/val, target 등)
5. ML모델학습
6. 평가
---

1. 데이터 수집

In [1]:
import pyupbit
import os
import pandas as pd
import pickle
from datetime import datetime
from dateutil.relativedelta import relativedelta
import talib
from talib import abstract

In [2]:
# Ask pyupbit for the tickers of the KRW-quoted markets and show them
tickers = pyupbit.get_tickers(fiat="KRW")
print(tickers)

['KRW-BTC', 'KRW-ETH', 'KRW-NEO', 'KRW-MTL', 'KRW-LTC', 'KRW-XRP', 'KRW-ETC', 'KRW-OMG', 'KRW-SNT', 'KRW-WAVES', 'KRW-XEM', 'KRW-QTUM', 'KRW-LSK', 'KRW-STEEM', 'KRW-XLM', 'KRW-ARDR', 'KRW-ARK', 'KRW-STORJ', 'KRW-GRS', 'KRW-REP', 'KRW-ADA', 'KRW-SBD', 'KRW-POWR', 'KRW-BTG', 'KRW-ICX', 'KRW-EOS', 'KRW-TRX', 'KRW-SC', 'KRW-ONT', 'KRW-ZIL', 'KRW-POLY', 'KRW-ZRX', 'KRW-LOOM', 'KRW-BCH', 'KRW-BAT', 'KRW-IOST', 'KRW-RFR', 'KRW-CVC', 'KRW-IQ', 'KRW-IOTA', 'KRW-MFT', 'KRW-ONG', 'KRW-GAS', 'KRW-UPP', 'KRW-ELF', 'KRW-KNC', 'KRW-BSV', 'KRW-THETA', 'KRW-QKC', 'KRW-BTT', 'KRW-MOC', 'KRW-ENJ', 'KRW-TFUEL', 'KRW-MANA', 'KRW-ANKR', 'KRW-AERGO', 'KRW-ATOM', 'KRW-TT', 'KRW-CRE', 'KRW-MBL', 'KRW-WAXP', 'KRW-HBAR', 'KRW-MED', 'KRW-MLK', 'KRW-STPT', 'KRW-ORBS', 'KRW-VET', 'KRW-CHZ', 'KRW-STMX', 'KRW-DKA', 'KRW-HIVE', 'KRW-KAVA', 'KRW-AHT', 'KRW-LINK', 'KRW-XTZ', 'KRW-BORA', 'KRW-JST', 'KRW-CRO', 'KRW-TON', 'KRW-SXP', 'KRW-HUNT', 'KRW-PLA', 'KRW-DOT', 'KRW-SRM', 'KRW-MVL', 'KRW-STRAX', 'KRW-AQT', 'KRW-GLM', 

In [3]:
# Base date of the analysis (ISO format) and number of months of history to use
WORKDATE = '2022-04-01'
REQMONTHS = 7

# Parse the base date and derive the window start REQMONTHS months earlier
workdate = datetime.fromisoformat(WORKDATE)
startdate = workdate - relativedelta(months=REQMONTHS)

In [4]:
# Fetch n months of data - implemented as a reusable function
def get_ohlcv_dump(filename = 'data.pickle', workdate = None, reqmonths = 7, force_download = False):
    """Download 1-minute OHLCV data from Upbit and cache it as a pickle.

    When ``filename`` already exists and ``force_download`` is false, the
    cached pickle is loaded and nothing is downloaded.

    Parameters
        filename (str): path of the pickle cache.
        workdate (datetime | str | None): base date of work; data is requested
            from ``workdate - reqmonths`` months up to ``workdate``. An ISO
            string such as ``'2022-04-01'`` is parsed with
            ``datetime.fromisoformat``. ``None`` means "now", resolved at call
            time (not once at function definition time).
        reqmonths (int): number of months of history to request.
        force_download (bool): download again even if the cache file exists.

    Returns
        dict: ticker -> value returned by ``pyupbit.get_ohlcv_from`` for that
        ticker (or whatever the cache file contains when it is loaded).

    Notes
        The download path iterates the module-level ``tickers`` list.
        The cache is read with ``pickle.load``; only load files you created
        yourself, since unpickling untrusted data can execute arbitrary code.
    """
    use_cache = os.path.exists(filename) and not force_download
    if use_cache:
        # Load pickle
        with open(filename, "rb") as fr:
            return pickle.load(fr)

    # Resolve the base date only when a download is actually needed
    if workdate is None:
        workdate = datetime.now()
    elif isinstance(workdate, str):
        workdate = datetime.fromisoformat(workdate)

    startdate = workdate - relativedelta(months=reqmonths)
    to_date = workdate.strftime("%Y%m%d")

    ohlcvs = {}
    for ticker in tickers:
        ohlcvs[ticker] = pyupbit.get_ohlcv_from(ticker, interval="minute1", to=to_date, fromDatetime=startdate)

    # Save pickle
    with open(filename, "wb") as fw:
        pickle.dump(ohlcvs, fw)

    return ohlcvs
    

In [5]:
# Load the OHLCV data from 'data.pickle', downloading it first if the file does not exist
ohlcvs = get_ohlcv_dump(filename = 'data.pickle', workdate = workdate, reqmonths = REQMONTHS)

2. 데이터 정제

In [6]:
# Print each ticker together with the element count (`.size`) of its data
for ticker, frame in ohlcvs.items():
    print(ticker, frame.size)

KRW-BTC 1826814
KRW-ETH 1826502
KRW-NEO 1716108
KRW-MTL 1536216
KRW-LTC 1672956
KRW-XRP 1826754
KRW-ETC 1813032
KRW-OMG 1737996
KRW-SNT 1602306
KRW-WAVES 1704420
KRW-XEM 1476048
KRW-QTUM 1761078
KRW-LSK 1462116
KRW-STEEM 1488834
KRW-XLM 1735842
KRW-ARDR 1445334
KRW-ARK 1464846
KRW-STORJ 1581816
KRW-GRS 1376070
KRW-REP 1449246
KRW-ADA 1811994
KRW-SBD 1353252
KRW-POWR 1685922
KRW-BTG 1629966
KRW-ICX 1668246
KRW-EOS 1788900
KRW-TRX 1810698
KRW-SC 1563060
KRW-ONT 1550724
KRW-ZIL 1551738
KRW-POLY 1599582
KRW-ZRX 1424304
KRW-LOOM 1467552
KRW-BCH 1654794
KRW-BAT 1651668
KRW-IOST 1458768
KRW-RFR 1388598
KRW-CVC 1581660
KRW-IQ 1310058
KRW-IOTA 1476150
KRW-MFT 1449816
KRW-ONG 1575312
KRW-GAS 1552836
KRW-UPP 1354794
KRW-ELF 1613814
KRW-KNC 1610424
KRW-BSV 1532196
KRW-THETA 1591764
KRW-QKC 1389702
KRW-BTT 1755510
KRW-MOC 1544514
KRW-ENJ 1656336
KRW-TFUEL 1603488
KRW-MANA 1802304
KRW-ANKR 1557480
KRW-AERGO 1543176
KRW-ATOM 1799424
KRW-TT 1438518
KRW-CRE 1528842
KRW-MBL 1547706
KRW-WAXP 1692864
KRW-

In [7]:
# Sanity check: the KRW-BTC frame must be indexed by timestamps
assert isinstance(ohlcvs['KRW-BTC'].index, pd.DatetimeIndex), 'not datetime index'

In [8]:
# Preview the KRW-BTC data (open/high/low/close/volume/value per minute)
ohlcvs['KRW-BTC']

Unnamed: 0,open,high,low,close,volume,value
2021-09-01 00:00:00,55326000.0,55422000.0,55326000.0,55327000.0,16.458649,9.113422e+08
2021-09-01 00:01:00,55327000.0,55399000.0,55327000.0,55393000.0,9.624262,5.327570e+08
2021-09-01 00:02:00,55400000.0,55477000.0,55391000.0,55443000.0,12.328954,6.834197e+08
2021-09-01 00:03:00,55439000.0,55463000.0,55343000.0,55350000.0,18.474244,1.023428e+09
2021-09-01 00:04:00,55378000.0,55385000.0,55331000.0,55340000.0,12.750241,7.056209e+08
...,...,...,...,...,...,...
2022-03-31 23:55:00,56461000.0,56485000.0,56461000.0,56481000.0,7.931584,4.479158e+08
2022-03-31 23:56:00,56480000.0,56490000.0,56480000.0,56490000.0,4.296142,2.426775e+08
2022-03-31 23:57:00,56490000.0,56499000.0,56485000.0,56488000.0,5.051226,2.853419e+08
2022-03-31 23:58:00,56489000.0,56505000.0,56486000.0,56499000.0,3.184290,1.799036e+08


In [9]:
def create_features(df, tp = 10, multiple = 3, repeat = 6):
    """Add TA-Lib indicator columns and a look-ahead target to ``df`` in place.

    For each of ``repeat`` window lengths (``tp``, ``tp * multiple``,
    ``tp * multiple ** 2``, ...) a set of period-dependent indicators is
    computed and stored in columns suffixed with that window length, e.g.
    ``rsi10``, ``rsi30``. Price-level indicators are divided by ``close``;
    the others are stored as returned by TA-Lib. Period-independent
    indicators, every function in TA-Lib's 'Pattern Recognition' group and
    the ``target`` column are added once after the loop.

    Parameters
        df (pandas.DataFrame): frame handed to the ``talib.abstract``
            functions; must contain a ``close`` column (presumably also
            open/high/low/volume, as the abstract API reads them by name -
            TODO confirm).
        tp (int): first window length.
        multiple (int): factor applied to the window length after each round.
        repeat (int): number of window lengths to generate.

    Returns
        pandas.DataFrame: the same ``df`` object with the new columns added.

    Notes
        ``target`` is ROC over 10 rows shifted by -10 rows, so each row holds
        the value computed 10 rows later; the last 10 rows are NaN.
        ``natr``, ``reg_inter`` and ``reg_slope`` are stored per window length
        (``natr10``, ``natr30``, ...) like every other windowed indicator, so
        that later rounds no longer overwrite earlier ones.
    """
    close = df['close']

    ######### feature generation: start
    for _ in range(repeat):
        sfx = str(tp)
        half, third, quarter = int(tp / 2), int(tp / 3), int(tp / 4)

        # Overlap studies (normalized by close)
        df[[f'upperband{sfx}', f'middleband{sfx}', f'lowerband{sfx}']] \
                                        = abstract.BBANDS(df, timeperiod = tp, nbdevup=2.0, nbdevdn=2.0)
        df[f'upperband{sfx}']  = df[f'upperband{sfx}'] / close
        df[f'middleband{sfx}'] = df[f'middleband{sfx}'] / close
        df[f'lowerband{sfx}']  = df[f'lowerband{sfx}'] / close
        df[f'dema{sfx}']       = abstract.DEMA(df, timeperiod = tp) / close
        df[f'ema{sfx}']        = abstract.EMA(df, timeperiod = tp) / close
        df[f'kama{sfx}']       = abstract.KAMA(df, timeperiod = tp) / close
        df[f'midpoint{sfx}']   = abstract.MIDPOINT(df, timeperiod=tp) / close
        df[f'midprice{sfx}']   = abstract.MIDPRICE(df, timeperiod=tp) / close
        df[f't3{sfx}']         = abstract.T3(df, timeperiod=tp, vfactor=0.7) / close
        df[f'tema{sfx}']       = abstract.TEMA(df, timeperiod=tp) / close
        df[f'trima{sfx}']      = abstract.TRIMA(df, timeperiod=tp) / close
        df[f'wma{sfx}']        = abstract.WMA(df, timeperiod=tp) / close

        # Momentum (stored as returned unless divided by close)
        # NOTE(review): APO/MACD/PPO receive fastperiod > slowperiod; TA-Lib is
        # understood to swap the two internally - confirm this is intended.
        df[f'adx{sfx}']        = abstract.ADX(df, timeperiod=tp)
        df[f'adxr{sfx}']       = abstract.ADXR(df, timeperiod=tp)
        df[f'apo{sfx}']        = abstract.APO(df, fastperiod=tp, slowperiod = half) / close
        # AROON returns its outputs as (aroondown, aroonup); pick them by name
        # so each column really holds what its label says.
        aroon = abstract.AROON(df, timeperiod=tp)
        df[f'aroonup{sfx}']    = aroon['aroonup']
        df[f'aroondown{sfx}']  = aroon['aroondown']
        df[f'aroonosc{sfx}']   = abstract.AROONOSC(df, timeperiod=tp)
        df[f'cci{sfx}']        = abstract.CCI(df, timeperiod=tp)
        df[f'cmo{sfx}']        = abstract.CMO(df, timeperiod=tp)
        df[f'dx{sfx}']         = abstract.DX(df, timeperiod=tp)
        df[[f'macd{sfx}', f'macds{sfx}', f'macdh{sfx}']] \
                                        = abstract.MACD(df, fastperiod=tp, slowperiod = half, signalperiod = third)
        df[f'mfi{sfx}']        = abstract.MFI(df, timeperiod=tp)
        df[f'minus_di{sfx}']   = abstract.MINUS_DI(df, timeperiod=tp)
        df[f'minus_dm{sfx}']   = abstract.MINUS_DM(df, timeperiod=tp)
        df[f'mom{sfx}']        = abstract.MOM(df, timeperiod=tp)
        df[f'plus_di{sfx}']    = abstract.PLUS_DI(df, timeperiod=tp)
        df[f'plus_dm{sfx}']    = abstract.PLUS_DM(df, timeperiod=tp)
        df[f'ppo{sfx}']        = abstract.PPO(df, fastperiod=tp, slowperiod = half)
        df[f'roc{sfx}']        = abstract.ROC(df, timeperiod=tp)
        df[f'rsi{sfx}']        = abstract.RSI(df, timeperiod=tp)
        df[[f'slowk{sfx}', f'slowd{sfx}']] \
                                        = abstract.STOCH(df, fastk_period=tp, slowk_period= half, slowd_period= half)
        df[[f'fastk{sfx}', f'fastd{sfx}']] \
                                        = abstract.STOCHF(df, fastk_period=tp, fastd_period= half)
        df[[f'fastkrsi{sfx}', f'fastdrsi{sfx}']] \
                                        = abstract.STOCHRSI(df, timeperiod=tp, fastk_period=tp, fastd_period= half)
        df[f'trix{sfx}']       = abstract.TRIX(df, timeperiod=tp)
        df[f'ultosc{sfx}']     = abstract.ULTOSC(df, timeperiod1=quarter, timeperiod2=half, timeperiod3=tp)
        df[f'willr{sfx}']      = abstract.WILLR(df, timeperiod=tp)

        # Volume
        df[f'adosc{sfx}']      = abstract.ADOSC(df, fastperiod=third, slowperiod=tp)

        # Volatility (suffixed so each window keeps its own column)
        df[f'natr{sfx}']       = abstract.NATR(df, timeperiod=tp)

        # Statistics (suffixed so each window keeps its own column)
        df[f'reg_inter{sfx}']  = abstract.LINEARREG_INTERCEPT(df, timeperiod=tp) / close
        df[f'reg_slope{sfx}']  = abstract.LINEARREG_SLOPE(df, timeperiod=tp)

        tp *= multiple

    # Overlap studies without a window parameter (normalized by close)
    df[['mama','fama']]  = abstract.MAMA(df, fastlimit=0.5, slowlimit=0.05)
    df['mama'] = df['mama'] / close
    df['fama'] = df['fama'] / close
    df['ht']             = abstract.HT_TRENDLINE(df) / close
    df['sar']            = abstract.SAR(df, acceleration=0.02, maximum=0.2) / close

    # Momentum
    df['bop']            = abstract.BOP(df)

    # Volume (left unnormalized; the original author marked this as undecided)
    df['ad']             = abstract.AD(df)
    df['obv']            = abstract.OBV(df)

    # Volatility
    df['trange']         = abstract.TRANGE(df) / close

    # Cycle (left unnormalized; the original author marked this as undecided)
    df['ht_dcperiod']    = abstract.HT_DCPERIOD(df)
    df['ht_dcphase']     = abstract.HT_DCPHASE(df)
    df[['ht_inphase','ht_quad']]            = abstract.HT_PHASOR(df)
    df[['ht_sine','ht_leadsine']]           = abstract.HT_SINE(df)
    df['ht_trendmode']   = abstract.HT_TRENDMODE(df)

    # Pattern Recognition: one column per TA-Lib pattern function, named after it
    for fn in talib.get_function_groups()['Pattern Recognition']:
        df[fn] = (abstract.Function(fn))(df)

    # Target: 10-row ROC taken from 10 rows ahead
    df['target']      = abstract.ROC(df, timeperiod=10).shift(-10)

    ######### feature generation: end

    return df

In [10]:
# Complete 1-minute index over [startdate, workdate). The end point is removed
# with a mask instead of date_range's `closed=` argument, which newer pandas
# versions no longer accept.
tot_idx = pd.date_range(start = startdate, end = workdate, freq = 'min')
tot_idx = tot_idx[tot_idx < workdate]
tot_set = pd.DataFrame(index = tot_idx)

tot_sets = {}
for key, val in ohlcvs.items():

    # Align the coin's data to the complete index; minutes without data become NaN
    tot_ohlcv = pd.DataFrame(index = tot_idx).join(val, how = 'left')

    # Skip coins whose first row is missing
    if tot_ohlcv.head(1).isna().iloc[0,0]: continue

    # Fill minutes without trades: carry the last close forward, use it for
    # open/high/low, and record zero traded volume and zero traded value.
    # Columns are assigned back explicitly because in-place fillna on an
    # attribute-accessed column may not update the frame.
    tot_ohlcv['close'] = tot_ohlcv['close'].ffill()
    for price_col in ('open', 'high', 'low'):
        tot_ohlcv[price_col] = tot_ohlcv[price_col].fillna(tot_ohlcv['close'])
    tot_ohlcv['value'] = tot_ohlcv['value'].fillna(0)
    tot_ohlcv['volume'] = tot_ohlcv['volume'].fillna(0)

    # Generate features
    tot_ohlcv = create_features(tot_ohlcv, tp = 10, multiple = 3, repeat = 5)

    # Drop the raw columns (the result must be assigned; drop() is not in-place)
    tot_ohlcv = tot_ohlcv.drop(columns=['open','high','low','close','value'])

    # Prefix every column with the ticker
    tot_ohlcv = tot_ohlcv.rename(columns = {c:key+'-'+c for c in tot_ohlcv.columns})

    # Merge into tot_set
    # tot_set = tot_set.join(tot_ohlcv)

    tot_sets[key] = tot_ohlcv

    # NOTE(review): this stops after the first coin that passes the check
    # above, so tot_sets holds a single ticker. Looks like a quick-test
    # leftover - confirm memory/runtime budget before removing it.
    break

with open(r"D:\2022\CoinbizExpert\data_feature.pickle","wb") as fw:
    pickle.dump(tot_sets, fw)
    
    

In [11]:
# Reduced to the 102 coins that have 7 months of data (11 coins removed)
# NOTE(review): the recorded output of this cell is an empty Index, because the
# join into tot_set in the previous cell is commented out - confirm the counts.
tot_set.columns

Index([], dtype='object')

3. 항목 생성

In [12]:
# from ta import add_all_ta_features

# # Load data
# df = ohlcvs['KRW-BTC'].copy() # this is your data and it has to be in the OHLC and volume format 

# # Add ta features filling NaN values
# df = add_all_ta_features(
#     df, open="open", high="high", low="low", close="close", volume="volume", fillna=True)

# df.columns