### 업비트 API를 활용한 코인가격변동 예측 모형 개발
* 작성일 : 2022-04-08
* 작성자 : 윤성준
* 버전 : 0.1
* 변경이력 

## 목차
1. 데이터 수집
2. 데이터 정제
3. 항목 생성
4. 요건 정의 (dev/val, target 등)
5. ML모델학습
6. 평가
---

1. 데이터 수집

In [4]:
import pyupbit
import os
import pandas as pd
import pickle
from datetime import datetime
from dateutil.relativedelta import relativedelta
import talib
from talib import abstract

In [5]:
# Ask pyupbit for the tickers quoted in KRW and print the resulting list.
tickers = pyupbit.get_tickers(fiat="KRW")
print(tickers)

['KRW-BTC', 'KRW-ETH', 'KRW-NEO', 'KRW-MTL', 'KRW-LTC', 'KRW-XRP', 'KRW-ETC', 'KRW-OMG', 'KRW-SNT', 'KRW-WAVES', 'KRW-XEM', 'KRW-QTUM', 'KRW-LSK', 'KRW-STEEM', 'KRW-XLM', 'KRW-ARDR', 'KRW-ARK', 'KRW-STORJ', 'KRW-GRS', 'KRW-REP', 'KRW-ADA', 'KRW-SBD', 'KRW-POWR', 'KRW-BTG', 'KRW-ICX', 'KRW-EOS', 'KRW-TRX', 'KRW-SC', 'KRW-ONT', 'KRW-ZIL', 'KRW-POLY', 'KRW-ZRX', 'KRW-LOOM', 'KRW-BCH', 'KRW-BAT', 'KRW-IOST', 'KRW-RFR', 'KRW-CVC', 'KRW-IQ', 'KRW-IOTA', 'KRW-MFT', 'KRW-ONG', 'KRW-GAS', 'KRW-UPP', 'KRW-ELF', 'KRW-KNC', 'KRW-BSV', 'KRW-THETA', 'KRW-QKC', 'KRW-BTT', 'KRW-MOC', 'KRW-ENJ', 'KRW-TFUEL', 'KRW-MANA', 'KRW-ANKR', 'KRW-AERGO', 'KRW-ATOM', 'KRW-TT', 'KRW-CRE', 'KRW-MBL', 'KRW-WAXP', 'KRW-HBAR', 'KRW-MED', 'KRW-MLK', 'KRW-STPT', 'KRW-ORBS', 'KRW-VET', 'KRW-CHZ', 'KRW-STMX', 'KRW-DKA', 'KRW-HIVE', 'KRW-KAVA', 'KRW-AHT', 'KRW-LINK', 'KRW-XTZ', 'KRW-BORA', 'KRW-JST', 'KRW-CRO', 'KRW-TON', 'KRW-SXP', 'KRW-HUNT', 'KRW-PLA', 'KRW-DOT', 'KRW-SRM', 'KRW-MVL', 'KRW-STRAX', 'KRW-AQT', 'KRW-GLM', 

In [6]:
# Base date of the run (ISO format) and the number of months of history to use.
WORKDATE = '2022-04-01'
REQMONTHS = 7

# Parsed base date, and the date REQMONTHS calendar months before it.
workdate = datetime.fromisoformat(WORKDATE)
startdate = workdate + relativedelta(months=-REQMONTHS)

In [7]:
# Fetch n months of data - implemented as a reusable function
def get_ohlcv_dump(filename='data.pickle', workdate=None, reqmonths=7, force_download=False, ticker_list=None):
    """Return per-ticker 1-minute OHLCV data, downloading it or reusing a pickle cache.

    If ``filename`` exists and ``force_download`` is false, the pickle is loaded
    and returned as-is; ``workdate``, ``reqmonths`` and ``ticker_list`` are then
    ignored. Otherwise the data is downloaded with ``pyupbit.get_ohlcv_from``
    for every ticker, written to ``filename`` and returned.

    Parameters
        filename (str): path of the pickle cache.
        workdate (datetime | str | None): base date of the work. A string is
            parsed with ``datetime.fromisoformat`` (e.g. '2022-04-01').
            ``None`` means "now", evaluated at call time.
        reqmonths (int): number of months of history before ``workdate``.
        force_download (bool): download even if the cache file exists.
        ticker_list (iterable of str | None): tickers to download. ``None``
            falls back to the module-level ``tickers`` list.
    Returns
        dict: key = ticker, value = whatever ``pyupbit.get_ohlcv_from``
        returned for it (the notebook treats these as pandas DataFrames).
    """
    if not force_download and os.path.exists(filename):
        # SECURITY NOTE: pickle.load can execute arbitrary code; only point
        # this at cache files produced by this notebook.
        with open(filename, "rb") as fr:
            return pickle.load(fr)

    # Resolve the base date at call time (a `datetime.now()` default in the
    # signature would be frozen at definition time).
    if workdate is None:
        workdate = datetime.now()
    elif isinstance(workdate, str):
        workdate = datetime.fromisoformat(workdate)
    startdate = workdate - relativedelta(months=reqmonths)

    if ticker_list is None:
        ticker_list = tickers  # module-level list defined earlier in the notebook

    ohlcvs = {}
    for ticker in ticker_list:
        ohlcvs[ticker] = pyupbit.get_ohlcv_from(
            ticker,
            interval="minute1",
            to=workdate.strftime("%Y%m%d"),
            fromDatetime=startdate,
        )

    # Save pickle so later runs can skip the download
    with open(filename, "wb") as fw:
        pickle.dump(ohlcvs, fw)

    return ohlcvs
    

In [8]:
# Load the cached dump if 'data.pickle' exists, otherwise download and cache it.
ohlcvs = get_ohlcv_dump(
    filename='data.pickle',
    workdate=workdate,
    reqmonths=REQMONTHS,
)

2. 데이터 정제

In [9]:
# Print each ticker with the element count (rows x columns) of its frame.
for ticker, frame in ohlcvs.items():
    print(ticker, frame.size)

KRW-BTC 1826814
KRW-ETH 1826502
KRW-NEO 1716108
KRW-MTL 1536216
KRW-LTC 1672956
KRW-XRP 1826754
KRW-ETC 1813032
KRW-OMG 1737996
KRW-SNT 1602306
KRW-WAVES 1704420
KRW-XEM 1476048
KRW-QTUM 1761078
KRW-LSK 1462116
KRW-STEEM 1488834
KRW-XLM 1735842
KRW-ARDR 1445334
KRW-ARK 1464846
KRW-STORJ 1581816
KRW-GRS 1376070
KRW-REP 1449246
KRW-ADA 1811994
KRW-SBD 1353252
KRW-POWR 1685922
KRW-BTG 1629966
KRW-ICX 1668246
KRW-EOS 1788900
KRW-TRX 1810698
KRW-SC 1563060
KRW-ONT 1550724
KRW-ZIL 1551738
KRW-POLY 1599582
KRW-ZRX 1424304
KRW-LOOM 1467552
KRW-BCH 1654794
KRW-BAT 1651668
KRW-IOST 1458768
KRW-RFR 1388598
KRW-CVC 1581660
KRW-IQ 1310058
KRW-IOTA 1476150
KRW-MFT 1449816
KRW-ONG 1575312
KRW-GAS 1552836
KRW-UPP 1354794
KRW-ELF 1613814
KRW-KNC 1610424
KRW-BSV 1532196
KRW-THETA 1591764
KRW-QKC 1389702
KRW-BTT 1755510
KRW-MOC 1544514
KRW-ENJ 1656336
KRW-TFUEL 1603488
KRW-MANA 1802304
KRW-ANKR 1557480
KRW-AERGO 1543176
KRW-ATOM 1799424
KRW-TT 1438518
KRW-CRE 1528842
KRW-MBL 1547706
KRW-WAXP 1692864
KRW-

In [10]:
# Sanity check: the BTC frame must be indexed by a pandas DatetimeIndex.
assert isinstance(ohlcvs['KRW-BTC'].index, pd.DatetimeIndex), 'not datetime index'

In [11]:
# Preview the BTC frame (notebook cell display).
ohlcvs['KRW-BTC']

Unnamed: 0,open,high,low,close,volume,value
2021-09-01 00:00:00,55326000.0,55422000.0,55326000.0,55327000.0,16.458649,9.113422e+08
2021-09-01 00:01:00,55327000.0,55399000.0,55327000.0,55393000.0,9.624262,5.327570e+08
2021-09-01 00:02:00,55400000.0,55477000.0,55391000.0,55443000.0,12.328954,6.834197e+08
2021-09-01 00:03:00,55439000.0,55463000.0,55343000.0,55350000.0,18.474244,1.023428e+09
2021-09-01 00:04:00,55378000.0,55385000.0,55331000.0,55340000.0,12.750241,7.056209e+08
...,...,...,...,...,...,...
2022-03-31 23:55:00,56461000.0,56485000.0,56461000.0,56481000.0,7.931584,4.479158e+08
2022-03-31 23:56:00,56480000.0,56490000.0,56480000.0,56490000.0,4.296142,2.426775e+08
2022-03-31 23:57:00,56490000.0,56499000.0,56485000.0,56488000.0,5.051226,2.853419e+08
2022-03-31 23:58:00,56489000.0,56505000.0,56486000.0,56499000.0,3.184290,1.799036e+08


In [132]:
# Build a complete 1-minute grid covering [startdate, workdate). The end point
# is removed explicitly so the code does not depend on date_range's
# `closed` / `inclusive` keyword, which differs between pandas versions.
tot_idx = pd.date_range(start=startdate, end=workdate, freq='min')
tot_idx = tot_idx[tot_idx < workdate]
tot_set = pd.DataFrame(index=tot_idx)

for key, val in ohlcvs.items():

    # Skip tickers with no usable frame (guards against a failed/empty download)
    if val is None or val.empty:
        continue

    # Fill in missing index entries by aligning onto the full minute grid
    tot_ohlcv = pd.DataFrame(index=tot_idx).join(val, how='left')

    # If the series starts with a missing value, exclude this coin
    if pd.isna(tot_ohlcv.iloc[0, 0]):
        continue

    # Minutes without trades are missing: carry the last close forward, use it
    # for open/high/low, and record zero traded volume/value.
    # (Plain column assignment is used instead of `fillna(..., inplace=True)`
    # on an attribute, which is not guaranteed to write back to the frame.)
    tot_ohlcv['close'] = tot_ohlcv['close'].ffill()
    for col in ('open', 'high', 'low'):
        tot_ohlcv[col] = tot_ohlcv[col].fillna(tot_ohlcv['close'])
    tot_ohlcv['volume'] = tot_ohlcv['volume'].fillna(0)
    tot_ohlcv['value'] = tot_ohlcv['value'].fillna(0)

    close = tot_ohlcv['close']

    ######### Feature generation: start
    for tp in (10, 30, 90, 270, 810, 2430):
        print(tp)
        half, third, quarter = tp // 2, tp // 3, tp // 4

        # Overlap studies (price-level outputs are divided by close)
        band_cols = [f'upperband{tp}', f'middleband{tp}', f'lowerband{tp}']
        tot_ohlcv[band_cols] = abstract.BBANDS(tot_ohlcv, timeperiod=tp, nbdevup=2.0, nbdevdn=2.0)
        for col in band_cols:
            tot_ohlcv[col] = tot_ohlcv[col] / close
        tot_ohlcv[f'dema{tp}']     = abstract.DEMA(tot_ohlcv, timeperiod=tp) / close
        tot_ohlcv[f'ema{tp}']      = abstract.EMA(tot_ohlcv, timeperiod=tp) / close
        tot_ohlcv[f'kama{tp}']     = abstract.KAMA(tot_ohlcv, timeperiod=tp) / close
        tot_ohlcv[f'midpoint{tp}'] = abstract.MIDPOINT(tot_ohlcv, timeperiod=tp) / close
        tot_ohlcv[f'midprice{tp}'] = abstract.MIDPRICE(tot_ohlcv, timeperiod=tp) / close
        tot_ohlcv[f't3{tp}']       = abstract.T3(tot_ohlcv, timeperiod=tp, vfactor=0.7) / close
        tot_ohlcv[f'tema{tp}']     = abstract.TEMA(tot_ohlcv, timeperiod=tp) / close
        tot_ohlcv[f'trima{tp}']    = abstract.TRIMA(tot_ohlcv, timeperiod=tp) / close
        tot_ohlcv[f'wma{tp}']      = abstract.WMA(tot_ohlcv, timeperiod=tp) / close

        # Momentum (not normalized unless divided by close).
        # fast/slow periods are passed in the conventional order (fast < slow);
        # the previous code had them reversed.
        tot_ohlcv[f'adx{tp}']      = abstract.ADX(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'adxr{tp}']     = abstract.ADXR(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'apo{tp}']      = abstract.APO(tot_ohlcv, fastperiod=half, slowperiod=tp) / close
        # TA-Lib returns AROON outputs as (aroondown, aroonup); the assignment
        # is positional, so the column names must follow that order.
        tot_ohlcv[[f'aroondown{tp}', f'aroonup{tp}']] \
                                   = abstract.AROON(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'aroonosc{tp}'] = abstract.AROONOSC(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'cci{tp}']      = abstract.CCI(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'cmo{tp}']      = abstract.CMO(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'dx{tp}']       = abstract.DX(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[[f'macd{tp}', f'macds{tp}', f'macdh{tp}']] \
                                   = abstract.MACD(tot_ohlcv, fastperiod=half, slowperiod=tp, signalperiod=third)
        tot_ohlcv[f'mfi{tp}']      = abstract.MFI(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'minus_di{tp}'] = abstract.MINUS_DI(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'minus_dm{tp}'] = abstract.MINUS_DM(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'mom{tp}']      = abstract.MOM(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'plus_di{tp}']  = abstract.PLUS_DI(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'plus_dm{tp}']  = abstract.PLUS_DM(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'ppo{tp}']      = abstract.PPO(tot_ohlcv, fastperiod=half, slowperiod=tp)
        tot_ohlcv[f'roc{tp}']      = abstract.ROC(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'rsi{tp}']      = abstract.RSI(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[[f'slowk{tp}', f'slowd{tp}']] \
                                   = abstract.STOCH(tot_ohlcv, fastk_period=tp, slowk_period=half, slowd_period=half)
        tot_ohlcv[[f'fastk{tp}', f'fastd{tp}']] \
                                   = abstract.STOCHF(tot_ohlcv, fastk_period=tp, fastd_period=half)
        tot_ohlcv[[f'fastkrsi{tp}', f'fastdrsi{tp}']] \
                                   = abstract.STOCHRSI(tot_ohlcv, timeperiod=tp, fastk_period=tp, fastd_period=half)
        tot_ohlcv[f'trix{tp}']     = abstract.TRIX(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'ultosc{tp}']   = abstract.ULTOSC(tot_ohlcv, timeperiod1=quarter, timeperiod2=half, timeperiod3=tp)
        tot_ohlcv[f'willr{tp}']    = abstract.WILLR(tot_ohlcv, timeperiod=tp)

        # Volume
        tot_ohlcv[f'adosc{tp}']    = abstract.ADOSC(tot_ohlcv, fastperiod=third, slowperiod=tp)

        # Volatility / statistics. The period suffix keeps one column per
        # period; without it every iteration overwrote the previous one.
        tot_ohlcv[f'natr{tp}']      = abstract.NATR(tot_ohlcv, timeperiod=tp)
        tot_ohlcv[f'reg_inter{tp}'] = abstract.LINEARREG_INTERCEPT(tot_ohlcv, timeperiod=tp) / close
        tot_ohlcv[f'reg_slope{tp}'] = abstract.LINEARREG_SLOPE(tot_ohlcv, timeperiod=tp)

    # Overlap studies without a period parameter
    tot_ohlcv[['mama', 'fama']] = abstract.MAMA(tot_ohlcv, fastlimit=0.5, slowlimit=0.05)
    tot_ohlcv['mama'] = tot_ohlcv['mama'] / close
    tot_ohlcv['fama'] = tot_ohlcv['fama'] / close
    tot_ohlcv['ht']   = abstract.HT_TRENDLINE(tot_ohlcv) / close
    tot_ohlcv['sar']  = abstract.SAR(tot_ohlcv, acceleration=0.02, maximum=0.2) / close

    # Momentum
    tot_ohlcv['bop'] = abstract.BOP(tot_ohlcv)

    # Volume (left unnormalized; whether they need scaling is still open)
    tot_ohlcv['ad']  = abstract.AD(tot_ohlcv)
    tot_ohlcv['obv'] = abstract.OBV(tot_ohlcv)

    # Volatility
    tot_ohlcv['trange'] = abstract.TRANGE(tot_ohlcv) / close

    # Cycle indicators
    tot_ohlcv['ht_dcperiod'] = abstract.HT_DCPERIOD(tot_ohlcv)
    tot_ohlcv['ht_dcphase']  = abstract.HT_DCPHASE(tot_ohlcv)
    tot_ohlcv[['ht_inphase', 'ht_quad']]   = abstract.HT_PHASOR(tot_ohlcv)
    tot_ohlcv[['ht_sine', 'ht_leadsine']]  = abstract.HT_SINE(tot_ohlcv)
    tot_ohlcv['ht_trendmode'] = abstract.HT_TRENDMODE(tot_ohlcv)

    # One column per candlestick pattern-recognition function
    for fn in talib.get_function_groups()['Pattern Recognition']:
        tot_ohlcv[fn] = abstract.Function(fn)(tot_ohlcv)

    ######### Feature generation: end

    # Drop the raw columns. drop() returns a new frame, so the result must be
    # assigned back (the previous call discarded it and dropped nothing).
    tot_ohlcv = tot_ohlcv.drop(columns=['open', 'high', 'low', 'close', 'value'])

    # Prefix every column with the ticker so coins do not collide in tot_set
    tot_ohlcv = tot_ohlcv.add_prefix(key + '-')

    # Merge into tot_set
    tot_set = tot_set.join(tot_ohlcv)

    # NOTE(review): this stops after the first coin that passes the checks
    # above. It looks like a quick-test limiter; remove it to process every
    # coin, keeping in mind the memory cost of hundreds of columns per coin.
    break
    

10
30
90
270
810
2430


In [13]:
# Reduced to the 102 coins that have data for the full 7 months (11 coins removed)
tot_set.columns

Index(['KRW-BTC-open', 'KRW-BTC-high', 'KRW-BTC-low', 'KRW-BTC-close',
       'KRW-BTC-volume', 'KRW-BTC-value', 'KRW-ETH-open', 'KRW-ETH-high',
       'KRW-ETH-low', 'KRW-ETH-close',
       ...
       'KRW-STX-low', 'KRW-STX-close', 'KRW-STX-volume', 'KRW-STX-value',
       'KRW-XEC-open', 'KRW-XEC-high', 'KRW-XEC-low', 'KRW-XEC-close',
       'KRW-XEC-volume', 'KRW-XEC-value'],
      dtype='object', length=612)

3. 항목 생성

In [17]:
import talib
from talib import abstract

In [86]:
# Try the overlap-study indicators on a copy of the BTC frame for one period.
df = ohlcvs['KRW-BTC'].copy()
tp = 50
for i in range(1):
    print(tp)
    df[[f'upperband{tp}', f'middleband{tp}', f'lowerband{tp}']] = abstract.BBANDS(
        df, timeperiod=tp, nbdevup=2.0, nbdevdn=2.0
    )
    df[f'dema{tp}'] = abstract.DEMA(df, timeperiod=tp)
    df[f'ema{tp}'] = abstract.EMA(df, timeperiod=tp)
    df[f'kama{tp}'] = abstract.KAMA(df, timeperiod=tp)
    df[f'midpoint{tp}'] = abstract.MIDPOINT(df, timeperiod=tp)
    df[f'midprice{tp}'] = abstract.MIDPRICE(df, timeperiod=tp)
    # MAVP : ?? (not applied here)
    # SMA : same as middleband
    df[f't3{tp}'] = abstract.T3(df, timeperiod=tp, vfactor=0.7)
    df[f'tema{tp}'] = abstract.TEMA(df, timeperiod=tp)
    df[f'trima{tp}'] = abstract.TRIMA(df, timeperiod=tp)
    df[f'wma{tp}'] = abstract.WMA(df, timeperiod=tp)

    # Double the period for the next pass (only one pass runs at the moment)
    tp *= 2

# Indicators that take no period argument
df[['mama', 'fama']] = abstract.MAMA(df, fastlimit=0.5, slowlimit=0.05)
df['ht'] = abstract.HT_TRENDLINE(df)
df['sar'] = abstract.SAR(df, acceleration=0.02, maximum=0.2)
# df.to_excel('test.xlsx')

50


In [114]:

from ta import add_all_ta_features

# Load the data: a copy of the BTC frame, which must carry OHLC and volume columns.
df = ohlcvs['KRW-BTC'].copy()


# Add every `ta` feature to the frame, with NaN values filled (fillna=True).
df = add_all_ta_features(
    df,
    open="open",
    high="high",
    low="low",
    close="close",
    volume="volume",
    fillna=True,
)

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [115]:
# Show the column labels of df after the features were added.
df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'value', 'volume_adi',
       'volume_obv', 'volume_cmf', 'volume_fi', 'volume_em', 'volume_sma_em',
       'volume_vpt', 'volume_vwap', 'volume_mfi', 'volume_nvi',
       'volatility_bbm', 'volatility_bbh', 'volatility_bbl', 'volatility_bbw',
       'volatility_bbp', 'volatility_bbhi', 'volatility_bbli',
       'volatility_kcc', 'volatility_kch', 'volatility_kcl', 'volatility_kcw',
       'volatility_kcp', 'volatility_kchi', 'volatility_kcli',
       'volatility_dcl', 'volatility_dch', 'volatility_dcm', 'volatility_dcw',
       'volatility_dcp', 'volatility_atr', 'volatility_ui', 'trend_macd',
       'trend_macd_signal', 'trend_macd_diff', 'trend_sma_fast',
       'trend_sma_slow', 'trend_ema_fast', 'trend_ema_slow',
       'trend_vortex_ind_pos', 'trend_vortex_ind_neg', 'trend_vortex_ind_diff',
       'trend_trix', 'trend_mass_index', 'trend_dpo', 'trend_kst',
       'trend_kst_sig', 'trend_kst_diff', 'trend_ichimoku_conv',
       'tre

In [116]:
# List the function names in TA-Lib's 'Pattern Recognition' group.
talib.get_function_groups()['Pattern Recognition']

['CDL2CROWS',
 'CDL3BLACKCROWS',
 'CDL3INSIDE',
 'CDL3LINESTRIKE',
 'CDL3OUTSIDE',
 'CDL3STARSINSOUTH',
 'CDL3WHITESOLDIERS',
 'CDLABANDONEDBABY',
 'CDLADVANCEBLOCK',
 'CDLBELTHOLD',
 'CDLBREAKAWAY',
 'CDLCLOSINGMARUBOZU',
 'CDLCONCEALBABYSWALL',
 'CDLCOUNTERATTACK',
 'CDLDARKCLOUDCOVER',
 'CDLDOJI',
 'CDLDOJISTAR',
 'CDLDRAGONFLYDOJI',
 'CDLENGULFING',
 'CDLEVENINGDOJISTAR',
 'CDLEVENINGSTAR',
 'CDLGAPSIDESIDEWHITE',
 'CDLGRAVESTONEDOJI',
 'CDLHAMMER',
 'CDLHANGINGMAN',
 'CDLHARAMI',
 'CDLHARAMICROSS',
 'CDLHIGHWAVE',
 'CDLHIKKAKE',
 'CDLHIKKAKEMOD',
 'CDLHOMINGPIGEON',
 'CDLIDENTICAL3CROWS',
 'CDLINNECK',
 'CDLINVERTEDHAMMER',
 'CDLKICKING',
 'CDLKICKINGBYLENGTH',
 'CDLLADDERBOTTOM',
 'CDLLONGLEGGEDDOJI',
 'CDLLONGLINE',
 'CDLMARUBOZU',
 'CDLMATCHINGLOW',
 'CDLMATHOLD',
 'CDLMORNINGDOJISTAR',
 'CDLMORNINGSTAR',
 'CDLONNECK',
 'CDLPIERCING',
 'CDLRICKSHAWMAN',
 'CDLRISEFALL3METHODS',
 'CDLSEPARATINGLINES',
 'CDLSHOOTINGSTAR',
 'CDLSHORTLINE',
 'CDLSPINNINGTOP',
 'CDLSTALLEDPATTERN',
 