In [1]:
import xml.etree.ElementTree as elemTree
import os
import sys
# Read the project's working directory from the XML config and put it on
# sys.path (presumably so the project-local packages imported in later cells
# resolve -- confirm against the repository layout).
tree = elemTree.parse(r'../AIFT2022_Wiki/config/.config.xml')
root = tree.getroot()
xx = root.find('./PATHS')
# Fail with an explicit message instead of an opaque AttributeError on None
# when the <PATHS> section or its <work> entry is absent or empty.
work_path = xx.findtext('work') if xx is not None else None
if not work_path:
  raise ValueError("config is missing a non-empty PATHS/work entry")
# Re-running this cell must not keep stacking the same entry onto sys.path.
if work_path not in sys.path:
  sys.path.append(work_path)

In [2]:
from miscs.config_manager import ConfigManager

In [3]:
import talib
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from tqdm.auto import tqdm
from realtime_kiwoom.data_provider import *
import re
import plotly.express as px

In [5]:
# Build the project config accessor from the same XML file that the first cell parsed.
cm = ConfigManager('../AIFT2022_Wiki/config/.config.xml')

In [6]:
# Echo the main configuration values as a quick sanity check.
for getter in (
  cm.get_work_path,
  cm.get_database,
  cm.get_tables,
  cm.retrieve_candidate_ETFs,
):
  print(getter())

C:\Users\이연주\Desktop\AIFT\AIFT2022_Wiki
{'database': 'db\\kiwoom_db.sqlite3'}
{'history': {'table_name': 'data_in_minute', 'drop_table': False}, 'today': {'table_name': 'today_in_minute', 'drop_table': True}}
[('069500', 'KONDEX 200', 'X'), ('114800', 'KODEX 인버스', 'Y')]


In [7]:
# csv_paths = [ 
#   ('069500', r'..\AIFT2022_Wiki\data\kodex_200.csv'),
#   ('114800', r'..\AIFT2022_Wiki\data\kodex_inverse.csv'),
#   ('226490', r'..\AIFT2022_Wiki\data\kodex_kospi.csv'),
#   ('001', r'..\AIFT2022_Wiki\data\kospi.csv'),
#   ('201', r'..\AIFT2022_Wiki\data\kospi200.csv')
# ]

# 일년치 분봉 데이터 (오늘 제외 = 어제까지의 데이터임)

In [8]:
# Create the minute-chart data provider for the 'history' tag.
# NOTE(review): presumably the tag selects the 'history' table entry of the
# config -- confirm in realtime_kiwoom.data_provider.
history_provider = MinuteChartDataProvider.Factory(cm, tag='history')

In [9]:
%%time
# Load the minute bars requested with n_days=365 (the exact cut-off is decided
# by the provider). The result is used below as a dict: stock code -> DataFrame.
history_minute_dic = history_provider.get_history_from_ndays_ago(n_days=365)

CPU times: total: 49.3 s
Wall time: 49.6 s


In [10]:
# Inspect the loaded mapping (stock code -> minute-bar DataFrame).
history_minute_dic

{'069500':                           st_code   open   high    low  close  volume
 dt                                                                   
 2021-11-15 09:00:00+09:00  069500  38459  38459  38435  38435   50144
 2021-11-15 09:01:00+09:00  069500  38439  38464  38415  38449   23280
 2021-11-15 09:02:00+09:00  069500  38454  38488  38454  38469   45724
 2021-11-15 09:03:00+09:00  069500  38474  38503  38410  38410   37723
 2021-11-15 09:04:00+09:00  069500  38415  38430  38390  38400   16784
 ...                           ...    ...    ...    ...    ...     ...
 2022-11-11 15:17:00+09:00  069500  32375  32380  32365  32380   40137
 2022-11-11 15:18:00+09:00  069500  32385  32385  32365  32370   34866
 2022-11-11 15:19:00+09:00  069500  32365  32380  32365  32380    5633
 2022-11-11 15:30:00+09:00  069500  32395  32395  32395  32395  111692
 2022-11-11 15:35:00+09:00  069500  32395  32395  32395  32395    6403
 
 [93911 rows x 6 columns],
 '114800':                           s

# 여러 수치 구하기

In [13]:
def make_basic_features(df: pd.DataFrame):
  """
  Add technical-indicator columns and an intraday position column to ``df``.

  ``df`` is modified in place and nothing is returned. It must provide the
  columns ``high``, ``low``, ``close`` and ``volume`` and a datetime-like index.

  Columns written:
    ma               talib.MA of close with timeperiod=30
    macd, macdsignal, macdhist
                     the three outputs of talib.MACD of close
    rsi              talib.RSI of close with timeperiod=14
    ad               talib.AD of high, low, close and volume
    offset_intra_day time elapsed since 09:00 of the row's own day, expressed
                     as a fraction of 6.5 hours (0.0 at 09:00, 1.0 at 15:30)
  """
  close = df['close']
  macd_line, macd_signal, macd_hist = talib.MACD(close)

  # Everything is computed from the raw columns before any column is written.
  indicators = {
    'ma': talib.MA(close, timeperiod=30),
    'macd': macd_line,
    'macdsignal': macd_signal,
    'macdhist': macd_hist,
    'rsi': talib.RSI(close, timeperiod=14),
    'ad': talib.AD(df['high'], df['low'], close, df['volume']),
  }
  for name, values in indicators.items():
    df[name] = values

  # Elapsed time since 09:00, scaled by the length of a 6.5-hour window in seconds.
  since_midnight = df.index - df.index.floor('D')
  since_nine = since_midnight - pd.Timedelta('9h')
  df['offset_intra_day'] = (since_nine.total_seconds()/(60*60*6.5)).values

In [14]:
def make_window_features(df: pd.DataFrame, cols=('ma', 'macd', 'macdsignal', 'macdhist', 'rsi', 'ad'), window_size=10):
  """
  Add, for every column in ``cols``, its deviation from its own recent average.

  ``df`` is modified in place and nothing is returned. For each ``col`` a new
  column ``f'{col}_w'`` is written holding the current value minus the mean of
  the ``window_size`` rows that precede the current row (the current row is
  excluded from that mean). Rows without a full preceding window are NaN.

  Parameters:
    df: frame that already contains every column named in ``cols``.
    cols: iterable of column names; the default is an immutable tuple so the
      shared default object can never be modified between calls.
    window_size: number of preceding rows that are averaged.
  """
  for col in cols:
    # shift(1) moves the rolling mean down one row so it only covers the past.
    trailing_mean = df[col].rolling(window=window_size).mean().shift(1)
    df[f'{col}_w'] = df[col] - trailing_mean

In [15]:
def make_binary_dt_features(df: pd.DataFrame):
  """
  Add boolean flags marking rows at 09:00 and rows just before a 09:00 row.

  ``df`` is modified in place and nothing is returned. Its index must be
  datetime-like; the index does not need any particular name.

  Columns written:
    ts_end   True when the *next* row's timestamp is exactly 09:00. The last
             row has no successor and is always False.
    ts_start True when the row's own timestamp is exactly 09:00.

  NOTE(review): only the literal time 09:00 is recognised, so a day whose
  first row is at any other time gets no ts_start / ts_end flag.
  """
  # Vectorised hour/minute test on the index instead of a per-row lambda.
  at_nine = np.asarray((df.index.hour == 9) & (df.index.minute == 0), dtype=bool)

  # ts_end is ts_start looked up one row ahead; pad the tail with False.
  before_nine = np.zeros(len(at_nine), dtype=bool)
  before_nine[:-1] = at_nine[1:]

  df['ts_end'] = before_nine
  df['ts_start'] = at_nine

In [16]:
def make_binary_close_indicators(df: pd.DataFrame):
  """
  Flag rows whose close is above the last close of the previous day in ``df``.

  ``df`` is modified in place and nothing is returned. It needs a ``close``
  column and a datetime-like index.

  Column written:
    is_higher  True / False for rows that have a previous day to compare
               with, NaN for rows of the earliest day (no reference close).
               The column is object dtype whenever NaN is present.
  """
  # One calendar-day key per row, computed once and reused for both steps.
  day_key = df.index.strftime('%Y-%m-%d')

  # Last close of each day, moved down one day => "previous day's close".
  daily_prev_close = df.groupby(day_key).close.last().shift(1)
  prev_close = pd.Series(day_key.map(daily_prev_close).values, index=df.index)

  is_higher = prev_close < df['close']
  no_reference = prev_close.isna()
  if no_reference.any():
    # Switch to object dtype *before* inserting NaN: writing NaN into a bool
    # column through .loc relies on a silent upcast that pandas deprecates.
    is_higher = is_higher.astype(object)
    is_higher.loc[no_reference] = np.nan
  df['is_higher'] = is_higher

In [17]:
def make_binary_indicators(df: pd.DataFrame):
  """
  Run both binary-indicator builders on ``df``, which is modified in place.

  The timestamp-based flags are added first, then the close-based flag.
  """
  for add_indicator in (make_binary_dt_features, make_binary_close_indicators):
    add_indicator(df)

In [18]:
def make_target(df: pd.DataFrame, window_size=10):
  """
  Add a forward-looking ``target`` column to ``df``, which is modified in place.

  For every row, ``target`` is the mean ``close`` of the ``window_size`` rows
  that follow it, divided by the row's own ``close``. Rows that do not have
  ``window_size`` following rows get NaN.
  """
  close = df['close']
  # The rolling mean ending window_size rows ahead covers exactly the next window_size rows.
  forward_mean = close.rolling(window=window_size).mean().shift(-window_size)
  df['target'] = forward_mean / close

In [19]:
# Enrich every per-symbol frame in place. The steps run in the same order as
# before for each frame: the window features read the columns written by the
# basic-feature step, so that step has to come first.
for code, df in history_minute_dic.items():
  make_basic_features(df)
  make_window_features(df)
  make_binary_indicators(df)
  make_target(df, window_size=60)

In [20]:
# Sanity check of the shift used for the target: the rolling mean shifted back
# by window_size rows must equal the unshifted rolling mean window_size rows later.
# NOTE(review): `df` is whatever frame the preceding feature loop left bound.
window_size=60
a0 = df.close
close_ma = a0.rolling(window_size).mean()
a1 = close_ma.rename('ma')
a2 = close_ma.shift(-window_size).rename('shifted')
yy = pd.concat((a0, a1, a2), axis=1)[-200:]
assert yy.iloc[window_size].ma == yy.iloc[0].shifted

In [21]:
# Keep only the engineered columns of each symbol, then align the two symbols
# on their shared timestamps. Every column name exists in both frames, so the
# left frame's columns get the suffix _x and the right frame's get _y.
new_cols = [
  'ma_w', 'macd_w', 'macdsignal_w', 'macdhist_w', 'rsi_w', 'ad_w',
  'ts_end', 'ts_start', 'is_higher', 'offset_intra_day', 'target',
]
compact_minute_dic = {
  code: frame[new_cols] for code, frame in history_minute_dic.items()
}
merged_df = compact_minute_dic['069500'].merge(
  compact_minute_dic['114800'],
  left_index=True,
  right_index=True,
  suffixes=('_x', '_y'),
)

In [22]:
# Three-way label from the two forward targets:
#   'X'   when target_x is above 1 + up-threshold and target_y below 1 - down-threshold
#   'Y'   when target_y is above 1 + up-threshold and target_x below 1 - down-threshold
#   'NOP' otherwise
# NOTE(review): comparisons with NaN are False, so rows whose target is NaN
# also end up labelled 'NOP'.
decision_up_threshold=0.0025
decision_down_threshold=0.0020
x_up = merged_df.target_x > 1 + decision_up_threshold
x_down = merged_df.target_x < 1 - decision_down_threshold
y_up = merged_df.target_y > 1 + decision_up_threshold
y_down = merged_df.target_y < 1 - decision_down_threshold
merged_df['label'] = 'NOP'
merged_df.loc[x_up & y_down, 'label'] = 'X'
merged_df.loc[x_down & y_up, 'label'] = 'Y'
merged_df['label'] = merged_df.label.astype('category')

In [23]:
# Move every column (features, targets and label alike) down by one row, so
# the row at each timestamp holds the values computed for the previous row;
# the first row becomes all-NaN.
# NOTE(review): this cell is not idempotent -- each re-run shifts by one more row.
merged_df = merged_df.shift(1)

In [24]:
# Share of each label among the labelled rows.
merged_df.label.value_counts(normalize=True)

NOP    0.806631
Y      0.103899
X      0.089470
Name: label, dtype: float64

In [25]:
# Absolute number of rows per label.
merged_df.label.value_counts(normalize=False)

NOP    75416
Y       9714
X       8365
Name: label, dtype: int64

In [26]:
# Persist the dataset to a pickle file in the current directory and load it
# straight back, so merged_df is the round-tripped copy from here on.
merged_pickle_path = '.merged_for_baseline_df.pkl'
merged_df.to_pickle(merged_pickle_path)
merged_df = pd.read_pickle(merged_pickle_path)