# BA2plus Team Submission

## 필수 기재내용

### 전체 프로세스 개요:

- 자체제작한, 별도의 크롤링 패키지로 KRX/Naver 에서 가격, 거개량, 유동성, 시총 등의 데이터를 가져옴. 
    - 코드: https://github.com/jaepil-choi/korquanttools
    - 각각의 데이터를 `.pickle` 로 저장해 본 코드에서 써먹음 
    - pickle data, train dataset을 받아올 수 있는 google drive 첨부: https://drive.google.com/drive/folders/1fR9fiogdhdktHuX_LbgBdMxfgHSkFT1y?usp=sharing
- 데이터를 불러와 public 기간 전까지 자르고, 데이터를 조합하여 새로운 변수를 만듦
    - 따라서 look-ahead 없음
    - 새로운 변수: `close t-1`, `close t-3`, `close t-5`, `normalized rdv/adv`
    - 각 세팅은 별도의 `submission_config.py` 모듈로 관리. (뒷부분 첨부)
- 모델에 넣고 돌림 
    - XGB + LGBM base model 
    - step 1, step 2, ... , step 15에 대해 따로 예측함 
    - 시그널 만듦 
- submission 형식에 맞게 변환
    - `submission_util.py` 모듈로 형식에 맞게 변환함. (뒷부분 첨부)

### 코드 실행환경 및 실행방법
- 코드 실행환경
    - python 3.9
    - xgboost, sklearn, lightgbm, tqdm, pandas, numpy 필요
    - prophet, pycaret, catboost, pandas_ta 필요
- 실행방법 (중요)
    - .ipynb만 제출할 수 있다는 대회 제약 때문에 부득이 .py 모듈을 후반부에 첨부. 이 파일들이 있어야 코드가 돌아감. 
    - 코드를 실행하려면 drive 링크의 pickle 파일들을 받아 `/data` 폴더에 넣고, output을 넣을 `/output` 폴더도 만들어줘야 함.
    - 또한, `/data` 폴더 내에 `train.csv`, `train_additional.csv`파일을 넣어야 함.
    - 그리고 노트북과 같은 폴더 안에 `submission_config.py` 와 `submission_config.py`, `sample_submission.csv` 파일이 위치해야 함. 


나머지 과정은 아래 markdown 참고 부탁드립니다.

# Alpha Stretagy by Jaepil

## Basic settings

### Import libraries

In [1]:
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor

from tqdm import tqdm

import warnings

# Suppress Python warnings
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path

from sklearn.impute import SimpleImputer

#import statsmodels.api as sm
#import statsmodels.formula.api as smf

In [3]:
## custom library

import eda_util as eutil
import submission_config as subconfig
import submission_util as subutil

In [4]:
pd.set_option('display.float_format', lambda x: f'{x:,g}')

In [5]:
BASE_PATH = subconfig.BASE_PATH
DATA_PATH = subconfig.DATA_PATH

OUTPUT_PATH = subconfig.OUTPUT_PATH

### Import data & preprocessing

In [6]:
krx_df = pd.read_csv(subconfig.krx_df_PATH)

In [7]:
krx_df.columns = ['date', 'code', 'name', 'volume', 'open', 'high', 'low', 'close']

In [8]:
krx_df['date'] = pd.to_datetime(krx_df['date'], format='%Y%m%d')

In [9]:
krx_df

Unnamed: 0,date,code,name,volume,open,high,low,close
0,2021-06-01,A060310,3S,166690,2890,2970,2885,2920
1,2021-06-01,A095570,AJ네트웍스,63836,5860,5940,5750,5780
2,2021-06-01,A006840,AK홀딩스,103691,35500,35600,34150,34400
3,2021-06-01,A054620,APS,462544,14600,14950,13800,14950
4,2021-06-01,A265520,AP시스템,131987,29150,29150,28800,29050
...,...,...,...,...,...,...,...,...
987995,2023-05-30,A189980,흥국에프엔비,272284,3005,3035,2955,2980
987996,2023-05-30,A000540,흥국화재,50218,3250,3255,3195,3215
987997,2023-05-30,A003280,흥아해운,130664,1344,1395,1340,1370
987998,2023-05-30,A037440,희림,141932,9170,9260,9170,9200


In [10]:
return_df = pd.read_pickle(subconfig.return_df_PATH)
close_df = pd.read_pickle(subconfig.adjclose_df_PATH)

In [11]:
open_df = pd.read_pickle(subconfig.adjopen_df_PATH)
high_df = pd.read_pickle(subconfig.adjhigh_df_PATH)
low_df = pd.read_pickle(subconfig.adjlow_df_PATH)

In [12]:
## date list

holidays = return_df.isnull().all(axis=1)
tradingdays = ~holidays

holidays = holidays.index[holidays]
tradingdays = tradingdays.index[tradingdays]

In [13]:
TRAIN_START = pd.to_datetime(subconfig.TRAIN_START, format='%Y-%m-%d')
REALOS_PORTFOLIO_DATE = pd.to_datetime(subconfig.REALOS_PORTFOLIO_DATE, format='%Y-%m-%d')

In [14]:
tradingdays = tradingdays[(tradingdays >= TRAIN_START) & (tradingdays <= REALOS_PORTFOLIO_DATE)]

In [15]:
dacon_sid_list = [ii[1:] for ii in krx_df['code'].unique()]

In [16]:
return_df = return_df.loc[tradingdays, :].dropna(axis='columns', how='all')
return_df = return_df.loc[:, dacon_sid_list]

close_df = close_df.loc[tradingdays, :].dropna(axis='columns', how='all')
close_df = close_df.loc[:, dacon_sid_list]

In [17]:
open_df = open_df.loc[tradingdays, :].dropna(axis='columns', how='all')
open_df = open_df.loc[:, dacon_sid_list]

high_df = high_df.loc[tradingdays, :].dropna(axis='columns', how='all')
high_df = high_df.loc[:, dacon_sid_list]

low_df = low_df.loc[tradingdays, :].dropna(axis='columns', how='all')
low_df = low_df.loc[:, dacon_sid_list]

In [18]:
# SIMOS_START = subconfig.SIMOS_START
# simOS_END = subconfig.SIMOS_END

### Import additional data

In [19]:
volume_df = pd.read_pickle(subconfig.volume_df_PATH)
dollarvolume_df = pd.read_pickle(subconfig.dollarvolume_df_PATH)
marketcap_df = pd.read_pickle(subconfig.marketcap_df_PATH)
market_cat_df = pd.read_pickle(DATA_PATH / 'market_cat_df_20140101_20230730.pickle')

In [20]:
volume_df = volume_df.loc[tradingdays, :].dropna(axis='columns', how='all')
volume_df = volume_df.loc[:, dacon_sid_list]

dollarvolume_df = dollarvolume_df.loc[tradingdays, :].dropna(axis='columns', how='all')
dollarvolume_df = dollarvolume_df.loc[:, dacon_sid_list]

marketcap_df = marketcap_df.loc[tradingdays, :].dropna(axis='columns', how='all')
marketcap_df = marketcap_df.loc[:, dacon_sid_list]

In [21]:
# Don't shift data since Insoo's code already makes a shift. 

# volume_df = volume_df.shift(1)
# dollarvolume_df = dollarvolume_df.shift(1)
# marketcap_df = marketcap_df.shift(1)

In [22]:
market_cat_inrange = market_cat_df[market_cat_df['trdDd'].isin(tradingdays)]

In [23]:
KOSPI_sid_list = market_cat_inrange[market_cat_inrange['is_KOSPI'] == True]['ISU_SRT_CD'].unique()
KOSDAQ_sid_list = market_cat_inrange[market_cat_inrange['is_KOSDAQ'] == True]['ISU_SRT_CD'].unique()
KONEX_sid_list = market_cat_inrange[market_cat_inrange['is_KONEX'] == True]['ISU_SRT_CD'].unique()

### Parameters

In [24]:
REALOS_PORTFOLIO_DATE = subconfig.REALOS_PORTFOLIO_DATE

RDVADV_WINDOW = subconfig.WINDOWS['rdvadv'] # 20

### normalized RDV/ADV signal

In [25]:
adv_df = dollarvolume_df.rolling(RDVADV_WINDOW, ).mean().dropna(axis='rows', how='all')

분모: average RDV/ADV ratio


In [26]:
avg_adv_s = adv_df.mean(axis='columns')
avg_rdv_s = dollarvolume_df.iloc[RDVADV_WINDOW:, :].mean(axis='columns')

In [27]:
avg_rdvadv_s = avg_rdv_s / avg_adv_s

분자: individual RDV/ADV ratio 

In [28]:
ii_rdvadv_df = dollarvolume_df.iloc[RDVADV_WINDOW:, :] / adv_df

In [29]:
normalized_rdvadv_signal_df = ii_rdvadv_df.divide(avg_rdvadv_s, axis='rows')
normalized_rdvadv_signal_df

ISU_SRT_CD,060310,095570,006840,054620,265520,211270,027410,282330,126600,138930,...,243070,084110,145020,024060,010240,189980,000540,003280,037440,238490
trdDd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-28,,,,,,,,,,,...,,,,,,,,,,
2021-06-29,0.415442,0.240279,0.405709,0.726369,0.475821,0.907576,0.4201,1.10324,0.539159,0.755196,...,1.81974,0.798739,0.631614,0.340136,14.3229,0.996211,0.600184,,0.369902,0.630198
2021-06-30,0.314977,0.212304,0.465795,0.931287,0.470661,0.514831,0.648498,1.99712,0.354948,0.501368,...,4.39418,8.5227,0.745765,0.319291,7.08257,0.416926,1.40616,,0.291685,0.618186
2021-07-01,0.399208,0.295054,0.570299,0.366269,0.496024,0.584884,1.26369,3.66506,0.356092,0.701445,...,1.03469,1.22015,0.426687,0.971435,1.91106,0.786424,0.705502,,0.495063,0.955606
2021-07-02,1.02063,0.146543,0.639072,0.410245,0.842014,0.260692,1.97162,1.5627,0.724157,0.519005,...,2.03734,0.881352,0.244086,2.64395,0.377858,0.422299,0.421016,,1.13061,0.666445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-29,0.0976835,1.22697,0.955555,0.422123,1.01222,0.761076,1.2107,0.621806,1.16368,1.14935,...,0.709307,1.64333,0.83523,0.547578,0.807826,0.415824,0.410286,0.194055,0.786802,0.0887856
2023-06-30,0.147166,1.07672,1.07257,1.24508,0.730709,0.917454,0.876199,1.67797,0.760802,5.40722,...,0.683091,0.970563,0.932644,0.530727,0.929146,0.386497,0.159065,0.251946,0.635382,0.124265
2023-07-03,0.126083,0.736811,0.419303,0.357989,0.911999,0.542547,0.708495,0.601649,0.599315,1.19896,...,0.386994,0.829056,0.803878,0.771279,0.757736,0.358212,0.45834,2.51959,0.44265,0.111932
2023-07-04,0.690671,0.612167,0.367826,0.320846,0.533727,0.558628,1.10086,1.24825,0.279295,0.660552,...,0.345665,1.1905,0.558881,0.674143,0.942419,0.361271,0.72546,2.07744,0.61894,2.05056


In [30]:
MODEL_TRAIN_START = pd.to_datetime('2021-06-29', format='%Y-%m-%d')

## Alphas

### Integrating my data with Insoo's code

In [31]:
# Your function to calculate SMAPE
def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

In [32]:
results_df = pd.DataFrame(columns=['code'] + ['return_day_' + str(i) for i in range(1, 16)])

preds_df_fin_xgb = pd.DataFrame()
smapes_df_fin_xgb = pd.DataFrame()

preds_df_fin_lgbm = pd.DataFrame()
smapes_df_fin_lgbm = pd.DataFrame()


아래 코드는 Ryzen 5 5600X 6 Core (CPU 12) 로 돌렸을 때 

40분 가량 걸림. 

Windows에서 GPU 연산은 활용하기 어려움. 

- XGB: conda는 지원안함, Windows는 version conflict 남
- LGBM: Linux만 지원

In [33]:
# Iterate over each unique stock
for code in tqdm(dacon_sid_list):
    
    # Filter by stock code
    # Note: All prices are adjusted
    # TODO: Add normalized rdvadv signal to the columns

    train_close = pd.DataFrame(
        data={
            'open': open_df.loc[MODEL_TRAIN_START:REALOS_PORTFOLIO_DATE, code],
            'high': high_df.loc[MODEL_TRAIN_START:REALOS_PORTFOLIO_DATE, code],
            'low': low_df.loc[MODEL_TRAIN_START:REALOS_PORTFOLIO_DATE, code],
            'close': close_df.loc[MODEL_TRAIN_START:REALOS_PORTFOLIO_DATE, code],
            'close_t-1': close_df.loc[MODEL_TRAIN_START:REALOS_PORTFOLIO_DATE, code].shift(1),
            'close_t-3': close_df.loc[MODEL_TRAIN_START:REALOS_PORTFOLIO_DATE, code].shift(3),
            'close_t-5': close_df.loc[MODEL_TRAIN_START:REALOS_PORTFOLIO_DATE, code].shift(5),
            'dollarvolume': dollarvolume_df.loc[MODEL_TRAIN_START:REALOS_PORTFOLIO_DATE, code],
            'marketcap': marketcap_df.loc[MODEL_TRAIN_START:REALOS_PORTFOLIO_DATE, code],
            'norm_rdvadv': normalized_rdvadv_signal_df.loc[MODEL_TRAIN_START:REALOS_PORTFOLIO_DATE, code],
        }
        )
    train_close = train_close.iloc[5:, :] # nan 있는 1st row 제거 

    # Store original data for reference
    original_data = train_close.copy()

    # Create return columns for each day
    returns = []
    smapes_xgb = []
    smapes_lgbm = []
    

    preds_df_xgb = pd.DataFrame()
    preds_df_lgbm = pd.DataFrame()
    
    
    # For each day from 1 to 15
    for day in range(1, 16):
        # Scale data
        X = train_close[:]
        y = train_close['close']
        
        scaler = MinMaxScaler(feature_range=(-1, 1))
        data_scaled = scaler.fit_transform(X)
        data_scaled2 = y
        
        X_train = data_scaled[:-day]
        y_train = data_scaled2[day:]
        X_test = data_scaled[-day]
        
        X_train = X_train[:int(len(X_train) * 0.9)]
        X_val = X_train[int(len(X_train) * 0.9):]
        y_train = y_train[:int(len(y_train) * 0.9)]
        y_val = y_train[int(len(y_train) * 0.9):] 

        # Train XGBoost
        xgb_model = XGBRegressor()
        xgb_model.fit(X_train, y_train)
        vals_xgb = xgb_model.predict(X_val)
        smapes_xgb.append(smape(y_val, vals_xgb))
        
        preds_xgb = xgb_model.predict([data_scaled[-day]])
        preds_df_xgb = pd.concat([preds_df_xgb, pd.DataFrame(preds_xgb)], axis = 0)

        # Train LightGBM
        lgbm_model = LGBMRegressor(verbose=-1)
        lgbm_model.fit(X_train, y_train)
        vals_lgbm = lgbm_model.predict(X_val)
        smapes_lgbm.append(smape(y_val, vals_lgbm))
        
        preds_lgbm = lgbm_model.predict([data_scaled[-day]])
        preds_df_lgbm = pd.concat([preds_df_lgbm, pd.DataFrame(preds_lgbm)], axis = 0)
    
    smapes_df_xgb = pd.DataFrame(smapes_xgb)
    smapes_df_lgbm = pd.DataFrame(smapes_lgbm)
    

    preds_df_fin_xgb = pd.concat([preds_df_fin_xgb, preds_df_xgb], axis = 1)
    smapes_df_fin_xgb = pd.concat([smapes_df_fin_xgb, smapes_df_xgb], axis = 1)

    preds_df_fin_lgbm = pd.concat([preds_df_fin_lgbm, preds_df_lgbm], axis = 1)
    smapes_df_fin_lgbm = pd.concat([smapes_df_fin_lgbm, smapes_df_lgbm], axis = 1)

100%|██████████| 2000/2000 [44:46<00:00,  1.34s/it]


In [34]:
# smapes_df_xgb.to_pickle(OUTPUT_PATH / 'smapes_df_xgb.pickle')
# smapes_df_lgbm.to_pickle(OUTPUT_PATH / 'smapes_df_lgbm.pickle')
# smapes_df_catboost.to_pickle(OUTPUT_PATH / 'smapes_df_catboost.pickle')

# preds_df_fin_xgb.to_pickle(OUTPUT_PATH / 'preds_df_fin_xgb.pickle')
# smapes_df_fin_xgb.to_pickle(OUTPUT_PATH / 'smapes_df_fin_xgb.pickle')

# preds_df_fin_lgbm.to_pickle(OUTPUT_PATH / 'preds_df_fin_lgbm.pickle')
# smapes_df_fin_lgbm.to_pickle(OUTPUT_PATH / 'smapes_df_fin_lgbm.pickle')

In [35]:
smapes_df_fin_xgb.shape

(15, 2000)

In [36]:
smapes_df_fin_lgbm.shape

(15, 2000)

In [37]:
final = np.zeros((len(smapes_df_fin_xgb), len(smapes_df_fin_xgb.columns)))

for i in range(0, len(smapes_df_fin_xgb.columns)):
    for j in range(0, len(smapes_df_fin_xgb)):
        weights = [1 / smapes_df_fin_xgb.iloc[j:j+1, i].values[0],
                   
                   1 / smapes_df_fin_lgbm.iloc[j:j+1, i].values[0]]
        

        weights /= np.sum(weights) 
        

        final[j][i] = weights[0] * preds_df_fin_xgb.iloc[j:j+1, i].values[0] \
                            + weights[1] * preds_df_fin_lgbm.iloc[j:j+1, i].values[0] \
                        #     + weights[2] * preds_df_fin_lgbm.iloc[j:j+1, i].values[0]

In [38]:
final_df = pd.DataFrame(final)
final_values = pd.DataFrame((final_df.iloc[-1] - final_df.iloc[0]) / final_df.iloc[0])
final_values_sharpe = -pd.DataFrame(((final_df.iloc[-1] - final_df.iloc[0]) / final_df.iloc[0]) / final_df.pct_change().std())

In [39]:
final_values_sharpe.index = dacon_sid_list
final_values_sharpe.columns = ['VALUE']
final_values_sharpe.reset_index(inplace = True)
final_values_sharpe.columns = ['종목코드', 'VALUE']

In [40]:
final_values_sharpe.set_index('종목코드', inplace = True)

In [41]:
final_values_sharpe['VALUE']

종목코드
060310   -0.781179
095570    -3.30379
006840     0.59483
054620    -3.08624
265520    -2.48063
            ...   
189980    -5.45334
000540    -2.44831
003280     1.46988
037440    0.865517
238490    -1.69745
Name: VALUE, Length: 2000, dtype: float64

## Submission

In [42]:
alpha_feat_insoo = subutil.Submission(
    alpha_series=final_values_sharpe['VALUE'],
    alpha_name='Final_JP',
)

In [43]:
alpha_feat_insoo.get_rank(export_path=OUTPUT_PATH)

Saved to C:\Users\asaf0\Desktop\Work\KRX\output\Final_JP.csv


Unnamed: 0_level_0,순위
종목코드,Unnamed: 1_level_1
A060310,201
A095570,202
A006840,203
A054620,204
A265520,205
...,...
A189980,1940
A000540,1797
A003280,1798
A037440,1799


# Alpha Strategy by Woohyuk


## Basic Settings

### Import libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from prophet import Prophet
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import pandas_ta as ta

import logging

logging.getLogger('prophet').setLevel(logging.ERROR)
logging.getLogger('cmdstanpy').setLevel(logging.ERROR)

import warnings

# Suppress Python warnings
warnings.simplefilter('ignore')

train = pd.read_csv('train.csv')

train

# 결과를 저장하기 위한 dataframe 생성
results_df = pd.DataFrame(columns=['종목코드', 'final_return'])

# train 데이터에 존재하는 독립적인 종목코드 추출
unique_codes = train['종목코드'].unique()

def create_features(df, n_days):
    df_new = df.copy()
    for feature in ['종가', '거래량', '시가', '고가', '저가']:
        for i in range(1, n_days+1):
            df_new[f'{feature}_lag_{i}'] = df_new[feature].shift(i)
    
    # Moving Averages
    df_new['SMA5'] = ta.sma(df_new['종가'], 5)
    df_new['SMA10'] = ta.sma(df_new['종가'], 10)
    
    # RSI
    df_new['RSI'] = ta.rsi(df_new['종가'])
    
    
    
    # Volatility
    df_new['VOLATILITY'] = df_new['종가'].rolling(window=n_days).std()
    
    # Momentum
    df_new['MOMENTUM'] = df_new['종가'] - df_new['종가'].shift(n_days)
    
    # Day of the Week
    df_new['DAY'] = df_new.index.dayofweek
    dummies = pd.get_dummies(df_new['DAY'], prefix='DAY')
    df_new = pd.concat([df_new, dummies], axis=1)
    df_new.drop(['DAY'], axis=1, inplace=True)
    
    return df_new

for code in tqdm(unique_codes):
    
    # 학습 데이터 생성
    train_data = train[train['종목코드'] == code][['일자', '종가', '거래량', '시가', '고가', '저가']]
    train_data['일자'] = pd.to_datetime(train_data['일자'], format='%Y%m%d')
    train_data.set_index('일자', inplace=True)
    train_data = create_features(train_data, 5)
    
    # Prophet 모델 학습 및 추론
    df_prophet = train_data[['종가']].reset_index().rename(columns={'일자':'ds', '종가':'y'})
    model_prophet = Prophet(yearly_seasonality=True, daily_seasonality=True)
    model_prophet.fit(df_prophet)
    future = model_prophet.make_future_dataframe(periods=15)
    forecast = model_prophet.predict(future)
    final_return_prophet = (forecast['yhat'].values[-1] - forecast['yhat'].values[-15]) / forecast['yhat'].values[-15]
    
    # Gradient Boosting 모델 학습 및 추론
    X = train_data.drop('종가', axis=1).values
    y = train_data['종가'].values
    model_gb = XGBRegressor(n_estimators=100, learning_rate=0.1)
    model_gb.fit(X, y)
    
    predictions = []
    X_last = X[-1, :]
    for _ in range(15):
        pred = model_gb.predict(X_last.reshape(1, -1))
        predictions.append(pred[0])
        X_last = np.roll(X_last, -1)
        X_last[-1] = pred
    final_return_gb = (predictions[-1] - predictions[0]) / predictions[0]
    
    # 두 모델의 결과를 앙상블
    final_return = (final_return_prophet + final_return_gb) / 2
    
    # 결과 저장
    results_df = results_df.append({'종목코드': code, 'final_return': final_return}, ignore_index=True)

results_df['순위'] = results_df['final_return'].rank(method='first', ascending=False).astype('int')
results_df

sample_submission = pd.read_csv('baseline_submission.csv')
sample_submission

baseline_submission = sample_submission[['종목코드']].merge(results_df[['종목코드', '순위']], on='종목코드', how='left')
baseline_submission

baseline_submission.to_csv('Final_WH.csv', index=False)

# Alpha Strategy by Insu

- pycaret으로 AutoML 을 이용

## Import Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import pandas_ta as ta
import pycaret
from pycaret.regression import setup, compare_models

## Import Data

- 베이스 파일 내의 `data` 폴더에 `train.csv` , `train_additional.csv` 파일이 첨부되어 있는지 확인!

In [6]:
train = pd.read_csv('./data/train.csv')
train.reset_index(drop = True)

Unnamed: 0,일자,종목코드,종목명,거래량,시가,고가,저가,종가
0,20210601,A060310,3S,166690,2890,2970,2885,2920
1,20210601,A095570,AJ네트웍스,63836,5860,5940,5750,5780
2,20210601,A006840,AK홀딩스,103691,35500,35600,34150,34400
3,20210601,A054620,APS,462544,14600,14950,13800,14950
4,20210601,A265520,AP시스템,131987,29150,29150,28800,29050
...,...,...,...,...,...,...,...,...
987995,20230530,A189980,흥국에프엔비,272284,3005,3035,2955,2980
987996,20230530,A000540,흥국화재,50218,3250,3255,3195,3215
987997,20230530,A003280,흥아해운,130664,1344,1395,1340,1370
987998,20230530,A037440,희림,141932,9170,9260,9170,9200


In [7]:
# Your function to calculate SMAPE
def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

In [8]:
# Initialize results DataFrame
results_df = pd.DataFrame(np.random.rand(15, 2000))
results_df.columns = train['종목코드'].unique()

# train 데이터에 존재하는 독립적인 종목코드 추출
unique_codes = train['종목코드'].unique()

preds_df_fin = pd.DataFrame()

In [10]:
results_df.index = ['2023-05-31', '2023-06-01', '2023-06-02', '2023-06-05', '2023-06-07',
                    
'2023-06-08', '2023-06-09', '2023-06-12', '2023-06-13', '2023-06-14',
 '2023-06-15', '2023-06-16', '2023-06-19', '2023-06-20', '2023-06-21']

## Train

In [11]:
# Iterate over each unique stock
for code in tqdm(unique_codes):

    df = train[train['종목코드'] == code][['일자', '시가', '고가', '저가',  '종가', '거래량']]
    df['일자'] = pd.to_datetime(df['일자'], format='%Y%m%d')
    df.set_index('일자', inplace=True)
    df.columns = ['Open', 'High', 'Low', 'Close', 'Volume']

    for sma_n in range(1, 5):
        df[f'SMA_{sma_n}'] = df.ta.sma(close='Close', length=sma_n)

    for ema_n in range(1, 5):
        df[f'EMA_{ema_n}'] = df.ta.ema(close='Close', length=ema_n)

    df['RSI_14'] = df.ta.rsi(close='Close', length=14)  # Relative Strength Index
    df['MACD'] = df.ta.macd(close='Close')['MACD_12_26_9']  # Moving Average Convergence Divergence
    df['ADX'] = df.ta.adx(high='High', low='Low', close='Close')['ADX_14']  # Average Directional Index
    df['ATR_14'] = df.ta.atr(high='High', low='Low', close='Close')  # Average True Range
    df['CCI_14'] = df.ta.cci(high='High', low='Low', close='Close')  # Commodity Channel Index
    df['ROC_10'] = df.ta.roc(close='Close')  # Rate of Change
    # For each day from 1 to 15
    values = []
    
    for day in tqdm(range(1, 16)):
        # Scale data
        X = df
        y = df['Close']

        scaler = MinMaxScaler(feature_range=(0, 1))
        data_scaled = scaler.fit_transform(X)
        data_scaled2 = y

        X_train = data_scaled[:-day]
        y_train = data_scaled2[day:]
        X_test = data_scaled[-day]

        X_train = pd.DataFrame(X_train)
        y_train = pd.DataFrame(y_train)
        X_test = pd.DataFrame(X_test).T
        
        X_train.columns = X.columns
        y_train.columns = ['Target']
        X_test.columns = X.columns
        
        X_train.index = y_train.index
        X_test.index = [X.index[-day]]
        
        train_set = pd.concat([X_train, y_train], axis = 1).dropna()
        reg = setup(data = train_set, target = 'Target')
    # compare all models and select top 5
        model = compare_models(n_select = 1, cross_validation = False, verbose = False)
        value = model.predict(X_test)
        values.append(value)
        
    results_df[code] = values

  0%|          | 0/2000 [00:00<?, ?it/s]

Unnamed: 0,Description,Value
0,Session id,240
1,Target,Target
2,Target type,Regression
3,Original data shape,"(466, 20)"
4,Transformed data shape,"(466, 20)"
5,Transformed train set shape,"(326, 20)"
6,Transformed test set shape,"(140, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,3355
1,Target,Target
2,Target type,Regression
3,Original data shape,"(465, 20)"
4,Transformed data shape,"(465, 20)"
5,Transformed train set shape,"(325, 20)"
6,Transformed test set shape,"(140, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,1329
1,Target,Target
2,Target type,Regression
3,Original data shape,"(464, 20)"
4,Transformed data shape,"(464, 20)"
5,Transformed train set shape,"(324, 20)"
6,Transformed test set shape,"(140, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,7465
1,Target,Target
2,Target type,Regression
3,Original data shape,"(463, 20)"
4,Transformed data shape,"(463, 20)"
5,Transformed train set shape,"(324, 20)"
6,Transformed test set shape,"(139, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,1572
1,Target,Target
2,Target type,Regression
3,Original data shape,"(462, 20)"
4,Transformed data shape,"(462, 20)"
5,Transformed train set shape,"(323, 20)"
6,Transformed test set shape,"(139, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,851
1,Target,Target
2,Target type,Regression
3,Original data shape,"(461, 20)"
4,Transformed data shape,"(461, 20)"
5,Transformed train set shape,"(322, 20)"
6,Transformed test set shape,"(139, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,6114
1,Target,Target
2,Target type,Regression
3,Original data shape,"(460, 20)"
4,Transformed data shape,"(460, 20)"
5,Transformed train set shape,"(322, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,6429
1,Target,Target
2,Target type,Regression
3,Original data shape,"(459, 20)"
4,Transformed data shape,"(459, 20)"
5,Transformed train set shape,"(321, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,2086
1,Target,Target
2,Target type,Regression
3,Original data shape,"(458, 20)"
4,Transformed data shape,"(458, 20)"
5,Transformed train set shape,"(320, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,2534
1,Target,Target
2,Target type,Regression
3,Original data shape,"(457, 20)"
4,Transformed data shape,"(457, 20)"
5,Transformed train set shape,"(319, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,6350
1,Target,Target
2,Target type,Regression
3,Original data shape,"(456, 20)"
4,Transformed data shape,"(456, 20)"
5,Transformed train set shape,"(319, 20)"
6,Transformed test set shape,"(137, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,6182
1,Target,Target
2,Target type,Regression
3,Original data shape,"(455, 20)"
4,Transformed data shape,"(455, 20)"
5,Transformed train set shape,"(318, 20)"
6,Transformed test set shape,"(137, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,6863
1,Target,Target
2,Target type,Regression
3,Original data shape,"(454, 20)"
4,Transformed data shape,"(454, 20)"
5,Transformed train set shape,"(317, 20)"
6,Transformed test set shape,"(137, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,6957
1,Target,Target
2,Target type,Regression
3,Original data shape,"(453, 20)"
4,Transformed data shape,"(453, 20)"
5,Transformed train set shape,"(317, 20)"
6,Transformed test set shape,"(136, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,4182
1,Target,Target
2,Target type,Regression
3,Original data shape,"(452, 20)"
4,Transformed data shape,"(452, 20)"
5,Transformed train set shape,"(316, 20)"
6,Transformed test set shape,"(136, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple


100%|██████████| 15/15 [02:20<00:00,  9.35s/it]
  0%|          | 1/2000 [02:20<77:53:39, 140.28s/it]

Unnamed: 0,Description,Value
0,Session id,2264
1,Target,Target
2,Target type,Regression
3,Original data shape,"(466, 20)"
4,Transformed data shape,"(466, 20)"
5,Transformed train set shape,"(326, 20)"
6,Transformed test set shape,"(140, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,1086
1,Target,Target
2,Target type,Regression
3,Original data shape,"(465, 20)"
4,Transformed data shape,"(465, 20)"
5,Transformed train set shape,"(325, 20)"
6,Transformed test set shape,"(140, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,3295
1,Target,Target
2,Target type,Regression
3,Original data shape,"(464, 20)"
4,Transformed data shape,"(464, 20)"
5,Transformed train set shape,"(324, 20)"
6,Transformed test set shape,"(140, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,4130
1,Target,Target
2,Target type,Regression
3,Original data shape,"(463, 20)"
4,Transformed data shape,"(463, 20)"
5,Transformed train set shape,"(324, 20)"
6,Transformed test set shape,"(139, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,8634
1,Target,Target
2,Target type,Regression
3,Original data shape,"(462, 20)"
4,Transformed data shape,"(462, 20)"
5,Transformed train set shape,"(323, 20)"
6,Transformed test set shape,"(139, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,5430
1,Target,Target
2,Target type,Regression
3,Original data shape,"(461, 20)"
4,Transformed data shape,"(461, 20)"
5,Transformed train set shape,"(322, 20)"
6,Transformed test set shape,"(139, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,6598
1,Target,Target
2,Target type,Regression
3,Original data shape,"(460, 20)"
4,Transformed data shape,"(460, 20)"
5,Transformed train set shape,"(322, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,4160
1,Target,Target
2,Target type,Regression
3,Original data shape,"(459, 20)"
4,Transformed data shape,"(459, 20)"
5,Transformed train set shape,"(321, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,892
1,Target,Target
2,Target type,Regression
3,Original data shape,"(458, 20)"
4,Transformed data shape,"(458, 20)"
5,Transformed train set shape,"(320, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,6633
1,Target,Target
2,Target type,Regression
3,Original data shape,"(457, 20)"
4,Transformed data shape,"(457, 20)"
5,Transformed train set shape,"(319, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,4948
1,Target,Target
2,Target type,Regression
3,Original data shape,"(456, 20)"
4,Transformed data shape,"(456, 20)"
5,Transformed train set shape,"(319, 20)"
6,Transformed test set shape,"(137, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,1947
1,Target,Target
2,Target type,Regression
3,Original data shape,"(455, 20)"
4,Transformed data shape,"(455, 20)"
5,Transformed train set shape,"(318, 20)"
6,Transformed test set shape,"(137, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,8241
1,Target,Target
2,Target type,Regression
3,Original data shape,"(454, 20)"
4,Transformed data shape,"(454, 20)"
5,Transformed train set shape,"(317, 20)"
6,Transformed test set shape,"(137, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,3323
1,Target,Target
2,Target type,Regression
3,Original data shape,"(453, 20)"
4,Transformed data shape,"(453, 20)"
5,Transformed train set shape,"(317, 20)"
6,Transformed test set shape,"(136, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,6370
1,Target,Target
2,Target type,Regression
3,Original data shape,"(452, 20)"
4,Transformed data shape,"(452, 20)"
5,Transformed train set shape,"(316, 20)"
6,Transformed test set shape,"(136, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple


100%|██████████| 15/15 [02:26<00:00,  9.74s/it]
  0%|          | 2/2000 [04:46<79:46:22, 143.73s/it]

Unnamed: 0,Description,Value
0,Session id,4482
1,Target,Target
2,Target type,Regression
3,Original data shape,"(466, 20)"
4,Transformed data shape,"(466, 20)"
5,Transformed train set shape,"(326, 20)"
6,Transformed test set shape,"(140, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,753
1,Target,Target
2,Target type,Regression
3,Original data shape,"(465, 20)"
4,Transformed data shape,"(465, 20)"
5,Transformed train set shape,"(325, 20)"
6,Transformed test set shape,"(140, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,4358
1,Target,Target
2,Target type,Regression
3,Original data shape,"(464, 20)"
4,Transformed data shape,"(464, 20)"
5,Transformed train set shape,"(324, 20)"
6,Transformed test set shape,"(140, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,4522
1,Target,Target
2,Target type,Regression
3,Original data shape,"(463, 20)"
4,Transformed data shape,"(463, 20)"
5,Transformed train set shape,"(324, 20)"
6,Transformed test set shape,"(139, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,1063
1,Target,Target
2,Target type,Regression
3,Original data shape,"(462, 20)"
4,Transformed data shape,"(462, 20)"
5,Transformed train set shape,"(323, 20)"
6,Transformed test set shape,"(139, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,8930
1,Target,Target
2,Target type,Regression
3,Original data shape,"(461, 20)"
4,Transformed data shape,"(461, 20)"
5,Transformed train set shape,"(322, 20)"
6,Transformed test set shape,"(139, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,2150
1,Target,Target
2,Target type,Regression
3,Original data shape,"(460, 20)"
4,Transformed data shape,"(460, 20)"
5,Transformed train set shape,"(322, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,3434
1,Target,Target
2,Target type,Regression
3,Original data shape,"(459, 20)"
4,Transformed data shape,"(459, 20)"
5,Transformed train set shape,"(321, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,4286
1,Target,Target
2,Target type,Regression
3,Original data shape,"(458, 20)"
4,Transformed data shape,"(458, 20)"
5,Transformed train set shape,"(320, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,2268
1,Target,Target
2,Target type,Regression
3,Original data shape,"(457, 20)"
4,Transformed data shape,"(457, 20)"
5,Transformed train set shape,"(319, 20)"
6,Transformed test set shape,"(138, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple




Unnamed: 0,Description,Value
0,Session id,557
1,Target,Target
2,Target type,Regression
3,Original data shape,"(456, 20)"
4,Transformed data shape,"(456, 20)"
5,Transformed train set shape,"(319, 20)"
6,Transformed test set shape,"(137, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple


 67%|██████▋   | 10/15 [01:45<00:52, 10.59s/it]
  0%|          | 2/2000 [06:32<108:54:06, 196.22s/it]


KeyboardInterrupt: 

In [None]:
final_df = pd.DataFrame(final)
final_values = pd.DataFrame((final_df.iloc[-1] - final_df.iloc[0]) / final_df.iloc[0])
final_values_sharpe = -pd.DataFrame(((final_df.iloc[-1] - final_df.iloc[0]) / final_df.iloc[0]) / final_df.pct_change().std())

In [None]:
final_values_sharpe

In [None]:
final_values_sharpe.index = unique_codes
final_values_sharpe.columns = ['VALUE']
final_values_sharpe.reset_index(inplace = True)
final_values_sharpe.columns = ['종목코드', 'VALUE']

In [None]:
# Submit
# Set the ranks for NaNs starting from 1800 and decreasing
nan_indices = np.setdiff1d(unique_codes, final_values_sharpe['종목코드'].values)

# Drop the rows with NaNs from the original DataFrame
baseline_submission = final_values_sharpe[~final_values_sharpe['종목코드'].isin(nan_indices)]
final_values_sharpe.sort_values('VALUE', ascending = False, inplace = True)
baseline_submission.sort_values('VALUE', ascending = False, inplace = True)

# Split the remaining rank range into two parts: before and after the ranks assigned to NaNs
before_nan_ranks = np.arange(1, 1801 - len(nan_indices))
after_nan_ranks = np.arange(1801, 2001)
nan_ranks = np.arange(1801 - len(nan_indices), 1801)

# Assign the remaining ranks
baseline_submission['순위'] = np.concatenate([before_nan_ranks, after_nan_ranks])
nan_vals = pd.DataFrame()
nan_vals['종목코드'] = nan_indices
nan_vals['VALUE'] = np.nan
nan_vals['순위'] = nan_ranks

# Insert the rows with NaNs (now ranked)
baseline_submission = pd.concat([baseline_submission, nan_vals])

# Sort the DataFrame by '순위'
baseline_submission = baseline_submission.sort_values(by='순위')
# Ensure '순위' is of integer type
baseline_submission['순위'] = baseline_submission['순위'].astype('int')
baseline_submission = baseline_submission.drop(['VALUE'], axis = 1)
sample_submission = pd.read_csv('./sample_submission.csv')
baseline_submission = sample_submission[['종목코드']].merge(baseline_submission[['종목코드', '순위']], on='종목코드', how='left')

In [None]:
baseline_submission.to_csv('./output/CIS.csv', index=False)

## Submission Ensemble (Voting System)

In [None]:
import pandas as pd
import numpy as np

import submission_util as subutil
import submission_config as subconfig

df1 = pd.read_csv(subconfig.OUTPUT_PATH / 'alpha_feat_insoo_lagged_ReverseSharpe-final.csv', index_col = 0)
df2 = pd.read_csv(subconfig.OUTPUT_PATH / 'CIS.csv', index_col = 0)
df3 = pd.read_csv(subconfig.OUTPUT_PATH / 'alpha_feat_insoo_lagged_ReverseSharpe_disparity_added_2.csv', index_col = 0)

In [None]:
df1.sort_index(inplace = True)
df2.sort_index(inplace = True)
df3.sort_index(inplace = True)

In [None]:
def rank_to_vote(rank):
    if rank <= 200:
        return 1
    elif rank >= 1801:
        return -1
    else:
        return 0

In [None]:
df1['vote'] = df1['순위'].apply(lambda x: rank_to_vote(x))
df2['vote'] = df2['순위'].apply(lambda x: rank_to_vote(x))
df3['vote'] = df3['순위'].apply(lambda x: rank_to_vote(x))

In [None]:
final = df1['vote'] + df2['vote'] + df3['vote']
final.index = [code[1:] for code in final.index]
final

## Submission

In [None]:
final_submission = subutil.Submission(final, 'final')

In [None]:
final_submission.get_rank(export_path=subconfig.OUTPUT_PATH)

- 이렇게 하면, output 폴더에 `final.csv` 파일이 생성됩니다.
- 이 `final.csv` 파일이 곧 '정답 파일' 입니다.

## 함께 사용된 Python 모듈

`submission_config.py`

In [None]:
from pathlib import Path

## Path configs

BASE_PATH = Path('.').resolve()
DATA_PATH = BASE_PATH / 'data'
OUTPUT_PATH = BASE_PATH / 'output'

krx_df_PATH = DATA_PATH / 'train.csv'
return_df_PATH = DATA_PATH / 'return_20140101_20230730.pkl'
adjclose_df_PATH = DATA_PATH / 'adjClose_20140101_20230730.pkl'
adjhigh_df_PATH = DATA_PATH / 'adjHigh_20140101_20230730.pkl'
adjlow_df_PATH = DATA_PATH / 'adjLow_20140101_20230730.pkl'
adjopen_df_PATH = DATA_PATH / 'adjOpen_20140101_20230730.pkl'
volume_df_PATH = DATA_PATH / 'volume_df_20140101_20230730.pkl'
dollarvolume_df_PATH = DATA_PATH / 'dollarvolume_df_20140101_20230730.pkl'
marketcap_df_PATH = DATA_PATH / 'marketcap_df_20140101_20230730.pkl'

## Param configs

# train (custom)
TRAIN_START = '2021-06-01'

# SimOS
PORTFOLIO_DATE = '2023-05-30' 
SIMOS_START = '2023-05-31'
SIMOS_END = '2023-06-21'

# RealOS
REALOS_PORTFOLIO_DATE = '2023-07-28' 
REALOS_START = '2023-07-31'

WINDOWS = {
    'rdvadv': 20,
}

`submission_util.py`

In [None]:
import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.metrics import (
    confusion_matrix, 
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    roc_curve, 
    auc
    )
# TODO: SimOS 에서의 정답을 알고있다. 그러므로 eval metric 계산할 수 있다. 

import submission_config as subconfig

## Params
DACON_SID_CNT = 2000
SIMOS_START = subconfig.SIMOS_START
SIMOS_END = subconfig.SIMOS_END

## Import data
krx_df = pd.read_csv(subconfig.krx_df_PATH)
adjclose_df = pd.read_pickle(subconfig.adjclose_df_PATH)
return_df = pd.read_pickle(subconfig.return_df_PATH)

def get_simos_data(return_df, adjclose_df):
    holidays = return_df.isnull().all(axis=1)
    tradingdays = ~holidays

    holidays = holidays.index[holidays]
    tradingdays = tradingdays.index[tradingdays]

    return_df = return_df.loc[tradingdays, :]
    adjclose_df = adjclose_df.loc[tradingdays, :]

    return_df = return_df.loc[SIMOS_START:SIMOS_END, :]
    adjclose_df = adjclose_df.loc[SIMOS_START:SIMOS_END, :]

    return return_df, adjclose_df

# TODO: Confusing if global variables are not capitalized
simos_return_df, simos_adjclose_df = get_simos_data(return_df, adjclose_df) # simos period, only trading days

## for filtering
def get_tradables(adjclose_df, trading_date=subconfig.PORTFOLIO_DATE):
    sid_list = adjclose_df.columns

    notnull = adjclose_df.loc[trading_date, :].notnull()
    notzero = adjclose_df.loc[trading_date, :] != 0

    return sid_list[notnull * notzero]

def is_tradables(sid_list, tradables):
    tradables = set(tradables)

    return np.array([True if sid in tradables else False for sid in sid_list])

def get_daconsids(krx_df):
    krx_df.columns = ['date', 'code', 'name', 'volume', 'open', 'high', 'low', 'close']
    dacon_sid_list = [ii[1:] for ii in krx_df['code'].unique()] # 060310 형식으로 바꿔줌

    return dacon_sid_list

def is_daconsids(sid_list, daconsids):
    daconsids = set(daconsids)

    return np.array([True if sid in daconsids else False for sid in sid_list])

class Submission:
    holding_return_s = (simos_adjclose_df.loc[SIMOS_END, :] - simos_adjclose_df.loc[SIMOS_START, :]).divide(simos_adjclose_df.loc[SIMOS_START, :])  
    holding_return_s = holding_return_s.fillna(0)

    # simos_winners = 
    # TODO: Add data science evaluation metrics

    # TODO: Make not-instance-specific variables to class variables
    def __init__(self, alpha_series:pd.Series, alpha_name:str, top=200, bottom=200):
        self.alpha_series = alpha_series
        self.alpha_name = alpha_name
        self.top = top
        self.bottom = bottom

        self.sid_list = self.alpha_series.index
        self.tradables = get_tradables(adjclose_df)
        self.daconsids = get_daconsids(krx_df)
    
        self.is_selectables = is_tradables(self.sid_list, self.tradables) * is_daconsids(self.sid_list, self.daconsids)
        self.submission_df = None
        self.alpha_winners = None
        self.alpha_losers = None

        # for excess return
        self.long_hpr = None
        self.short_hpr = None
        self.final_return = None

        # for variance
        self.long_returns = None
        self.short_returns = None
        
    def get_rank(self, export_path=None):
        selectables = self.alpha_series[self.is_selectables]
        top_s = selectables.nlargest(self.top)
        bottom_s = selectables.nsmallest(self.bottom)
        
        self.alpha_winners = top_s.index
        self.alpha_losers = bottom_s.index
        
        submission_df = pd.DataFrame(
            data={'rank': [-1]*DACON_SID_CNT},
            index=self.daconsids
        )
        submission_df.index.name = 'sid'

        submission_df['rank'][top_s.index] = np.arange(1, self.top+1)
        submission_df['rank'][bottom_s.index] = np.arange(DACON_SID_CNT, DACON_SID_CNT - self.bottom, -1)

        submission_df['rank'][submission_df['rank'] == -1] = np.arange(self.top+1, DACON_SID_CNT - self.bottom + 1)

        self.submission_df = submission_df

        if export_path:
            submission_df.index = ['A' + idx for idx in submission_df.index]
            submission_df.index.name = '종목코드'
            submission_df.columns = ['순위']
            submission_df.to_csv(export_path / f'{self.alpha_name}.csv', encoding='utf-8')
            
            print(f'Saved to {export_path / self.alpha_name}.csv')
            return submission_df

        return submission_df

    def get_excess_return(self, risk_free_rate=0.035, days_of_trading=15):
        self.long_hpr = Submission.holding_return_s[self.alpha_winners].sum()
        self.short_hpr = Submission.holding_return_s[self.alpha_losers].sum()

        self.final_return = (self.long_hpr - self.short_hpr) / 400

        annualized_final_return = self.final_return * 250 / days_of_trading
        excess_return = annualized_final_return - risk_free_rate

        return excess_return
    
    def get_volatility(self, days_of_trading=15):
        self.long_returns = simos_return_df.loc[:, self.alpha_winners].mean(axis=1)
        self.short_returns = simos_return_df.loc[:, self.alpha_losers].mean(axis=1)

        annualized_portfolio_returns = (self.long_returns - self.short_returns) / 2 * 250
        annualized_mean_returns = annualized_portfolio_returns.mean()
        
        annualized_portfolio_volatility = np.sqrt((annualized_portfolio_returns - annualized_mean_returns).pow(2)[2:].sum() / (days_of_trading-2))

        return annualized_portfolio_volatility

    def get_Sharpe(self):
        return self.get_excess_return() / self.get_volatility()

    
class Score:
    holding_return_s = (simos_adjclose_df.loc[SIMOS_END, :] - simos_adjclose_df.loc[SIMOS_START, :]).divide(simos_adjclose_df.loc[SIMOS_START, :])  
    holding_return_s = holding_return_s.fillna(0)

    def __init__(self, submission_csv_filepath, alpha_name, top=200, bottom=200, encoding='utf-8'):
        self.alpha_name = alpha_name
        self.top = top
        self.bottom = bottom

        with open(submission_csv_filepath, 'r', encoding=encoding) as f:
            submission_df = pd.read_csv(f, index_col=0)
        
        submission_df.index = [idx[1:] for idx in submission_df.index]
        submission_df.index.name = 'sid'
        submission_df.columns = ['rank']

        self.alpha_series = submission_df['rank']
        self.sid_list = self.alpha_series.index

        # TODO: Add validations

        self.submission_df = None
        self.alpha_winners = self.alpha_series.nsmallest(self.top).index
        self.alpha_losers = self.alpha_series.nlargest(self.bottom).index

        # for excess return
        self.long_hpr = None
        self.short_hpr = None
        self.final_return = None

        # for variance
        self.long_returns = None
        self.short_returns = None
    
    def get_excess_return(self, risk_free_rate=0.035, days_of_trading=15):
        self.long_hpr = Score.holding_return_s[self.alpha_winners].sum()
        self.short_hpr = Score.holding_return_s[self.alpha_losers].sum()

        self.final_return = (self.long_hpr - self.short_hpr) / 400

        annualized_final_return = self.final_return * 250 / days_of_trading
        excess_return = annualized_final_return - risk_free_rate

        return excess_return

    def get_volatility(self, days_of_trading=15):
        self.long_returns = simos_return_df.loc[:, self.alpha_winners].mean(axis=1)
        self.short_returns = simos_return_df.loc[:, self.alpha_losers].mean(axis=1)

        annualized_portfolio_returns = (self.long_returns - self.short_returns) / 2 * 250
        annualized_mean_returns = annualized_portfolio_returns.mean()
        
        annualized_portfolio_volatility = np.sqrt((annualized_portfolio_returns - annualized_mean_returns).pow(2)[2:].sum() / (days_of_trading-2))

        return annualized_portfolio_volatility

    def get_Sharpe(self):
        sharpe = self.get_excess_return() / self.get_volatility()
        print(f'Sharpe of {self.alpha_name}: {sharpe}')

        return sharpe