In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from prophet import Prophet
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import pandas_ta as ta


import warnings

# Suppress Python warnings
warnings.simplefilter('ignore')

In [9]:
# Read the training CSV (path is relative to the notebook's working directory).
# The preview below shows its columns: date (일자), stock code (종목코드),
# stock name (종목명), volume (거래량) and open/high/low/close prices
# (시가/고가/저가/종가).
train = pd.read_csv('./data/train.csv')

In [10]:
# Bare last expression: Jupyter renders the frame as the cell output.
train

Unnamed: 0,일자,종목코드,종목명,거래량,시가,고가,저가,종가
0,20210601,A060310,3S,166690,2890,2970,2885,2920
1,20210601,A095570,AJ네트웍스,63836,5860,5940,5750,5780
2,20210601,A006840,AK홀딩스,103691,35500,35600,34150,34400
3,20210601,A054620,APS,462544,14600,14950,13800,14950
4,20210601,A265520,AP시스템,131987,29150,29150,28800,29050
...,...,...,...,...,...,...,...,...
987995,20230530,A189980,흥국에프엔비,272284,3005,3035,2955,2980
987996,20230530,A000540,흥국화재,50218,3250,3255,3195,3215
987997,20230530,A003280,흥아해운,130664,1344,1395,1340,1370
987998,20230530,A037440,희림,141932,9170,9260,9170,9200


In [11]:
# Create the dataframe that will hold the per-stock results
# (one row per stock code with its predicted return).
results_df = pd.DataFrame(columns=['종목코드', 'final_return'])

# Extract the distinct stock codes present in the train data
# (`unique()` keeps them in order of first appearance).
unique_codes = train['종목코드'].unique()

In [12]:
def compute_rsi(data, window=14):
    """Relative Strength Index built from simple rolling means.

    Parameters
    ----------
    data : pandas.Series
        Series of prices; consecutive differences are taken in row order.
    window : int, default 14
        Length of the rolling window. Partial windows at the start are
        averaged as well (``min_periods=1``).

    Returns
    -------
    pandas.Series
        ``100 - 100 / (1 + mean_gain / mean_loss)``, aligned with ``data``.
        The result is NaN wherever both rolling means are zero (this always
        includes the first row) and 100 where only the mean loss is zero.
    """
    def _rolling_mean(series):
        return series.rolling(window=window, min_periods=1).mean()

    change = data.diff()
    is_gain = change > 0
    is_loss = change < 0

    # Rows that do not match (including the leading NaN difference) count as 0.
    mean_gain = _rolling_mean(change.where(is_gain, 0))
    mean_loss = _rolling_mean(-change.where(is_loss, 0))

    relative_strength = mean_gain / mean_loss
    return 100 - (100 / (1 + relative_strength))


In [13]:
def compute_atr(data, window=14):
    """Average True Range as a simple rolling mean of the true range.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns '고가' (high), '저가' (low) and '종가' (close).
    window : int, default 14
        Rolling window length; rows with fewer than ``window`` observations
        yield NaN.

    Returns
    -------
    pandas.Series
        Rolling mean of the per-row true range, i.e. the largest of
        high - low, |high - previous close| and |low - previous close|.
    """
    previous_close = data['종가'].shift()
    candidates = [
        data['고가'] - data['저가'],
        (data['고가'] - previous_close).abs(),
        (data['저가'] - previous_close).abs(),
    ]
    # The row-wise max skips NaN, so the first row (no previous close)
    # falls back to high - low.
    true_range = pd.concat(candidates, axis=1).max(axis=1)
    return true_range.rolling(window=window).mean()

In [14]:
def create_features(df, n_days):
    """Return a copy of ``df`` extended with lag and indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns '종가', '거래량', '시가', '고가', '저가'.
        It is not modified.
    n_days : int
        Number of lags to add per column (``<column>_lag_1`` ..
        ``<column>_lag_<n_days>``).

    Returns
    -------
    pandas.DataFrame
        The input columns, followed by the lag columns, then 'rsi_14' and
        'atr_14'. Every row containing a NaN in any column is dropped.
    """
    base_columns = ['종가', '거래량', '시가', '고가', '저가']
    lagged = {
        f'{column}_lag_{lag}': df[column].shift(lag)
        for column in base_columns
        for lag in range(1, n_days + 1)
    }
    # assign() returns a new frame, so the caller's frame stays untouched.
    enriched = df.assign(**lagged)
    enriched['rsi_14'] = compute_rsi(enriched['종가'])
    enriched['atr_14'] = compute_atr(enriched)
    return enriched.dropna()

In [15]:
# Per-stock ensemble forecast: Prophet on the close series plus an XGBoost
# model on lagged features, averaged into one expected return per stock code.
#
# Fixes relative to the previous version of this cell:
# * Rows are collected in a list and turned into a DataFrame once;
#   DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
# * The XGBoost model is trained on lag columns only. Same-day volume/open/
#   high/low and the indicators derived from the same-day close are not known
#   for future days, so using them leaked the target into the features.
# * The recursive forecast updates the lag window by column instead of
#   np.roll-ing the whole feature vector, which shifted every feature into the
#   wrong column and wrote the prediction into the last (atr_14) slot.
# * The frame is grouped once instead of being re-filtered for every code.
price_columns = ['종가', '거래량', '시가', '고가', '저가']
n_lags = 5
horizon = 15
close_pos = price_columns.index('종가')
# Lag column names as produced by create_features, grouped per base column.
lag_columns = [
    f'{column}_lag_{lag}'
    for column in price_columns
    for lag in range(1, n_lags + 1)
]
rows_by_code = train.groupby('종목코드', sort=False)
records = []

for code in tqdm(unique_codes):

    # Build the training frame for this stock. The explicit copy avoids
    # mutating a slice of `train` (SettingWithCopy).
    train_data = rows_by_code.get_group(code)[['일자'] + price_columns].copy()
    train_data['일자'] = pd.to_datetime(train_data['일자'].astype(str), format='%Y%m%d')
    train_data = train_data.set_index('일자')
    train_data = create_features(train_data, n_lags)

    # Prophet: fit on the close series and forecast `horizon` periods ahead.
    # NOTE(review): make_future_dataframe defaults to daily frequency, so this
    # horizon is counted in calendar days while the XGBoost loop below counts
    # rows of the training data - confirm which horizon is intended.
    df_prophet = train_data[['종가']].reset_index().rename(columns={'일자': 'ds', '종가': 'y'})
    model_prophet = Prophet()
    model_prophet.fit(df_prophet)
    future = model_prophet.make_future_dataframe(periods=horizon)
    yhat = model_prophet.predict(future)['yhat'].to_numpy()
    first_future, last_future = yhat[-horizon], yhat[-1]
    final_return_prophet = (last_future - first_future) / first_future

    # Gradient boosting: learn the close from the lagged columns only.
    X = train_data[lag_columns].to_numpy(dtype=float)
    y = train_data['종가'].to_numpy(dtype=float)
    model_gb = XGBRegressor(n_estimators=100, learning_rate=0.1)
    model_gb.fit(X, y)

    # `recent` has one row per base column and one column per lag, newest
    # first. The last training row holds the current value and lags
    # 1..n_lags-1, which are exactly lags 1..n_lags of the first future step.
    last_row = train_data.iloc[-1]
    recent = np.array(
        [
            [last_row[column]]
            + [last_row[f'{column}_lag_{lag}'] for lag in range(1, n_lags)]
            for column in price_columns
        ],
        dtype=float,
    )

    predictions = []
    for _ in range(horizon):
        # Row-major flattening reproduces the order of `lag_columns`.
        pred = float(model_gb.predict(recent.reshape(1, -1))[0])
        predictions.append(pred)
        # Age every series by one step. Future volume/open/high/low are
        # unknown, so they are carried forward at their latest value; only
        # the close receives the new prediction.
        recent = np.column_stack([recent[:, 0], recent[:, :-1]])
        recent[close_pos, 0] = pred
    final_return_gb = (predictions[-1] - predictions[0]) / predictions[0]

    # Ensemble: plain average of the two model returns.
    final_return = (final_return_prophet + final_return_gb) / 2

    records.append({'종목코드': code, 'final_return': final_return})

# Build the result frame once, with the same columns as before.
results_df = pd.DataFrame(records, columns=['종목코드', 'final_return'])

# Rank stocks by predicted return, best first (rank 1 = highest return).
# The float cast gives rank() a numeric column even if the frame was filled
# with Python objects, and na_option='bottom' sends any NaN return to the
# last ranks so the integer cast cannot fail on missing values.
results_df['순위'] = (
    results_df['final_return']
    .astype(float)
    .rank(method='first', ascending=False, na_option='bottom')
    .astype('int')
)

# Submission template; only its stock-code column is used.
sample_submission = pd.read_csv('baseline_submission.csv')

baseline_submission = sample_submission[['종목코드']].merge(
    results_df[['종목코드', '순위']], on='종목코드', how='left'
)
# A code missing from results_df leaves a NaN after the left merge. The
# nullable integer dtype keeps ranks written as integers (not "12.0") in
# that case and leaves the output unchanged when every code is matched.
baseline_submission['순위'] = baseline_submission['순위'].astype('Int64')

baseline_submission.to_csv('Final_WH.csv', index=False)

# Only the last expression of a cell is rendered, so preview once here
# instead of leaving bare expressions in the middle of the cell.
baseline_submission.head()

  0%|          | 0/2000 [00:00<?, ?it/s]


AttributeError: 'Prophet' object has no attribute 'stan_backend'