## Import

In [2]:
import pandas as pd
import numpy as np
import random
import os
import pmdarima as pm
from pmdarima.arima import ndiffs

from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA
import itertools

import warnings
warnings.filterwarnings("ignore")

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Data Load

In [73]:
train = pd.read_csv('./train.csv')
train.drop(train[(train['일자'] < 20220101)].index, inplace=True)
print(train)

              일자     종목코드     종목명      거래량     시가     고가     저가     종가
294000  20220103  A060310      3S   114207   3325   3370   3260   3300
294001  20220103  A095570  AJ네트웍스   171188   5410   5470   5300   5320
294002  20220103  A006840   AK홀딩스    12856  19550  19600  19250  19550
294003  20220103  A054620     APS   275806  14500  14750  13600  14000
294004  20220103  A265520   AP시스템   123779  25200  25250  24650  25150
...          ...      ...     ...      ...    ...    ...    ...    ...
987995  20230530  A189980  흥국에프엔비   272284   3005   3035   2955   2980
987996  20230530  A000540    흥국화재    50218   3250   3255   3195   3215
987997  20230530  A003280    흥아해운   130664   1344   1395   1340   1370
987998  20230530  A037440      희림   141932   9170   9260   9170   9200
987999  20230530  A238490      힘스  2611843   6410   8220   6300   8220

[694000 rows x 8 columns]


## Model Define, Train and Inference

In [87]:
# 추론 결과를 저장하기 위한 dataframe 생성
results_df = pd.DataFrame(columns=['종목코드', 'final_return'])

# train 데이터에 존재하는 독립적인 종목코드 추출
unique_codes = train['종목코드'].unique()

for code in tqdm(unique_codes):
    # 학습 데이터 생성
    train_close = train[train['종목코드'] == code][['일자', '종가']]
    train_close['일자'] = pd.to_datetime(train_close['일자'], format='%Y%m%d')
    train_close.set_index('일자', inplace=True)
    tc = train_close['종가']
    """
    n_diffs = ndiffs(tc, alpha=0.05, test='adf', max_d=6)
    model = pm.auto_arima(
            y=tc, 
            d=n_diffs, 
            start_p=0, max_p=3, 
            start_q=0, max_q=3, 
            m=1, seasonal=False,
            # seasonal 여부도 확인할 수 있으면 좋을텐뎁;
            #stepwise=False,
            #trace=True
    )
    
    model_str = str(model)
    pdq = model_str.split(')')[0].split('(')[1].split(',')
    _order = [int(pdq[0]), int(pdq[1]), int(pdq[2])]
    print(_order)
    """
    #model = ARIMA(tc, order=_order)
    model = ARIMA(tc, order=[1,1,2])
    model_fit = model.fit()
    predictions = model_fit.forecast(steps=15) # 향후 15개의 거래일에 대해서 예측
    print(predictions)

    break
    # 최종 수익률 계산
    #final_return = predictions.iloc[-1] / predictions.iloc[0]

    # 결과 저장
    #new_df = pd.DataFrame({'종목코드':[code], "final_return":[final_return]})
    #results_df = pd.concat([results_df, new_df], ignore_index=True)



  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

[0, 1, 0]
347    2986.327625
348    2955.768423
349    2927.853312
350    2902.353514
351    2879.060049
352    2857.782017
353    2838.345035
354    2820.589809
355    2804.370828
356    2789.555171
357    2776.021416
358    2763.658649
359    2752.365553
360    2742.049575
361    2732.626172
Name: predicted_mean, dtype: float64





In [57]:
results_df['순위'] = results_df['final_return'].rank(method='first', ascending=False).astype('int') # 각 순위를 중복없이 생성
results_df

Unnamed: 0,종목코드,final_return,순위
0,A060310,1.000000,419
1,A095570,1.000000,420
2,A006840,0.999193,1832
3,A054620,1.000000,421
4,A265520,1.000000,422
...,...,...,...
1995,A189980,1.000328,347
1996,A000540,1.000000,1717
1997,A003280,1.000000,1718
1998,A037440,0.968124,1986


In [58]:
predictions

494    8220.0
495    8220.0
496    8220.0
497    8220.0
498    8220.0
499    8220.0
500    8220.0
501    8220.0
502    8220.0
503    8220.0
504    8220.0
505    8220.0
506    8220.0
507    8220.0
508    8220.0
Name: predicted_mean, dtype: float64

In [55]:
results_df

Unnamed: 0,종목코드,final_return,순위
0,A060310,0.000000,419
1,A095570,0.000000,420
2,A006840,-0.000807,1832
3,A054620,0.000000,421
4,A265520,0.000000,422
...,...,...,...
1995,A189980,0.000328,347
1996,A000540,0.000000,1717
1997,A003280,0.000000,1718
1998,A037440,-0.031876,1986


## Submit

In [14]:
sample_submission = pd.read_csv('./submission/sample_submission.csv')
sample_submission

Unnamed: 0,종목코드,순위
0,A000020,1
1,A000040,2
2,A000050,3
3,A000070,4
4,A000080,5
...,...,...
1995,A375500,1996
1996,A378850,1997
1997,A383220,1998
1998,A383310,1999


In [15]:
submission = sample_submission[['종목코드']].merge(results_df[['종목코드', '순위']], on='종목코드', how='left')
submission

Unnamed: 0,종목코드,순위
0,A000020,1364
1,A000040,1816
2,A000050,597
3,A000070,772
4,A000080,1157
...,...,...
1995,A375500,509
1996,A378850,1587
1997,A383220,512
1998,A383310,1265


In [16]:
submission.to_csv('./submission/submission0717_2.csv', index=False)