## Import

In [5]:
import pandas as pd
import numpy as np
import random
import os

from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA
import itertools

import warnings
warnings.filterwarnings("ignore")

In [6]:
def seed_everything(seed):
    """Apply one seed to every source of randomness this notebook touches.

    Sets the ``PYTHONHASHSEED`` environment variable, seeds NumPy's global
    random generator, and seeds Python's built-in ``random`` module.

    Args:
        seed: Integer seed value; it is stringified for the environment
            variable and passed as-is to both generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)

seed_everything(42)  # fix the seed so repeated runs are reproducible

## Data Load

In [7]:
# Load the training table from the working directory; later cells read its
# '종목코드', '일자' and '종가' columns.
train = pd.read_csv(filepath_or_buffer='./train.csv')

## Model Definition, Training, and Inference

In [18]:
# Empty frame that will hold the inference results (one row per stock code)
results_df = pd.DataFrame(columns=['종목코드', 'final_return'])

# Distinct stock codes present in the training data
unique_codes = train['종목코드'].unique()

# Candidate ARIMA orders: every (p, d, q) combination of the ranges below
p = range(0, 3)
d = range(1, 3)
q = range(0, 3)
pdq = list(itertools.product(p, d, q))

# Win counter per candidate order. It is derived from `pdq` (same key order as
# itertools.product yields) so the counter can never drift out of sync with
# the search ranges when they are edited.
pdqN = {order: 0 for order in pdq}

# For every stock code, fit each candidate order and credit the one with the
# lowest AIC.
for code in tqdm(unique_codes):

    # Closing-price series of this code, indexed by trading date.
    # .loc + .copy() yields an independent frame, so the column assignment
    # below is not a chained assignment on a slice of `train`.
    train_close = train.loc[train['종목코드'] == code, ['일자', '종가']].copy()
    train_close['일자'] = pd.to_datetime(train_close['일자'], format='%Y%m%d')
    train_close = train_close.set_index('일자')
    tc = train_close['종가']

    # AIC of each candidate order, rounded to 2 decimals (same position as in `pdq`)
    AIC = [round(ARIMA(tc, order=order).fit().aic, 2) for order in pdq]

    # Lowest AIC wins. min() returns the first minimum it meets, so ties go to
    # the earliest order in `pdq`.
    best_order, best_aic = min(zip(pdq, AIC), key=lambda pair: pair[1])
    pdqN[best_order] += 1



100%|██████████| 2000/2000 [14:48<00:00,  2.25it/s]

{(0, 1, 0): 592, (0, 1, 1): 134, (0, 2, 0): 0, (0, 2, 1): 490, (1, 1, 0): 132, (1, 1, 1): 356, (1, 2, 0): 0, (1, 2, 1): 296}





In [19]:
print(pdqN)

{(0, 1, 0): 592, (0, 1, 1): 134, (0, 2, 0): 0, (0, 2, 1): 490, (1, 1, 0): 132, (1, 1, 1): 356, (1, 2, 0): 0, (1, 2, 1): 296}


In [20]:
# ARIMA order used for every stock code. The original note here said to try
# the order with the largest count in `pdqN`; the value is hard-coded.
# NOTE(review): an ARIMA(0, 1, 0) model fitted without a drift/trend term is
# expected to forecast a constant level, which would make every final_return
# below 0 and the subsequent ranking arbitrary -- confirm against the
# statsmodels default `trend` for integrated models before submitting.
FORECAST_ORDER = (0, 1, 0)
FORECAST_STEPS = 15  # number of future trading days to forecast

# Collect one record per stock code and build the frame once after the loop.
# This avoids quadratic pd.concat growth and makes the cell idempotent:
# re-running it (e.g. after an interrupted run) replaces `results_df` instead
# of appending a second set of rows for the same codes.
records = []
for code in tqdm(unique_codes):

    # Closing-price series of this code, indexed by trading date.
    # .loc + .copy() yields an independent frame, so the column assignment
    # below is not a chained assignment on a slice of `train`.
    train_close = train.loc[train['종목코드'] == code, ['일자', '종가']].copy()
    train_close['일자'] = pd.to_datetime(train_close['일자'], format='%Y%m%d')
    train_close = train_close.set_index('일자')
    tc = train_close['종가']

    model = ARIMA(tc, order=FORECAST_ORDER)
    model_fit = model.fit()
    predictions = model_fit.forecast(steps=FORECAST_STEPS)

    # Final return: relative change from the first to the last forecast value
    final_return = (predictions.iloc[-1] - predictions.iloc[0]) / predictions.iloc[0]

    records.append({'종목코드': code, 'final_return': final_return})

# Explicit columns keep the schema stable even if no record was produced
results_df = pd.DataFrame(records, columns=['종목코드', 'final_return'])

 13%|█▎        | 258/2000 [00:20<02:16, 12.74it/s]


KeyboardInterrupt: 

In [11]:
# Rank the codes by predicted return, highest return first. method='first'
# breaks ties by row order, so every code receives a distinct rank.
return_rank = results_df['final_return'].rank(ascending=False, method='first')
results_df['순위'] = return_rank.astype('int')
results_df

Unnamed: 0,종목코드,final_return,순위
0,A060310,0.001830,296
1,A095570,-0.016845,1757
2,A006840,-0.000807,1378
3,A054620,0.000000,490
4,A265520,0.000000,491
...,...,...,...
1995,A189980,-0.009033,1607
1996,A000540,-0.012311,1678
1997,A003280,0.022652,71
1998,A037440,0.000000,1239


## Submit

In [14]:
# Load the submission template; the merge cell below takes its '종목코드' column
sample_submission = pd.read_csv(filepath_or_buffer='./submission/sample_submission.csv')
sample_submission

Unnamed: 0,종목코드,순위
0,A000020,1
1,A000040,2
2,A000050,3
3,A000070,4
4,A000080,5
...,...,...
1995,A375500,1996
1996,A378850,1997
1997,A383220,1998
1998,A383310,1999


In [15]:
# Keep the template's code order and attach each code's rank; the left join
# leaves the rank missing for any code absent from results_df.
template_codes = sample_submission[['종목코드']]
rank_by_code = results_df[['종목코드', '순위']]
submission = pd.merge(template_codes, rank_by_code, how='left', on='종목코드')
submission

Unnamed: 0,종목코드,순위
0,A000020,1364
1,A000040,1816
2,A000050,597
3,A000070,772
4,A000080,1157
...,...,...
1995,A375500,509
1996,A378850,1587
1997,A383220,512
1998,A383310,1265


In [16]:
# Write the submission file without the DataFrame index column
submission.to_csv(path_or_buf='./submission/submission0717_2.csv', index=False)