In [2]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [4]:
train = pd.read_csv('./train.csv')
results_df = pd.DataFrame(columns=['종목코드', 'final_return'])

In [5]:

# train 데이터에 존재하는 독립적인 종목코드 추출
unique_codes = train['종목코드'].unique()

In [6]:
# 과거 데이터를 기반으로 데이터프레임을 구성하는 함수
def create_features(df, n_days):
    df_new = df.copy()
    for i in range(1, n_days+1):
        df_new[f'종가_lag_{i}'] = df_new['종가'].shift(i)
    df_new.dropna(inplace=True)
    return df_new


In [9]:
# 각 종목코드에 대해서 모델 학습 및 추론 반복
for code in tqdm(unique_codes):
    
    # 학습 데이터 생성
    train_data = train[train['종목코드'] == code][['일자', '종가']]
    train_data['일자'] = pd.to_datetime(train_data['일자'], format='%Y%m%d')
    train_data.set_index('일자', inplace=True)
    
    # 과거 5일의 데이터를 특성으로 사용
    train_data = create_features(train_data, 5)
    X = train_data.drop('종가', axis=1).values
    y = train_data['종가'].values
    
    # 데이터 분할
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    # 모델 선언 및 학습
    model = XGBRegressor(n_estimators=100, learning_rate=0.1)
    model.fit(X_train, y_train)
    
    # 향후 15개의 거래일에 대해 추론
    predictions = []
    X_last = X_test[-1, :]
    for _ in range(15):
        pred = model.predict(X_last.reshape(1, -1))
        predictions.append(pred[0])
        X_last = np.roll(X_last, -1)
        X_last[-1] = pred

    # 최종 수익률 계산
    final_return = (predictions[-1] - predictions[0]) / predictions[0]
    
    # 결과 저장
    results_df = pd.concat([results_df, pd.DataFrame({'종목코드': [code], 'final_return': [final_return]})], ignore_index=True)




100%|██████████| 2000/2000 [08:31<00:00,  3.91it/s]


In [17]:
# 각 순위를 중복없이 생성
results_df['순위'] = results_df['final_return'].rank(method='first', ascending=False).astype('int')
results_df

Unnamed: 0,종목코드,final_return,순위
0,A060310,-0.290423,1994
1,A095570,0.011021,878
2,A006840,0.086066,104
3,A054620,-0.004418,1432
4,A265520,0.009872,904
...,...,...,...
1995,A189980,-0.002342,1387
1996,A000540,-0.001649,1366
1997,A003280,-0.003236,1404
1998,A037440,0.108953,67


In [21]:
# 주식별로 15일 동안의 예상 수익률 계산
results_df['expected_return'] = results_df['final_return'].apply(lambda x: (x + 1) ** (1/15) - 1)

# 상위 200개 주식과 하위 200개 주식 선택
top_200 = results_df.sort_values(by='final_return', ascending=False).head(200)
bottom_200 = results_df.sort_values(by='final_return', ascending=False).tail(200)

# 롱 포지션과 숏 포지션에 대한 예상 수익률 계산
long_return = top_200['expected_return'].mean()
short_return = bottom_200['expected_return'].mean()

# 포트폴리오 전체의 예상 수익률 계산
portfolio_return = (long_return * 200 - short_return * 200) / 400

# 롱 포지션과 숏 포지션에 대한 수익률의 표준편차 계산
long_std = top_200['expected_return'].std()
short_std = bottom_200['expected_return'].std()

# 포트폴리오 전체의 수익률 표준편차 계산
portfolio_std = ((long_std ** 2 * 200 + short_std ** 2 * 200) / 400) ** 0.5

# 샤프 비율 계산
sharpe_ratio = portfolio_return / portfolio_std

sharpe_ratio

1.1290926290437062

In [18]:
sample_submission = pd.read_csv('./sample_submission.csv')
sample_submission

Unnamed: 0,종목코드,순위
0,A000020,1
1,A000040,2
2,A000050,3
3,A000070,4
4,A000080,5
...,...,...
1995,A375500,1996
1996,A378850,1997
1997,A383220,1998
1998,A383310,1999


In [19]:
baseline_submission = sample_submission[['종목코드']].merge(results_df[['종목코드', '순위']], on='종목코드', how='left')
baseline_submission

Unnamed: 0,종목코드,순위
0,A000020,1713
1,A000040,1512
2,A000050,674
3,A000070,592
4,A000080,1322
...,...,...
1995,A375500,881
1996,A378850,130
1997,A383220,329
1998,A383310,1688


In [20]:
baseline_submission.to_csv('GB_submission.csv', index=False)