In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


# Load the training table from the relative `./open/` folder.
# Later cells read the columns '종목코드', '일자', '거래량', '시가', '고가', '저가' and '종가' from it.
train = pd.read_csv('./open/train.csv')


In [None]:
# Symmetric mean absolute percentage error (SMAPE)
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``2 * |y_pred - y_true| / (|y_true| + |y_pred|)``;
    the contributions are summed, divided by ``len(y_true)`` and scaled by 100.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, compared with ``y_true`` position by position.

    Returns
    -------
    float
        SMAPE in the range [0, 200]. ``nan`` when the input is empty.

    Notes
    -----
    A pair where both the observation and the prediction are exactly 0 is a
    perfect prediction and contributes 0 (instead of the 0/0 = NaN that would
    otherwise turn the whole score into NaN).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Nothing to average: report "undefined" rather than dividing by zero.
    if y_true.size == 0:
        return float('nan')

    numerator = 2.0 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Only divide where the denominator is non-zero; the remaining entries
    # keep the 0.0 they were initialised with.
    ratios = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100 / len(y_true) * np.sum(ratios)

In [None]:

# Initialize results DataFrame (not used by the cells visible in this notebook; kept for compatibility)
results_df = pd.DataFrame(columns=['종목코드'] + ['return_day_' + str(i) for i in range(1, 16)])

# Extract the distinct stock codes present in the training data
unique_codes = train['종목코드'].unique()

# Tunables for the per-stock, per-horizon models
FEATURE_COLUMNS = ['거래량', '시가', '고가', '저가', '종가']
TARGET_COLUMN = '종가'
HORIZONS = range(1, 16)   # forecast 1..15 rows ahead
TRAIN_FRACTION = 0.9      # first 90% of the aligned pairs train, the rest validate

# CatBoost is imported but never trained in this cell; the empty frames are
# kept so any code that references these names keeps working.
preds_df_fin_catboost = pd.DataFrame()
smapes_df_fin_catboost = pd.DataFrame()


def _fit_validate_forecast(model, X_train, y_train, X_val, y_val, X_latest):
    """Fit ``model``, then return (validation SMAPE, forecast for ``X_latest``)."""
    model.fit(X_train, y_train)
    val_smape = smape(y_val, model.predict(X_val))
    forecast = float(model.predict(X_latest)[0])
    return val_smape, forecast


# One list of len(HORIZONS) values per stock code; turned into DataFrames once
# at the end instead of growing DataFrames with pd.concat inside the loops.
preds_by_code_xgb, smapes_by_code_xgb = {}, {}
preds_by_code_lgbm, smapes_by_code_lgbm = {}, {}

# Iterate over each unique stock
for code in tqdm(unique_codes):

    # Filter by stock code; .copy() so the column assignment below does not
    # write into a slice of `train`.
    train_close = train.loc[train['종목코드'] == code, ['일자'] + FEATURE_COLUMNS].copy()
    train_close['일자'] = pd.to_datetime(train_close['일자'], format='%Y%m%d')
    # The horizon shift below is positional, so rows must be in date order.
    train_close = train_close.set_index('일자').sort_index()

    # Scaling does not depend on the horizon, so do it once per stock.
    # NOTE(review): the scaler is fitted on all rows, including the validation tail.
    scaler = MinMaxScaler(feature_range=(-1, 1))
    data_scaled = scaler.fit_transform(train_close)
    close = train_close[TARGET_COLUMN]

    # Features of the most recent row: the starting point of every forecast.
    latest_features = data_scaled[-1:]

    stock_preds_xgb, stock_smapes_xgb = [], []
    stock_preds_lgbm, stock_smapes_lgbm = [], []

    # For each day from 1 to 15
    for day in HORIZONS:
        # Pair the features of row t with the close of row t + day.
        X_all = data_scaled[:-day]
        y_all = close.iloc[day:]

        # Chronological split. Both parts are cut from the full aligned set so
        # the validation rows are genuinely unseen during training.
        split = int(len(X_all) * TRAIN_FRACTION)
        if split == 0:
            # Too little history for this horizon: record "no forecast".
            stock_preds_xgb.append(np.nan)
            stock_smapes_xgb.append(np.nan)
            stock_preds_lgbm.append(np.nan)
            stock_smapes_lgbm.append(np.nan)
            continue

        X_train, X_val = X_all[:split], X_all[split:]
        y_train, y_val = y_all.iloc[:split], y_all.iloc[split:]

        # Train XGBoost
        val_smape, forecast = _fit_validate_forecast(
            XGBRegressor(), X_train, y_train, X_val, y_val, latest_features)
        stock_smapes_xgb.append(val_smape)
        stock_preds_xgb.append(forecast)

        # Train LightGBM
        val_smape, forecast = _fit_validate_forecast(
            LGBMRegressor(), X_train, y_train, X_val, y_val, latest_features)
        stock_smapes_lgbm.append(val_smape)
        stock_preds_lgbm.append(forecast)

    preds_by_code_xgb[code] = stock_preds_xgb
    smapes_by_code_xgb[code] = stock_smapes_xgb
    preds_by_code_lgbm[code] = stock_preds_lgbm
    smapes_by_code_lgbm[code] = stock_smapes_lgbm

# Rows = horizons (in HORIZONS order), columns = stock codes (in unique_codes order)
preds_df_fin_xgb = pd.DataFrame(preds_by_code_xgb)
smapes_df_fin_xgb = pd.DataFrame(smapes_by_code_xgb)

preds_df_fin_lgbm = pd.DataFrame(preds_by_code_lgbm)
smapes_df_fin_lgbm = pd.DataFrame(smapes_by_code_lgbm)

100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [39:13<00:00,  1.18s/it]


In [None]:
# Blend the XGBoost and LightGBM forecasts cell by cell, weighting each model
# by the inverse of its validation SMAPE (lower error -> larger weight).
smape_xgb = smapes_df_fin_xgb.to_numpy(dtype=float)
smape_lgbm = smapes_df_fin_lgbm.to_numpy(dtype=float)
pred_xgb = preds_df_fin_xgb.to_numpy(dtype=float)
pred_lgbm = preds_df_fin_lgbm.to_numpy(dtype=float)

if not (smape_xgb.shape == smape_lgbm.shape == pred_xgb.shape == pred_lgbm.shape):
    raise ValueError(
        "prediction and SMAPE tables must share one shape, got "
        f"{pred_xgb.shape}, {pred_lgbm.shape}, {smape_xgb.shape}, {smape_lgbm.shape}"
    )

# Normalised inverse-error weights:
#   w_xgb = (1/s_xgb) / (1/s_xgb + 1/s_lgbm) = s_lgbm / (s_xgb + s_lgbm)
# Written in the second form so that a SMAPE of exactly 0 gives that model
# weight 1 instead of inf/inf = NaN. If both are 0, use equal weights.
# A NaN SMAPE still propagates to a NaN blended value.
smape_total = smape_xgb + smape_lgbm
with np.errstate(divide='ignore', invalid='ignore'):
    weight_xgb = np.where(smape_total == 0, 0.5, smape_lgbm / smape_total)
weight_lgbm = 1.0 - weight_xgb

# Same layout as the inputs: one row per horizon, one column per stock.
final = weight_xgb * pred_xgb + weight_lgbm * pred_lgbm

In [None]:
# Score every column of the blended forecast table.
# NOTE(review): rows are presumably forecast horizons and columns stocks — confirm against the cell that builds `final`.
final_df = pd.DataFrame(final)

first_row = final_df.iloc[0]
last_row = final_df.iloc[-1]

# Relative change from the first forecast row to the last one
total_change = (last_row - first_row) / first_row
final_values = pd.DataFrame(total_change)

# Same change divided by the standard deviation of the row-to-row percentage
# changes, with the sign flipped.
step_volatility = final_df.pct_change().std()
final_values_sharpe = -pd.DataFrame(total_change / step_volatility)



In [None]:
# Attach the stock codes to the scores and expose them as a regular column:
# label rows with the codes, name the single score column, move the codes out
# of the index, then give both columns their final names.
final_values = (
    final_values
    .set_axis(unique_codes, axis=0)
    .set_axis(['VALUE'], axis=1)
    .reset_index()
    .set_axis(['종목코드', 'VALUE'], axis=1)
)

In [None]:
# Submit
# Build the rank table: the highest VALUE gets rank 1. Stocks without a usable
# VALUE (code missing from the table, or VALUE is NaN) are parked in the ranks
# directly above the bottom block, so they never land at either extreme.
N_TAIL_RANKS = 200  # size of the bottom block (ranks 1801-2000 when there are 2000 codes)

# Sort best-first. Rebinding (not inplace) keeps `final_values` sorted for any
# later cell while leaving the previously displayed object untouched.
final_values = final_values.sort_values('VALUE', ascending=False)

# Only rows with a real VALUE are ranked by value; .copy() so the column
# assignment below does not write into a slice of `final_values`.
baseline_submission = final_values[final_values['VALUE'].notna()].copy()

# Codes with no usable VALUE
nan_indices = np.setdiff1d(unique_codes, baseline_submission['종목코드'].values)

# Derive the rank boundaries from the data instead of assuming 2000 codes.
n_total = len(baseline_submission) + len(nan_indices)
n_tail = min(N_TAIL_RANKS, len(baseline_submission))
first_tail_rank = n_total - n_tail + 1

# Split the rank range into three consecutive parts:
# value-ranked head | parked NaN codes | value-ranked bottom block
before_nan_ranks = np.arange(1, first_tail_rank - len(nan_indices))
nan_ranks = np.arange(first_tail_rank - len(nan_indices), first_tail_rank)
after_nan_ranks = np.arange(first_tail_rank, n_total + 1)

# Assign the value-based ranks (rows are already sorted best-first)
baseline_submission['순위'] = np.concatenate([before_nan_ranks, after_nan_ranks])

# Rows for the codes without a usable VALUE (now ranked)
nan_vals = pd.DataFrame({
    '종목코드': nan_indices,
    'VALUE': np.full(len(nan_indices), np.nan),
    '순위': nan_ranks,
})

# Combine both groups and order by rank
baseline_submission = pd.concat([baseline_submission, nan_vals])
baseline_submission = baseline_submission.sort_values(by='순위')
# Ensure '순위' is of integer type
baseline_submission['순위'] = baseline_submission['순위'].astype('int')
baseline_submission = baseline_submission.drop(['VALUE'], axis = 1)

# Re-order the ranks to follow the code order of the sample submission file.
# The left join keeps every code listed in the sample file.
sample_submission = pd.read_csv('./open/sample_submission.csv')
baseline_submission = sample_submission[['종목코드']].merge(baseline_submission[['종목코드', '순위']], on='종목코드', how='left')


In [None]:
# Write the rank table built from `final_values` to disk, without the DataFrame index.
baseline_submission.to_csv('개별예측.csv', index=False)


In [None]:
# Display the negated, volatility-scaled scores before the stock codes are attached as a column.
final_values_sharpe


A060310   -4.316010
A095570    5.867212
A006840   -1.195872
A054620    0.235413
A265520   -0.769853
             ...   
A189980   -1.247725
A000540    1.448047
A003280    2.147592
A037440    1.012662
A238490   -3.400123
Length: 2000, dtype: float64

In [None]:
# Attach the stock codes to the volatility-scaled scores and expose them as a
# regular column: label rows with the codes, name the single score column,
# move the codes out of the index, then give both columns their final names.
final_values_sharpe = (
    final_values_sharpe
    .set_axis(unique_codes, axis=0)
    .set_axis(['VALUE'], axis=1)
    .reset_index()
    .set_axis(['종목코드', 'VALUE'], axis=1)
)

In [None]:
# Submit
# Build the rank table from the volatility-scaled scores: the highest VALUE
# gets rank 1. Stocks without a usable VALUE (code missing from the table, or
# VALUE is NaN, e.g. 0/0 when the forecasts are flat) are parked in the ranks
# directly above the bottom block, so they never land at either extreme.
N_TAIL_RANKS = 200  # size of the bottom block (ranks 1801-2000 when there are 2000 codes)

# Sort best-first. Rebinding (not inplace) keeps `final_values_sharpe` sorted
# for any later cell while leaving the previously displayed object untouched.
final_values_sharpe = final_values_sharpe.sort_values('VALUE', ascending=False)

# Only rows with a real VALUE are ranked by value; .copy() so the column
# assignment below does not write into a slice of `final_values_sharpe`.
baseline_submission = final_values_sharpe[final_values_sharpe['VALUE'].notna()].copy()

# Codes with no usable VALUE
nan_indices = np.setdiff1d(unique_codes, baseline_submission['종목코드'].values)

# Derive the rank boundaries from the data instead of assuming 2000 codes.
n_total = len(baseline_submission) + len(nan_indices)
n_tail = min(N_TAIL_RANKS, len(baseline_submission))
first_tail_rank = n_total - n_tail + 1

# Split the rank range into three consecutive parts:
# value-ranked head | parked NaN codes | value-ranked bottom block
before_nan_ranks = np.arange(1, first_tail_rank - len(nan_indices))
nan_ranks = np.arange(first_tail_rank - len(nan_indices), first_tail_rank)
after_nan_ranks = np.arange(first_tail_rank, n_total + 1)

# Assign the value-based ranks (rows are already sorted best-first)
baseline_submission['순위'] = np.concatenate([before_nan_ranks, after_nan_ranks])

# Rows for the codes without a usable VALUE (now ranked)
nan_vals = pd.DataFrame({
    '종목코드': nan_indices,
    'VALUE': np.full(len(nan_indices), np.nan),
    '순위': nan_ranks,
})

# Combine both groups and order by rank
baseline_submission = pd.concat([baseline_submission, nan_vals])
baseline_submission = baseline_submission.sort_values(by='순위')
# Ensure '순위' is of integer type
baseline_submission['순위'] = baseline_submission['순위'].astype('int')
baseline_submission = baseline_submission.drop(['VALUE'], axis = 1)

# Re-order the ranks to follow the code order of the sample submission file.
# The left join keeps every code listed in the sample file.
sample_submission = pd.read_csv('./open/sample_submission.csv')
baseline_submission = sample_submission[['종목코드']].merge(baseline_submission[['종목코드', '순위']], on='종목코드', how='left')


In [None]:
# Write the rank table built from `final_values_sharpe` (the sign-flipped, volatility-scaled score) to disk, without the DataFrame index.
baseline_submission.to_csv('개별예측_SHARPE_MINUS.csv', index=False)


