# XGBoostRegression Year column embedding
    1. Embeds the `year` column as additional features so that the time-series structure of the data is preserved.
    2. stock별 시계열 패턴을 모델이 학습할 수 있도록 작성된 코드.

In [None]:
import pandas as pd
from xgboost import XGBRegressor

# Gradient-boosted regressor used by the training / evaluation cells below.
model = XGBRegressor(
    # --- learning task ---
    objective="reg:squarederror",
    eval_metric="rmse",
    # --- tree construction ---
    tree_method="hist",
    device="cuda",
    enable_categorical=True,  # accept pandas `category` columns directly
    max_depth=6,
    # --- boosting schedule ---
    n_estimators=800,
    learning_rate=0.06,
    early_stopping_rounds=80,  # needs an eval_set when fit() is called
    # --- reproducibility / logging ---
    random_state=42,
    verbosity=1,
)
model

# NOTE: multi_strategy="multi_output_tree" was removed from the arguments
# above because that strategy is not supported on GPU (device="cuda").

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,'cuda'
,early_stopping_rounds,80
,enable_categorical,True


In [2]:
# Names of the columns the model predicts (used to build y and to label metrics).
target_columns = [
    "CETR",
    "GETR",
    "TSTA",
    "TSDA",
]

In [3]:
# Read the raw dataset from the working directory and display its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.shape

(12653, 65)

In [4]:
# Columns that hold identifiers / flags rather than quantities; they are cast
# to pandas `category` dtype in a single astype() call.
categorical_cols = ['name', 'stock', 'KOSPI', 'big4', 'LOSS', 'ind']
df = df.astype({col: 'category' for col in categorical_cols})

print(df.dtypes)

name        category
stock       category
year           int64
KOSPI       category
fnd_year       int64
              ...   
A_CETR       float64
A_GETR3      float64
A_CETR3      float64
A_GETR5      float64
A_CETR5      float64
Length: 65, dtype: object


In [5]:
# Number of distinct stocks in the full frame
unique_stocks_df = df['stock'].nunique()
print('전체 df 고유 stock 개수:', unique_stocks_df)

# If the split cell has already been run, also report the count for train_df
if 'train_df' in globals():
    unique_stocks_train = train_df['stock'].nunique()
    print('train_df 고유 stock 개수:', unique_stocks_train)

# Show up to ten example stock identifiers; categorical columns expose them
# through `.cat.categories`, any other dtype through `.unique()`.
if isinstance(df['stock'].dtype, pd.CategoricalDtype):
    print('stock 카테고리 예시 (앞 10개):', df['stock'].cat.categories[:10].tolist())
else:
    print('stock 값 예시 (앞 10개):', df['stock'].unique()[:10])

전체 df 고유 stock 개수: 1754
stock 카테고리 예시 (앞 10개): [20, 50, 70, 80, 100, 120, 140, 150, 180, 210]


In [6]:
# Feature matrix: every column except the company name and the targets
X = df.drop(columns=['name', *target_columns])
# Target matrix: one column per entry of target_columns
y = df[target_columns]
X.shape, y.shape

((12653, 60), (12653, 4))

In [7]:
# Time-series aware encoding & global cutoff split (2011~2023 train, 2024 test)
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

# ---- CONFIG ----
STRICT_LAG = True  # True: keep only rows whose previous calendar year (year - 1) exists for the same stock

# 0. Put every stock's rows in ascending year order. The grouped shift()/diff()
#    calls below are positional, so lag features and year gaps are only correct
#    when rows are chronologically ordered. Index labels are preserved.
df = df.sort_values(['stock', 'year'], kind='stable')

# 1. Year-based global / per-stock trend and cycle encodings
year_min = df['year'].min()
year_max = df['year'].max()
year_span = year_max - year_min

# Global normalisation to [0, 1]; a single-year dataset maps to 0 instead of 0/0
if year_span:
    df['year_norm_global'] = (df['year'] - year_min) / year_span
else:
    df['year_norm_global'] = 0.0

# Per-stock observation index (0, 1, 2, ... in year order)
df['year_index_stock'] = df.groupby('stock', observed=True)['year'].rank(method='dense').astype(int) - 1
# Per-stock relative progress in [0, 1] (0/0 -> NaN for stocks with one year).
# NOTE(review): the denominator is each stock's LAST index over the whole
# frame, including the test year, so training rows encode how many more years
# the stock will appear in the data (look-ahead). Consider a causal variant.
df['year_norm_stock'] = df['year_index_stock'] / df.groupby('stock', observed=True)['year_index_stock'].transform('max')

# Cyclical encoding: the whole observed year span is treated as one 2*pi period.
# NOTE(review): with a single period, the first and last year map to (almost)
# the same (sin, cos) pair.
angle = 2 * np.pi * df['year_norm_global']
df['year_sin'] = np.sin(angle)
df['year_cos'] = np.cos(angle)

# 2. One-step lag features, built on the full frame (before any filtering) so
#    that a row removed later can still supply the lag value of the next year.
#    Columns produced by a previous run of this cell ('*_lag1', 'year_gap') are
#    skipped so that re-running the cell does not keep growing the feature set.
exclude_for_lag = {'name', 'stock', 'year', 'year_gap', *target_columns}
num_cols_for_lag = [
    c for c in df.columns
    if c not in exclude_for_lag
    and not c.endswith('_lag1')
    and df[c].dtype.kind in ('i', 'u', 'f')
]
if num_cols_for_lag:
    # Shift all columns in one grouped call and attach them with one concat
    # instead of inserting the columns one at a time.
    lagged = df.groupby('stock', observed=True)[num_cols_for_lag].shift(1).add_suffix('_lag1')
    df = pd.concat([df.drop(columns=lagged.columns, errors='ignore'), lagged], axis=1)

# 2-1. Gap to the previous observation of the same stock (1 == consecutive years)
df['year_gap'] = df.groupby('stock', observed=True)['year'].diff()
df['has_prev_year'] = df['year_gap'].eq(1)  # True/False

# 3. Global cutoff split (train: years before the cutoff, test: the cutoff year)
CUTOFF_TEST_YEAR = 2024
train_mask_full = df['year'] < CUTOFF_TEST_YEAR
full_test_mask = df['year'] == CUTOFF_TEST_YEAR

train_df = df[train_mask_full].copy()
test_df = df[full_test_mask].copy()

# 4. Test set: keep only stocks that also have pre-cutoff rows ("seen" stocks)
seen_stocks = set(train_df['stock'].unique())
seen_test_df = test_df[test_df['stock'].isin(seen_stocks)].copy()

# 4-1. STRICT_LAG: keep only rows whose previous observation is exactly one year earlier
if STRICT_LAG:
    before_train_rows = len(train_df)
    before_test_rows = len(seen_test_df)
    # Train: drops each stock's first row and the first row after any gap
    train_df = train_df[train_df['has_prev_year']].copy()
    # Test: cutoff-year rows whose stock also has a row for the year before
    seen_test_df = seen_test_df[seen_test_df['has_prev_year']].copy()
    removed_train = before_train_rows - len(train_df)
    removed_test = before_test_rows - len(seen_test_df)
else:
    removed_train = 0
    removed_test = 0

# Fail here with a clear message rather than later inside fit()/metrics
if train_df.empty or seen_test_df.empty:
    raise ValueError(
        f"Empty split after filtering: train rows={len(train_df)}, "
        f"seen test rows={len(seen_test_df)} (CUTOFF_TEST_YEAR={CUTOFF_TEST_YEAR}, "
        f"STRICT_LAG={STRICT_LAG})"
    )

# 5. Feature / target split
feature_drop = ['name', 'stock'] + target_columns
X_train = train_df.drop(columns=feature_drop)
X_test_seen = seen_test_df.drop(columns=feature_drop)
y_train = train_df[target_columns]
y_test_seen = seen_test_df[target_columns]

# 6. Fill any remaining lag NaNs. Medians are computed on the training rows
#    only and reused for the test rows, so no test statistics are used.
lag_cols = [c for c in X_train.columns if c.endswith('_lag1')]
if lag_cols:
    lag_medians = X_train[lag_cols].median()
    X_train[lag_cols] = X_train[lag_cols].fillna(lag_medians)
    X_test_seen[lag_cols] = X_test_seen[lag_cols].fillna(lag_medians)

In [8]:
# Summary of the split produced by the preprocessing cell
first_train_year = int(train_df['year'].min())
last_train_year = int(train_df['year'].max())
print(f"Train years: {first_train_year}-{last_train_year} | Test year: {CUTOFF_TEST_YEAR}")
print(f"Cutoff and Seen stocks count in train_df: {len(seen_stocks)}")
print(f"STRICT_LAG: {STRICT_LAG} | Removed train rows: {removed_train} | Removed test rows: {removed_test}")
print(f"Shapes -> Train: {X_train.shape} SeenTest: {X_test_seen.shape}")
print(f"Lag feature count: {len(lag_cols)}")
print(f"Sample feature cols: {X_train.columns[:10].tolist()}")

Train years: 2012-2023 | Test year: 2024
Cutoff and Seen stocks count in train_df: 1693
STRICT_LAG: True | Removed train rows: 2472 | Removed test rows: 96
Shapes -> Train: (9095, 125) SeenTest: (929, 125)
Lag feature count: 59
Sample feature cols: ['year', 'KOSPI', 'fnd_year', 'fiscal', 'ind', 'big4', 'forn', 'own', 'c_asset', 'inv']


In [9]:
# 7. Train and evaluate (test-year rows of seen stocks only)
#
# XGBoost applies early stopping to the LAST entry of eval_set. Passing the
# test rows there would choose the number of trees on the test data and make
# the reported scores optimistic, so the most recent training year is held out
# as a validation fold and the test rows are only used for the final scoring.
# Trade-off: that last training year is not used for fitting.
#
# NOTE(review): the recorded "GPU is not yet supported for vector leaf" error
# presumably comes from a `model` created with
# multi_strategy="multi_output_tree" on device="cuda"; re-create the model
# without that option before running this cell.
val_year = train_df['year'].max()
# X_train / y_train were derived from train_df without reordering rows, so a
# positional boolean mask built from train_df lines up with both.
is_val = (train_df['year'] == val_year).to_numpy()
if is_val.all():
    raise ValueError(
        "Need at least two distinct training years to hold one out for early stopping."
    )

X_fit, y_fit = X_train[~is_val], y_train[~is_val]
X_val, y_val = X_train[is_val], y_train[is_val]

model.fit(X_fit, y_fit, eval_set=[(X_fit, y_fit), (X_val, y_val)], verbose=10)

pred_seen = model.predict(X_test_seen)
print('[Seen Stocks 2024] R2:', r2_score(y_test_seen, pred_seen),
      'MAE:', mean_absolute_error(y_test_seen, pred_seen),
      'RMSE:', root_mean_squared_error(y_test_seen, pred_seen))

XGBoostError: [22:58:43] /workspace/src/gbm/gbtree.cc:208: Check failed: ctx_->IsCPU(): GPU is not yet supported for vector leaf.
Stack trace:
  [bt] (0) /home/super/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x2a6e7c) [0x75402c2a6e7c]
  [bt] (1) /home/super/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x63d03b) [0x75402c63d03b]
  [bt] (2) /home/super/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x68d33e) [0x75402c68d33e]
  [bt] (3) /home/super/.local/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x77) [0x75402c1b6f57]
  [bt] (4) /home/super/anaconda3/envs/dh_xgboost/lib/python3.10/lib-dynload/../../libffi.so.8(+0x6d8a) [0x7541ade82d8a]
  [bt] (5) /home/super/anaconda3/envs/dh_xgboost/lib/python3.10/lib-dynload/../../libffi.so.8(+0x61cd) [0x7541ade821cd]
  [bt] (6) /home/super/anaconda3/envs/dh_xgboost/lib/python3.10/lib-dynload/../../libffi.so.8(ffi_call+0xcd) [0x7541ade8291d]
  [bt] (7) /home/super/anaconda3/envs/dh_xgboost/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x91e7) [0x7541ac7d51e7]
  [bt] (8) /home/super/anaconda3/envs/dh_xgboost/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x869b) [0x7541ac7d469b]



In [None]:
# Per-target breakdown of the seen-stock test metrics
print('\nPer-target metrics:')
scorers = (
    ('R2', r2_score),
    ('MAE', mean_absolute_error),
    ('RMSE', root_mean_squared_error),
)
for idx, target in enumerate(target_columns):
    # Column idx of the prediction matrix corresponds to target_columns[idx]
    truth, guess = y_test_seen[target], pred_seen[:, idx]
    summary = ' '.join(f"{label}={score(truth, guess):.4f}" for label, score in scorers)
    print(f"  {target}: {summary}")

# Feature importances, largest first
importances = pd.Series(data=model.feature_importances_, index=X_train.columns)
fi = importances.sort_values(ascending=False)
print('\nTop 25 Feature Importances:')
print(fi.iloc[:25])

In [None]:
import matplotlib.pyplot as plt

# Per-target metrics on the seen-stock test set (rows: targets, columns: metrics)
score_fns = {'R2': r2_score, 'MAE': mean_absolute_error, 'RMSE': root_mean_squared_error}
metrics = {
    label: [score(y_test_seen[col], pred_seen[:, i]) for i, col in enumerate(target_columns)]
    for label, score in score_fns.items()
}

metrics_df = pd.DataFrame(metrics, index=target_columns)
print(metrics_df)

# One bar chart per metric, with the value written at the tip of each bar
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
for ax, metric in zip(axes, metrics_df.columns):
    metrics_df[metric].plot(kind='bar', ax=ax, color='#4C72B0')
    ax.set_title(metric)
    ax.set_xticklabels(metrics_df.index, rotation=45, ha='right')
    ax.grid(axis='y', linestyle='--', alpha=0.4)
    for p in ax.patches:
        val = p.get_height()
        # Negative bars (e.g. R2 < 0) get their label below the tip so the
        # text does not overlap the bar itself.
        ax.annotate(f'{val:.3f}', (p.get_x() + p.get_width()/2, val),
                    ha='center', va='bottom' if val >= 0 else 'top',
                    fontsize=8, rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# 8. Persist the seen-stock test results
import numpy as np

# Actual and predicted values side by side, keyed by the original row index.
# Both blocks go through one NumPy array so they share a single float dtype.
actual_names = [f'actual_{col}' for col in y_test_seen.columns]
pred_names = [f'pred_{col}' for col in y_test_seen.columns]
results_df = pd.DataFrame(
    np.concatenate([y_test_seen.values, pred_seen], axis=1),
    index=y_test_seen.index,
    columns=actual_names + pred_names,
)
results_df.to_csv('actual_vs_predicted_seen_2024.csv', index=True)
print('Saved: actual_vs_predicted_seen_2024.csv')

# Per-target metrics, one row per target
metric_rows = [
    {
        'target': col,
        'R2': r2_score(y_test_seen[col], pred_seen[:, i]),
        'MAE': mean_absolute_error(y_test_seen[col], pred_seen[:, i]),
        'RMSE': root_mean_squared_error(y_test_seen[col], pred_seen[:, i]),
    }
    for i, col in enumerate(target_columns)
]
metrics_df = pd.DataFrame(metric_rows)
metrics_df.to_csv('xgboost_metrics_seen_2024.csv', index=False)
print('Saved: xgboost_metrics_seen_2024.csv')

## Reference site
    지표 및 점수
    https://scikit-learn.org/stable/modules/model_evaluation.html#model-evaluation
    https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

    [Training_Parameters]
    1. https://xgboost.readthedocs.io/en/release_3.0.0/parameter.html