# Year column embedding
    1. Year-column embedding code that preserves the time-series properties of the data.
    2. stock별 시계열 패턴을 모델이 학습할 수 있도록 작성된 코드.

In [None]:
import pandas as pd
from xgboost import XGBRegressor

# Gradient-boosted regressor: histogram tree method on a CUDA device, with
# support for pandas 'category' columns switched on.
xgb_settings = {
    "enable_categorical": True,
    "tree_method": "hist",
    "device": "cuda",
}
model = XGBRegressor(**xgb_settings)
model

In [None]:
# Columns used as the regression targets (the y matrix).
target_columns = [
    'CETR',
    'GETR',
    'TSTA',
    'TSDA',
]

In [None]:
# Load CSV file
# Reads 'data.csv' (resolved against the current working directory) into the
# DataFrame `df` that every later cell works on. The bare `df` expression makes
# the notebook render the frame as the cell output.
df = pd.read_csv('data.csv')
df

In [None]:
# Columns that should carry pandas' 'category' dtype.
categorical_cols = ['name', 'stock', 'KOSPI', 'big4', 'LOSS', 'ind']
# Cast all of them in one call through a column -> dtype mapping.
df = df.astype({col: 'category' for col in categorical_cols})

print(df.dtypes)

In [None]:
# Build the feature matrix X and the target matrix y.
## 'name' is left out of the features together with the target columns.
dropped_cols = ['name', *target_columns]
X = df.drop(columns=dropped_cols)
y = df[target_columns]
X.shape, y.shape

In [None]:
# Time-series aware encoding & global cutoff split (2011~2023 train, 2024 test)
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

# 1. Year-based trend and cycle encodings (global and per stock)
year_min = df['year'].min()
year_max = df['year'].max()
year_span = year_max - year_min

# Global normalisation to [0, 1]. If every row shares the same year the span is
# 0 and a plain division would turn the whole column into NaN (0/0), so fall
# back to a constant 0.0 in that case.
if year_span > 0:
    df['year_norm_global'] = (df['year'] - year_min) / year_span
else:
    df['year_norm_global'] = 0.0

# Per-stock ordinal position of each year (0 for the stock's first year).
# observed=True is passed explicitly because 'stock' is a categorical key;
# it does not change these row-wise results.
df['year_index_stock'] = (
    df.groupby('stock', observed=True)['year'].rank(method='dense').astype(int) - 1
)

# Per-stock relative progress in [0, 1].
# A stock with a single distinct year gives 0/0 = NaN here; the value is left
# missing on purpose rather than guessed.
# NOTE(review): the per-stock maximum is taken over every row in df, including
# rows of any year that is later held out for testing - confirm this is intended.
last_index_per_stock = df.groupby('stock', observed=True)['year_index_stock'].transform('max')
df['year_norm_stock'] = df['year_index_stock'] / last_index_per_stock

# Cyclical encoding: the observed year range is treated as one 2*pi period.
# The period is the number of year slots (span + 1), not the span itself;
# dividing by the span would map the first and the last year onto the same
# (sin, cos) point, making them indistinguishable in these two features.
angle = 2 * np.pi * (df['year'] - year_min) / (year_span + 1)
df['year_sin'] = np.sin(angle)
df['year_cos'] = np.cos(angle)

# 2. One-step lag features: for every numeric feature column, the value of the
#    same stock's previous observation.
exclude_for_lag = {'name', 'stock', 'year', *target_columns}
num_cols_for_lag = [
    c for c in df.columns
    if c not in exclude_for_lag
    and not str(c).endswith('_lag1')  # re-running the cell must not lag a lag
    and df[c].dtype.kind in ('i', 'u', 'f')
]

# shift(1) means "previous row inside the group", so it only equals "previous
# observation in time" when each stock's rows are in chronological order.
# Sort a temporary view by year (stable, so ties keep file order) and group
# that; groupby keeps the row order inside each group. df itself is not
# reordered.
# NOTE: with gaps in a stock's years the lag is the previous *available* year,
# not necessarily year - 1.
by_stock_in_time = df.sort_values('year', kind='stable').groupby('stock', observed=True)
for col in num_cols_for_lag:
    # The shifted Series keeps the original row labels, so the assignment
    # aligns it back onto df by index (assumes df has a unique index, as the
    # default index from read_csv is).
    df[f'{col}_lag1'] = by_stock_in_time[col].shift(1)

# 3. Global cutoff split by year: rows before the cutoff year are used for
#    training, rows of the cutoff year itself form the test set.
CUTOFF_TEST_YEAR = 2024
train_mask = df['year'].lt(CUTOFF_TEST_YEAR)
full_test_mask = df['year'].eq(CUTOFF_TEST_YEAR)

train_df = df.loc[train_mask].copy()
test_full_df = df.loc[full_test_mask].copy()

# 4. Partition the test year by whether the stock also occurs in the training rows.
seen_stocks = set(train_df['stock'].unique())
is_seen = test_full_df['stock'].isin(seen_stocks)
seen_test_df = test_full_df.loc[is_seen].copy()
new_test_df = test_full_df.loc[~is_seen].copy()  # optional: evaluation on unseen stocks

# 5. Feature/target separation helper
def split_Xy(dataframe, targets=None, id_cols=('name', 'stock')):
    """Split a frame into a feature matrix and a target matrix.

    Args:
        dataframe: Frame holding identifier, feature and target columns.
            It is not modified.
        targets: Names of the target columns. When omitted, the
            notebook-level ``target_columns`` list is used, which is what
            the original helper always did.
        id_cols: Identifier columns that are removed from the features.

    Returns:
        A tuple ``(X_, y_)``: ``X_`` is ``dataframe`` without the identifier
        and target columns, ``y_`` holds only the target columns.

    Raises:
        KeyError: If an identifier or target column is missing.
    """
    if targets is None:
        # Looked up at call time so the default follows the notebook setting.
        targets = target_columns
    targets = list(targets)
    X_ = dataframe.drop(columns=[*id_cols, *targets])
    y_ = dataframe[targets]
    return X_, y_

X_train, y_train = split_Xy(train_df)
X_test_seen, y_test_seen = split_Xy(seen_test_df)
# Without any unseen-stock rows both placeholders stay None.
X_test_new, y_test_new = split_Xy(new_test_df) if len(new_test_df) else (None, None)

# 6. Fill missing lag values (e.g. a stock's first row has no predecessor)
#    with medians computed on the training set only.
lag_cols = [name for name in X_train.columns if name.endswith('_lag1')]
if lag_cols:
    lag_medians = X_train[lag_cols].median()
    for frame in (X_train, X_test_seen, X_test_new):
        if frame is None:
            continue
        frame[lag_cols] = frame[lag_cols].fillna(lag_medians)

# Diagnostics: year range, stock counts, matrix shapes and a feature preview.
first_train_year = int(train_df['year'].min())
last_train_year = int(train_df['year'].max())
print(f"Train years: {first_train_year}-{last_train_year} | Test year: {CUTOFF_TEST_YEAR}")
n_new_stocks = len(new_test_df['stock'].unique()) if len(new_test_df) else 0
print(f"Seen stocks count: {len(seen_stocks)} | New test stocks count: {n_new_stocks}")
new_shape = 'None' if X_test_new is None else X_test_new.shape
print("Shapes -> Train:", X_train.shape, "SeenTest:", X_test_seen.shape, "NewTest:", new_shape)
print("Lag feature count:", len(lag_cols))
print("Sample feature cols:", X_train.columns[:10].tolist())

# 7. Train the model, then score the test-year predictions
def _print_scores(tag, y_true, y_pred):
    """Print R2, MAE and RMSE for one evaluation subset on a single line."""
    print(tag, r2_score(y_true, y_pred),
          'MAE:', mean_absolute_error(y_true, y_pred),
          'RMSE:', root_mean_squared_error(y_true, y_pred))


model.fit(X_train, y_train)

# Stocks that also appear in the training rows.
pred_seen = model.predict(X_test_seen)
_print_scores('[Seen Stocks 2024] R2:', y_test_seen, pred_seen)

# Stocks that only appear in the test year (may be absent altogether).
if X_test_new is None or not len(X_test_new):
    print('No new (unseen) stocks present in 2024 test set.')
else:
    pred_new = model.predict(X_test_new)
    _print_scores('[New Stocks 2024] R2:', y_test_new, pred_new)

# 8. 결과 저장 (Seen test 기준)
import numpy as np
def _actual_vs_pred_frame(actual, predicted):
    """Lay actual and predicted target values side by side, one row per sample."""
    labels = [f'actual_{col}' for col in actual.columns]
    labels += [f'pred_{col}' for col in actual.columns]
    return pd.DataFrame(
        data=np.hstack([actual.values, predicted]),
        columns=labels,
        index=actual.index,
    )


# Seen-stock results are always written.
results_df = _actual_vs_pred_frame(y_test_seen, pred_seen)
results_df.to_csv('actual_vs_predicted_seen_2024.csv', index=True)
# New-stock results are written only when such rows exist.
if X_test_new is not None and len(X_test_new):
    results_new_df = _actual_vs_pred_frame(y_test_new, pred_new)
    results_new_df.to_csv('actual_vs_predicted_newStocks_2024.csv', index=True)


In [None]:
# Fit model
# NOTE(review): the time-series cell above already calls
# model.fit(X_train, y_train); this cell fits the same estimator on the same
# training matrices a second time.
model.fit(X_train, y_train)

In [None]:
# Metrics and scoring
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error
import numpy as np

# Make predictions on the full test-year set.
# No cell in this notebook defines `X_test` / `y_test` (the split above
# produces X_test_seen / X_test_new and y_test_seen / y_test_new), so using
# those names raised NameError. Rebuild the complete test set from its seen-
# and new-stock parts; both parts have already had their lag NaNs filled.
if X_test_new is not None and len(X_test_new):
    X_test = pd.concat([X_test_seen, X_test_new])
    y_test = pd.concat([y_test_seen, y_test_new])
else:
    X_test, y_test = X_test_seen, y_test_seen

predictions = model.predict(X_test)
actuals = y_test

# Save actual and predicted values to CSV: one row per test sample, with the
# actual_* columns followed by the matching pred_* columns.
results_df = pd.DataFrame(
    data=np.hstack([actuals.values, predictions]),
    columns=[f'actual_{col}' for col in actuals.columns] + [f'pred_{col}' for col in actuals.columns],
    index=actuals.index,
)
results_df.to_csv('actual_vs_predicted.csv', index=True)

# Evaluate the model
print("R^2 Score:", r2_score(actuals, predictions))
print("Mean Absolute Error:", mean_absolute_error(actuals, predictions))
print("Root Mean Squared Error:", root_mean_squared_error(actuals, predictions))

## Reference site
    지표 및 점수
    https://scikit-learn.org/stable/modules/model_evaluation.html#model-evaluation
    https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics
