# RandomForestRegression Year column embedding
    1. Year column embedding code to preserve time series properties.
    2. stock별 시계열패턴을 모델이 학습할수 있도록 작성된 코드.

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# RandomForestRegressor (multi-output 지원)
model = RandomForestRegressor(
    n_estimators=600,
    max_depth=None,            # 필요시 예: 20 로 제한
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,                 # 모든 코어 사용
    random_state=42,
    oob_score=False,
    verbose=0
)
model

0,1,2
,n_estimators,600
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [2]:
# Set target_columns
target_columns = ['CETR', 'GETR', 'TSTA', 'TSDA']

In [3]:
# Load CSV file
df = pd.read_csv('data.csv')
df

Unnamed: 0,name,stock,year,KOSPI,fnd_year,fiscal,ind,big4,forn,own,...,GETR5,CETR5,TSTA,TSDA,A_GETR,A_CETR,A_GETR3,A_CETR3,A_GETR5,A_CETR5
0,동화약품,20,2013,1,1897,12,21,1,0.0613,0.3114,...,0.294414,0.266121,-0.071863,-0.088363,-0.236479,0.748503,0.058765,-0.096756,0.034497,-0.019779
1,동화약품,20,2014,1,1897,12,21,1,0.0502,0.3151,...,0.233461,0.418783,-0.001226,-0.014614,-0.150710,0.748503,0.500993,0.733432,-0.026455,0.132883
2,동화약품,20,2015,1,1897,12,21,1,0.0749,0.3235,...,0.231593,0.528209,0.072994,0.056283,-0.171420,-0.024525,-0.195255,0.733432,-0.028324,0.242310
3,동화약품,20,2016,1,1897,12,21,1,0.0746,0.3229,...,0.306044,0.325968,0.106756,0.107570,0.026917,-0.194237,-0.174602,0.327249,0.046128,0.040069
4,동화약품,20,2017,1,1897,12,21,1,0.1186,0.3240,...,0.250474,0.216419,-0.061506,0.008744,0.042469,-0.157749,-0.014648,-0.184860,-0.009442,-0.069481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12648,잉글우드랩,950140,2023,0,2015,12,20,0,0.1248,0.4408,...,0.174191,0.211668,-0.317902,-0.362904,-0.122071,0.080095,-0.064306,0.002523,0.001916,0.012473
12649,잉글우드랩,950140,2024,0,2015,12,20,0,0.1358,0.4408,...,0.114758,0.256242,-0.358287,-0.384294,-0.123269,0.292943,-0.129741,0.004858,-0.091928,0.027420
12650,고스트스튜디오,950190,2022,0,2016,12,58,1,0.1374,0.4427,...,0.176417,0.172354,-0.109588,-0.137899,-0.055951,-0.122758,-0.039628,0.121765,-0.064150,-0.092251
12651,고스트스튜디오,950190,2023,0,2016,12,58,1,0.1445,0.5158,...,0.180101,0.204584,-0.225762,-0.237668,-0.034687,-0.052817,-0.041439,-0.021230,-0.060466,-0.060022


In [4]:
# Categorical 컬럼 설정 (CatBoost/XGBoost와 동일 구조 유지)
categorical_cols = ['name', 'stock', 'KOSPI', 'big4', 'LOSS', 'ind']
for col in categorical_cols:
    df[col] = df[col].astype('category')

print('Categorical dtypes:')
print(df[categorical_cols].dtypes)

# RandomForest는 category 직접 처리 불가 → 과도한 one-hot 대신 아래 전략 적용:
# 1) high-cardinality 후보(ind, name)는 원본 유지 후 name 은 이후 제거
# 2) 저카디널리티(KOSPI,big4,LOSS)는 one-hot(drop_first)
# 3) stock 고정효과는 통계 파생(평균, count 등)으로 제공 후 제거 가능

low_card_cols = ['KOSPI','big4','LOSS']
df = pd.get_dummies(df, columns=low_card_cols, drop_first=True)

# stock 등장 횟수 & stock별 연차(age)
stock_counts = df.groupby('stock')['year'].transform('count')
df['stock_freq'] = stock_counts

# 이후 year_index_stock 계산 시 함께 사용 (본 셀에서는 pass)
print('After minimal encoding shape:', df.shape)

Categorical dtypes:
name     category
stock    category
KOSPI    category
big4     category
LOSS     category
ind      category
dtype: object
After minimal encoding shape: (12653, 66)


  stock_counts = df.groupby('stock')['year'].transform('count')


In [6]:
# Time-series aware encoding & global cutoff split (2011~2023 train, 2024 test)
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

# 1. Year 기반 전역/종목별 트렌드 & 주기 인코딩
year_min = df['year'].min(); year_max = df['year'].max()
df['year_norm_global'] = (df['year'] - year_min) / ( (year_max - year_min) if year_max>year_min else 1 )
df['year_index_stock'] = df.groupby('stock')['year'].rank(method='dense').astype(int) - 1
df['year_norm_stock'] = df['year_index_stock'] / df.groupby('stock')['year_index_stock'].transform('max')
angle = 2 * np.pi * df['year_norm_global']
df['year_sin'] = np.sin(angle); df['year_cos'] = np.cos(angle)

# 2. 종목별 타겟 통계 (train 구간에서만 계산 → 누출 방지 위해 후에 merge)
CUTOFF_TEST_YEAR = 2024
train_mask_pre = df['year'] < CUTOFF_TEST_YEAR
train_part = df[train_mask_pre]

if 'stock' not in df.columns:
    raise ValueError("'stock' 컬럼이 존재하지 않습니다. 전처리 단계에서 제거/이름 변경 여부 확인 필요.")

# train 구간이 비어있지 않은지 확인
if train_part.empty:
    raise ValueError("Train 구간이 비어 있습니다. CUTOFF_TEST_YEAR 설정을 확인하세요.")

stock_target_mean = train_part.groupby('stock')[target_columns].mean().add_prefix('stock_mean_')
# 마지막 연도 값 (연도 순 정렬 후 각 stock별 마지막 row의 target 값)
stock_target_last_rows = train_part.sort_values('year').groupby('stock').tail(1)
stock_target_last = stock_target_last_rows.set_index('stock')[target_columns].add_prefix('stock_last_')

stock_feat = stock_target_mean.join(stock_target_last, how='outer').reset_index()  # 'stock'를 열로 변환

# merge (이전에 KeyError 발생 원인: stock_feat index 상태였음)
df = df.merge(stock_feat, on='stock', how='left')

# 3. Lag 특징 다중 생성
exclude_for_lag = set(['name','stock','year'] + target_columns)
num_cols_for_lag = [c for c in df.columns if c not in exclude_for_lag and df[c].dtype.kind in ['i','u','f']]
lag_list = [1,2,3]
for col in num_cols_for_lag:
    g = df.groupby('stock', observed=True)[col]
    for L in lag_list:
        df[f'{col}_lag{L}'] = g.shift(L)

# 4. YoY 변화율 (target 기반)
for col in target_columns:
    df[f'{col}_yoy'] = df.groupby('stock', observed=True)[col].pct_change().replace([np.inf,-np.inf], np.nan)

# 5. Rolling window 통계 (3연도)
for col in target_columns:
    g = df.groupby('stock', observed=True)[col]
    df[f'{col}_roll3_mean'] = g.rolling(3).mean().reset_index(level=0, drop=True)
    df[f'{col}_roll3_std'] = g.rolling(3).std().reset_index(level=0, drop=True)

# 6. 전역 컷오프 분리
full_test_mask = df['year'] == CUTOFF_TEST_YEAR
train_df = df[train_mask_pre].copy()
test_df = df[full_test_mask].copy()
seen_stocks = set(train_df['stock'].unique())
seen_test_df = test_df[test_df['stock'].isin(seen_stocks)].copy()

# 7. Feature / Target 분리 (최종 드롭)
feature_drop = ['name','stock'] + target_columns
X_train = train_df.drop(columns=[c for c in feature_drop if c in train_df.columns])
X_test_seen = seen_test_df.drop(columns=[c for c in feature_drop if c in seen_test_df.columns])
y_train = train_df[target_columns]
y_test_seen = seen_test_df[target_columns]

# 8. Lag/파생 결측 처리: train 중앙값으로 채움
lag_cols = [c for c in X_train.columns if 'lag' in c or c.endswith('_yoy') or 'roll3' in c]
if lag_cols:
    medians = X_train[lag_cols].median()
    X_train[lag_cols] = X_train[lag_cols].fillna(medians)
    X_test_seen[lag_cols] = X_test_seen[lag_cols].fillna(medians)

print(f"Train years: {int(train_df['year'].min())}-{int(train_df['year'].max())} | Test year: {CUTOFF_TEST_YEAR}")
print(f"Seen stocks count: {len(seen_stocks)}")
print("Shapes -> Train:", X_train.shape, "SeenTest:", X_test_seen.shape)
print("Added lag count:", len([c for c in X_train.columns if '_lag' in c]))
print("Target stat cols (sample):", [c for c in X_train.columns if c.startswith('stock_mean_')][:4])
print("Sample feature cols:", X_train.columns[:15].tolist())

  df['year_index_stock'] = df.groupby('stock')['year'].rank(method='dense').astype(int) - 1
  df['year_norm_stock'] = df['year_index_stock'] / df.groupby('stock')['year_index_stock'].transform('max')
  stock_target_mean = train_part.groupby('stock')[target_columns].mean().add_prefix('stock_mean_')
  stock_target_last_rows = train_part.sort_values('year').groupby('stock').tail(1)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] = g.shift(L)
  df[f'{col}_lag{L}'] =

Train years: 2011-2023 | Test year: 2024
Seen stocks count: 1693
Shapes -> Train: (11567, 289) SeenTest: (1025, 289)
Added lag count: 204
Target stat cols (sample): ['stock_mean_CETR', 'stock_mean_GETR', 'stock_mean_TSTA', 'stock_mean_TSDA']
Sample feature cols: ['year', 'fnd_year', 'fiscal', 'ind', 'forn', 'own', 'c_asset', 'inv', 'asset', 'sales', 'cogs', 'dep', 'tax', 'rec', 'ni']


In [7]:
# 7. 모델 학습 & 평가 (RandomForest 개선)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

model = RandomForestRegressor(
    n_estimators=900,
    max_depth=22,
    max_features='sqrt',
    min_samples_leaf=3,
    n_jobs=-1,
    random_state=42,
    verbose=0
)
model.fit(X_train, y_train)

pred_seen = model.predict(X_test_seen)  # (n_samples, n_targets)
print('[Overall] R2:', r2_score(y_test_seen, pred_seen),
      'MAE:', mean_absolute_error(y_test_seen, pred_seen),
      'RMSE:', root_mean_squared_error(y_test_seen, pred_seen))

# 타겟별 상세
print('\nPer-target metrics:')
for i, col in enumerate(target_columns):
    r2_i = r2_score(y_test_seen[col], pred_seen[:, i])
    mae_i = mean_absolute_error(y_test_seen[col], pred_seen[:, i])
    rmse_i = root_mean_squared_error(y_test_seen[col], pred_seen[:, i])
    print(f"  {col}: R2={r2_i:.4f} MAE={mae_i:.4f} RMSE={rmse_i:.4f}")

# 중요도
import pandas as pd
fi = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print('\nTop 25 Feature Importances:')
print(fi.head(25))

[Overall] R2: 0.7826527294603955 MAE: 0.06097245203526131 RMSE: 0.09685683515113391

Per-target metrics:
  CETR: R2=0.7402 MAE=0.0840 RMSE=0.1148
  GETR: R2=0.6245 MAE=0.0574 RMSE=0.0830
  TSTA: R2=0.8785 MAE=0.0527 RMSE=0.0959
  TSDA: R2=0.8874 MAE=0.0499 RMSE=0.0937

Top 25 Feature Importances:
A_CETR                  0.084905
stock_mean_TSTA         0.049119
stock_mean_TSDA         0.047521
year_sin                0.032894
year                    0.032286
year_norm_global        0.032238
A_GETR                  0.028547
stock_last_TSDA         0.027960
TSDA_roll3_mean         0.027530
stock_last_TSTA         0.026229
TSTA_roll3_mean         0.024648
stock_mean_TSDA_lag1    0.018832
stock_mean_TSTA_lag1    0.017997
CETR_yoy                0.017208
ROA                     0.015512
lag_asset               0.015473
SIZE                    0.014604
ROE                     0.014233
stock_last_TSTA_lag1    0.012609
asset                   0.012468
stock_last_TSDA_lag1    0.011250
CETR_roll

In [8]:
# 8. 결과 저장 (Seen test)
import numpy as np, pandas as pd
pred_df = pd.DataFrame(
    data = pred_seen,
    columns = [f'pred_{col}' for col in y_test_seen.columns],
    index = y_test_seen.index
)
actual_df = y_test_seen.add_prefix('actual_')
results_df = pd.concat([actual_df, pred_df], axis=1)
results_df.to_csv('rf_actual_vs_predicted_seen_2024.csv', index=True)
print('Saved: rf_actual_vs_predicted_seen_2024.csv')

# Per-target metrics 저장
metric_rows = []
for i, col in enumerate(target_columns):
    metric_rows.append({
        'target': col,
        'R2': r2_score(y_test_seen[col], pred_seen[:, i]),
        'MAE': mean_absolute_error(y_test_seen[col], pred_seen[:, i]),
        'RMSE': root_mean_squared_error(y_test_seen[col], pred_seen[:, i])
    })
metrics_df = pd.DataFrame(metric_rows)
metrics_df.to_csv('rf_metrics_seen_2024.csv', index=False)
print('Saved: rf_metrics_seen_2024.csv')

Saved: rf_actual_vs_predicted_seen_2024.csv
Saved: rf_metrics_seen_2024.csv


## Reference site
    지표 및 점수
    https://scikit-learn.org/stable/modules/model_evaluation.html#model-evaluation
    https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics
