## Stacking

In [None]:
import pandas as pd

# 데이터 로드
train = pd.read_csv('./train.csv')

# 연도를 2024로 설정하여 yymm 컬럼을 날짜 형식으로 변환
train['yymm'] = pd.to_datetime('2024' + train['yymm'], format='%Y%m%d %H:%M')

# day, hour, minute 컬럼 생성
train['day'] = train['yymm'].dt.day   # 일
train['hour'] = train['yymm'].dt.hour # 시
train['minute'] = train['yymm'].dt.minute # 분

# weekday 컬럼 생성 (요일 계산)
train['weekday'] = train['day'] % 7 # 1은 화요일, 2는 수요일, ...

# yymm 컬럼 삭제
train.drop('yymm', axis=1, inplace=True)

In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import optuna

# 피처와 타겟 설정
X = train.drop('Target', axis=1)
y = train['Target']

# 훈련 세트와 검증 세트 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 피처 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

def objective(trial):
    # 하이퍼파라미터 정의
    lasso_alpha = trial.suggest_float('lasso_alpha', 1e-4, 1e2, log=True)
    elastic_net_alpha = trial.suggest_float('elastic_net_alpha', 1e-4, 1e2, log=True)
    elastic_net_l1_ratio = trial.suggest_float('elastic_net_l1_ratio', 0, 1)
    svr_c = trial.suggest_float('svr_c', 1e-4, 1e2, log=True)
    random_state = trial.suggest_int('random_state', 0, 100)

    # 기본 모델 생성
    base_models = [
        ('lasso', Lasso(alpha=lasso_alpha, random_state=random_state, max_iter=10000)),  # max_iter 증가
        ('elastic_net', ElasticNet(alpha=elastic_net_alpha, l1_ratio=elastic_net_l1_ratio, random_state=random_state, max_iter=10000)),
        ('svr', SVR(C=svr_c))
    ]

    # 스태킹 리그레서 생성
    stacking_model = StackingRegressor(estimators=base_models, final_estimator=Lasso(random_state=random_state, max_iter=10000))

    # 모델 학습
    stacking_model.fit(X_train_scaled, y_train)

    # 교차 검증
    cv_scores = cross_val_score(stacking_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_absolute_error')
    return -cv_scores.mean()

# Optuna를 사용한 하이퍼파라미터 최적화
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# 최적의 하이퍼파라미터 출력
print("Best hyperparameters:", study.best_params)
print("Best score:", study.best_value)

# 최적의 하이퍼파라미터로 모델 재학습
best_params = study.best_params
model = StackingRegressor(
    estimators=[
        ('lasso', Lasso(alpha=best_params['lasso_alpha'], random_state=best_params['random_state'])),
        ('elastic_net', ElasticNet(alpha=best_params['elastic_net_alpha'], l1_ratio=best_params['elastic_net_l1_ratio'], random_state=best_params['random_state'])),
        ('svr', SVR(C=best_params['svr_c']))
    ],
    final_estimator=Lasso(random_state=best_params['random_state'])
)

# 모델 학습
model.fit(X_train_scaled, y_train)

# 훈련 데이터와 검증 데이터에 대한 예측
train_pred = model.predict(X_train_scaled)
val_pred = model.predict(X_val_scaled)

# 검증 데이터에 대한 MAE 계산
val_score = mean_absolute_error(y_val, val_pred)

print(f'Validation Score: {val_score}')


[I 2024-11-02 23:16:39,797] A new study created in memory with name: no-name-bb3f811d-43d4-4e9d-8ea5-dc5f5ebcb727
[I 2024-11-02 23:16:53,256] Trial 0 finished with value: 12.572532390811025 and parameters: {'lasso_alpha': 0.004533321368169926, 'elastic_net_alpha': 0.015698834328233144, 'elastic_net_l1_ratio': 0.5266177517033725, 'svr_c': 0.4796039057478442, 'random_state': 48}. Best is trial 0 with value: 12.572532390811025.
[I 2024-11-02 23:17:07,010] Trial 1 finished with value: 12.572091081175994 and parameters: {'lasso_alpha': 0.004864451323045446, 'elastic_net_alpha': 0.11456833111487263, 'elastic_net_l1_ratio': 0.3806965421367453, 'svr_c': 5.911544784595587, 'random_state': 76}. Best is trial 1 with value: 12.572091081175994.
[I 2024-11-02 23:17:21,667] Trial 2 finished with value: 12.572532390811025 and parameters: {'lasso_alpha': 29.77427303094066, 'elastic_net_alpha': 65.59088263344532, 'elastic_net_l1_ratio': 0.17092173945792022, 'svr_c': 0.00010580360522708637, 'random_state

Best hyperparameters: {'lasso_alpha': 2.906464730544162, 'elastic_net_alpha': 25.199059341655243, 'elastic_net_l1_ratio': 0.1209450418738051, 'svr_c': 16.981172120892825, 'random_state': 67}
Best score: 12.571186721568491
Validation Score: 12.399218985929549
