## Basic Data

### Voting

In [1]:
import pandas as pd

# 데이터 로드
train = pd.read_csv('./train.csv')

train.drop('yymm', axis=1, inplace=True)

# 결과 출력
train.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V18,V19,V20,V21,V22,V23,V24,V25,V26,Target
0,-5.327,12.25,-3.294,-7.855,-1.196,13.824,-10.249,-3.04,-5.17,8.077,...,0.103,1.001,-5.861,-27.695,-9.978,-2.689,-0.951,-3.873,0.471,44.521
1,-5.267,12.916,-3.22,-7.788,-1.196,14.424,-10.249,-3.04,-4.97,8.027,...,0.073,0.935,-5.881,-37.695,-10.038,-2.652,-1.018,-3.503,0.361,35.027
2,-5.127,13.583,-3.13,-7.658,-1.196,15.081,-10.359,-3.04,-4.83,7.977,...,0.013,0.905,-5.891,-37.695,-10.001,-2.652,-1.051,-3.436,0.361,13.92
3,-5.06,14.25,-3.13,-7.532,-1.196,14.961,-10.359,-3.04,-4.83,7.927,...,-0.02,0.845,-5.911,-37.695,-10.028,-2.552,-1.111,-3.346,0.261,28.41
4,-4.967,14.916,-3.094,-7.462,-1.196,15.454,-10.359,-3.04,-4.97,7.877,...,-0.087,0.811,-5.931,-37.695,-10.111,-2.619,-1.141,-3.346,0.261,1.647
5,-4.967,15.583,-3.02,-7.388,-1.196,15.284,-10.419,-3.04,-4.86,7.827,...,-0.087,0.745,-5.941,-37.695,-10.111,-2.689,-1.208,-3.346,0.171,6.36
6,-4.827,16.25,-2.92,-7.288,-1.196,15.351,-10.449,-3.04,-4.933,7.777,...,-0.153,0.695,-5.961,-47.695,-10.171,-2.762,-1.275,-3.346,0.171,34.535
7,-4.797,16.25,-2.92,-7.222,-1.196,14.188,-10.516,-3.04,-4.86,7.727,...,-0.187,0.645,-6.111,-37.695,-10.478,-2.689,-1.341,-3.206,0.071,21.335
8,-4.737,16.25,-2.83,-7.188,-1.196,14.048,-10.659,-3.04,-4.933,7.677,...,-0.26,0.645,-6.261,-37.695,-10.744,-2.689,-1.451,-3.073,0.004,34.687
9,-4.9,16.25,-2.89,-7.188,-1.196,14.014,-10.659,-3.04,-4.43,7.627,...,-0.297,0.535,-6.411,-37.695,-10.941,-2.792,-1.551,-2.706,0.038,34.136


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
lr = LinearRegression()
ridge = Ridge(random_state=42)
lasso = Lasso(random_state=42)
elastic = ElasticNet(random_state=42)
svr = SVR()
gb = GradientBoostingRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
lgbm = LGBMRegressor(random_state=42)
dt = DecisionTreeRegressor(random_state=42)

voting1 = VotingRegressor([
    ('lasso', lasso),
    ('svr', svr),
    ('random_forest', rf)
])

voting2 = VotingRegressor([
    ('lasso', lasso),
    ('elastic', elastic),
    ('svr', svr),
])

voting3 = VotingRegressor([
    ('svr', svr),
    ('random_forest', rf),
    ('xgb', xgb),
])

models = {
    'Voting1': voting1,
    'Voting2': voting2,
    'Voiting3': voting3
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
for model_name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring=mae_scorer)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    train_score = mean_absolute_error(y_train, model.predict(X_train))
    test_score = mean_absolute_error(y_test, y_pred)

    print(f'{model_name}: {scores.mean()} / {train_score} / {test_score}')

Voting1: 12.539424804561312 / 9.889203428498147 / 12.506113838706884
Voting2: 12.537481846367452 / 12.514234348019821 / 12.457180035475115
Voiting3: 12.653674805670764 / 7.291876288187345 / 12.763322403492216


In [None]:
import optuna
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error
from optuna.samplers import TPESampler

# 데이터 분할
X = train.drop('Target', axis=1)  # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']               # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# Optuna 목적 함수 정의
def objective(trial):
    # 하이퍼파라미터 선택
    lasso_alpha = trial.suggest_float('lasso_alpha', 0.001, 1.0)
    elastic_alpha = trial.suggest_float('elastic_alpha', 0.001, 1.0)
    elastic_l1_ratio = trial.suggest_float('elastic_l1_ratio', 0.1, 0.9)
    
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 300)
    rf_max_depth = trial.suggest_int('rf_max_depth', 3, 20)
    rf_min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 10)
    rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 1, 10)
    rf_max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None])
    rf_random_state = trial.suggest_int('rf_random_state', 1, 100)

    # 모델 정의
    lasso = Lasso(alpha=lasso_alpha, random_state=rf_random_state)
    elastic = ElasticNet(alpha=elastic_alpha, l1_ratio=elastic_l1_ratio, random_state=rf_random_state)
    rf = RandomForestRegressor(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        min_samples_leaf=rf_min_samples_leaf,
        max_features=rf_max_features,
        random_state=rf_random_state
    )
    
    # VotingRegressor 정의
    voting = VotingRegressor([('lasso', lasso), ('elastic', elastic), ('rf', rf)])
    
    # 5-fold 교차 검증으로 평가
    scores = cross_val_score(voting, X_train, y_train, cv=5, scoring=mae_scorer)
    return scores.mean()

# Optuna 최적화 실행
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(objective, n_trials=500)

# 최적 하이퍼파라미터 출력
print("Best parameters:", study.best_params)

# 최적 하이퍼파라미터로 VotingRegressor 학습
best_lasso = Lasso(alpha=study.best_params['lasso_alpha'], random_state=study.best_params['rf_random_state'])
best_elastic = ElasticNet(alpha=study.best_params['elastic_alpha'], l1_ratio=study.best_params['elastic_l1_ratio'], random_state=study.best_params['rf_random_state'])
best_rf = RandomForestRegressor(
    n_estimators=study.best_params['rf_n_estimators'],
    max_depth=study.best_params['rf_max_depth'],
    min_samples_split=study.best_params['rf_min_samples_split'],
    min_samples_leaf=study.best_params['rf_min_samples_leaf'],
    max_features=study.best_params['rf_max_features'],
    random_state=study.best_params['rf_random_state']
)

voting_best = VotingRegressor([('lasso', best_lasso), ('elastic', best_elastic), ('rf', best_rf)])

# 최적화된 모델 학습 및 평가
voting_best.fit(X_train, y_train)
y_pred = voting_best.predict(X_test)

train_score = mean_absolute_error(y_train, voting_best.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'Optimized Voting Regressor - Train MAE: {train_score}, Test MAE: {test_score}')

[I 2024-10-28 15:17:44,101] A new study created in memory with name: no-name-5d7855ea-bb0b-482d-bb2d-3f3cbc34c7a9
[I 2024-10-28 15:17:51,776] Trial 0 finished with value: 12.61063206598244 and parameters: {'lasso_alpha': 0.05359203960529344, 'elastic_alpha': 0.9663744177171892, 'elastic_l1_ratio': 0.1771469623971088, 'rf_n_estimators': 230, 'rf_max_depth': 20, 'rf_min_samples_split': 8, 'rf_min_samples_leaf': 2, 'rf_max_features': 'sqrt', 'rf_random_state': 56}. Best is trial 0 with value: 12.61063206598244.
[I 2024-10-28 15:17:53,310] Trial 1 finished with value: 12.583079694426269 and parameters: {'lasso_alpha': 0.14864221436540087, 'elastic_alpha': 0.3609806544260413, 'elastic_l1_ratio': 0.13384186420595334, 'rf_n_estimators': 135, 'rf_max_depth': 4, 'rf_min_samples_split': 3, 'rf_min_samples_leaf': 2, 'rf_max_features': 'sqrt', 'rf_random_state': 38}. Best is trial 1 with value: 12.583079694426269.
[I 2024-10-28 15:17:55,278] Trial 2 finished with value: 12.59021023262496 and param

Best parameters: {'lasso_alpha': 0.6326730218311353, 'elastic_alpha': 0.8613078453165357, 'elastic_l1_ratio': 0.8888224759362712, 'rf_n_estimators': 88, 'rf_max_depth': 9, 'rf_min_samples_split': 10, 'rf_min_samples_leaf': 6, 'rf_max_features': None, 'rf_random_state': 60}
Optimized Voting Regressor - Train MAE: 12.072384525863281, Test MAE: 12.488797325007173


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
# {'lasso_alpha': 0.6326730218311353, 'elastic_alpha': 0.8613078453165357, 'elastic_l1_ratio': 0.8888224759362712, 'rf_n_estimators': 88, 'rf_max_depth': 9, 'rf_min_samples_split': 10, 'rf_min_samples_leaf': 6, 'rf_max_features': None, 'rf_random_state': 60}

lasso = Lasso(alpha=0.6326730218311353, random_state=60)
elastic = ElasticNet(alpha=0.8613078453165357, l1_ratio=0.8888224759362712, random_state=60)
rf = RandomForestRegressor(
    n_estimators=88,
    max_depth=9,
    min_samples_split=10,
    min_samples_leaf=6,
    max_features=None,
    random_state=60
)

voting = VotingRegressor([('lasso', lasso), ('elastic', elastic), ('rf', rf)])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
scores = cross_val_score(voting, X, y, cv=5, scoring=mae_scorer)

voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)

train_score = mean_absolute_error(y_train, voting.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'{scores.mean()} / {train_score} / {test_score}')

12.549272939789832 / 12.072384525863281 / 12.488797325007173


## Basic Time Data

In [None]:
import pandas as pd

# 데이터 로드
train = pd.read_csv('./train.csv')

# yymm 컬럼을 날짜 형식으로 변환 (연도는 임의로 설정)
train['yymm'] = pd.to_datetime('2024' + train['yymm'], format='%Y%m%d %H:%M')

# day, hour, minute 컬럼 생성
train['day'] = train['yymm'].dt.day         # 일
train['hour'] = train['yymm'].dt.hour       # 시
train['minute'] = train['yymm'].dt.minute   # 분

# weekday 컬럼 생성
train['weekday'] = train['day'] % 7         # 요일 (0: 월요일, 1: 화요일, ..., 6: 일요일)

# yymm 컬럼 삭제
train.drop('yymm', axis=1, inplace=True)

# 결과 출력
train.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,Target,day,hour,minute,weekday
0,-5.327,12.25,-3.294,-7.855,-1.196,13.824,-10.249,-3.04,-5.17,8.077,16.198,-1.101,-0.067,-8.412,-0.592,-4.153,23.669,0.103,1.001,-5.861,-27.695,-9.978,-2.689,-0.951,-3.873,0.471,44.521,1,0,0,1
1,-5.267,12.916,-3.22,-7.788,-1.196,14.424,-10.249,-3.04,-4.97,8.027,16.198,-1.168,-0.067,-8.532,-0.592,-4.079,21.669,0.073,0.935,-5.881,-37.695,-10.038,-2.652,-1.018,-3.503,0.361,35.027,1,0,10,1
2,-5.127,13.583,-3.13,-7.658,-1.196,15.081,-10.359,-3.04,-4.83,7.977,16.198,-1.168,-0.067,-8.642,-0.665,-3.953,19.669,0.013,0.905,-5.891,-37.695,-10.001,-2.652,-1.051,-3.436,0.361,13.92,1,0,20,1
3,-5.06,14.25,-3.13,-7.532,-1.196,14.961,-10.359,-3.04,-4.83,7.927,26.198,-1.168,-0.067,-8.762,-0.592,-3.953,17.669,-0.02,0.845,-5.911,-37.695,-10.028,-2.552,-1.111,-3.346,0.261,28.41,1,0,30,1
4,-4.967,14.916,-3.094,-7.462,-1.196,15.454,-10.359,-3.04,-4.97,7.877,16.198,-1.168,-0.067,-8.882,-0.629,-3.916,15.669,-0.087,0.811,-5.931,-37.695,-10.111,-2.619,-1.141,-3.346,0.261,1.647,1,0,40,1
5,-4.967,15.583,-3.02,-7.388,-1.196,15.284,-10.419,-3.04,-4.86,7.827,16.198,-1.168,-0.134,-8.992,-0.702,-3.916,13.669,-0.087,0.745,-5.941,-37.695,-10.111,-2.689,-1.208,-3.346,0.171,6.36,1,0,50,1
6,-4.827,16.25,-2.92,-7.288,-1.196,15.351,-10.449,-3.04,-4.933,7.777,16.198,-1.268,-0.167,-9.112,-0.702,-3.953,11.669,-0.153,0.695,-5.961,-47.695,-10.171,-2.762,-1.275,-3.346,0.171,34.535,1,1,0,1
7,-4.797,16.25,-2.92,-7.222,-1.196,14.188,-10.516,-3.04,-4.86,7.727,26.198,-1.268,-0.167,-9.242,-0.769,-3.983,9.169,-0.187,0.645,-6.111,-37.695,-10.478,-2.689,-1.341,-3.206,0.071,21.335,1,1,10,1
8,-4.737,16.25,-2.83,-7.188,-1.196,14.048,-10.659,-3.04,-4.933,7.677,16.198,-1.268,-0.167,-9.382,-0.802,-4.043,6.669,-0.26,0.645,-6.261,-37.695,-10.744,-2.689,-1.451,-3.073,0.004,34.687,1,1,20,1
9,-4.9,16.25,-2.89,-7.188,-1.196,14.014,-10.659,-3.04,-4.43,7.627,6.198,-1.268,-0.234,-9.512,-0.802,-4.013,4.169,-0.297,0.535,-6.411,-37.695,-10.941,-2.792,-1.551,-2.706,0.038,34.136,1,1,30,1


In [None]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
import optuna
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error
from optuna.samplers import TPESampler
# 데이터 로드
train = pd.read_csv('./train.csv')

# yymm 컬럼을 날짜 형식으로 변환 (연도는 임의로 설정)
train['yymm'] = pd.to_datetime('2024' + train['yymm'], format='%Y%m%d %H:%M')

# day, hour, minute 컬럼 생성
train['day'] = train['yymm'].dt.day         # 일
train['hour'] = train['yymm'].dt.hour       # 시
train['minute'] = train['yymm'].dt.minute   # 분

# weekday 컬럼 생성
train['weekday'] = train['day'] % 7         # 요일 (0: 월요일, 1: 화요일, ..., 6: 일요일)

# yymm 컬럼 삭제
train.drop('yymm', axis=1, inplace=True)

# 데이터 분할
X = train.drop('Target', axis=1)  # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']               # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# Optuna 목적 함수 수정
def objective(trial):
    # 하이퍼파라미터 선택
    lasso_alpha = trial.suggest_float('lasso_alpha', 0.001, 1.0)
    elastic_alpha = trial.suggest_float('elastic_alpha', 0.001, 1.0)
    elastic_l1_ratio = trial.suggest_float('elastic_l1_ratio', 0.1, 0.9)

    rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 300)
    rf_max_depth = trial.suggest_int('rf_max_depth', 3, 20)
    rf_min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 10)
    rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 1, 10)
    rf_max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None])
    rf_random_state = trial.suggest_int('rf_random_state', 1, 100)
    
    gbr_n_estimators = trial.suggest_int('gbr_n_estimators', 50, 300)
    gbr_max_depth = trial.suggest_int('gbr_max_depth', 3, 20)
    gbr_learning_rate = trial.suggest_float('gbr_learning_rate', 0.01, 0.3)
    
    svr_C = trial.suggest_float('svr_C', 0.1, 10)
    svr_epsilon = trial.suggest_float('svr_epsilon', 0.01, 1.0)

    # 모델 정의
    lasso = Lasso(alpha=lasso_alpha, random_state=rf_random_state)
    elastic = ElasticNet(alpha=elastic_alpha, l1_ratio=elastic_l1_ratio, random_state=rf_random_state)
    rf = RandomForestRegressor(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        min_samples_leaf=rf_min_samples_leaf,
        max_features=rf_max_features,
        random_state=rf_random_state
    )
    gbr = GradientBoostingRegressor(
        n_estimators=gbr_n_estimators,
        max_depth=gbr_max_depth,
        learning_rate=gbr_learning_rate,
        random_state=rf_random_state
    )
    svr = SVR(C=svr_C, epsilon=svr_epsilon)

    # VotingRegressor 정의
    voting = VotingRegressor([
        ('lasso', lasso), 
        ('elastic', elastic), 
        ('rf', rf), 
        ('gbr', gbr), 
        ('svr', svr)
    ])
    
    # 5-fold 교차 검증으로 평가
    scores = cross_val_score(voting, X_train, y_train, cv=5, scoring=mae_scorer)
    return scores.mean()

# Optuna 최적화 실행
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(objective, n_trials=50)

# 최적 하이퍼파라미터로 VotingRegressor 학습
best_lasso = Lasso(alpha=study.best_params['lasso_alpha'], random_state=study.best_params['rf_random_state'])
best_elastic = ElasticNet(alpha=study.best_params['elastic_alpha'], l1_ratio=study.best_params['elastic_l1_ratio'], random_state=study.best_params['rf_random_state'])
best_rf = RandomForestRegressor(
    n_estimators=study.best_params['rf_n_estimators'],
    max_depth=study.best_params['rf_max_depth'],
    min_samples_split=study.best_params['rf_min_samples_split'],
    min_samples_leaf=study.best_params['rf_min_samples_leaf'],
    max_features=study.best_params['rf_max_features'],
    random_state=study.best_params['rf_random_state']
)
best_gbr = GradientBoostingRegressor(
    n_estimators=study.best_params['gbr_n_estimators'],
    max_depth=study.best_params['gbr_max_depth'],
    learning_rate=study.best_params['gbr_learning_rate'],
    random_state=study.best_params['rf_random_state']
)
best_svr = SVR(C=study.best_params['svr_C'], epsilon=study.best_params['svr_epsilon'])

voting_best = VotingRegressor([
    ('lasso', best_lasso), 
    ('elastic', best_elastic), 
    ('rf', best_rf), 
    ('gbr', best_gbr), 
    ('svr', best_svr)
])

# 최적화된 모델 학습 및 평가
voting_best.fit(X_train, y_train)
y_pred = voting_best.predict(X_test)

train_score = mean_absolute_error(y_train, voting_best.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'Optimized Voting Regressor - Train MAE: {train_score}, Test MAE: {test_score}')


[I 2024-10-28 21:22:48,959] A new study created in memory with name: no-name-26e42622-7d57-4acf-9c8a-9b52c9db0fa4
[I 2024-10-28 21:23:52,751] Trial 0 finished with value: 12.668818421658505 and parameters: {'lasso_alpha': 0.7508557547922238, 'elastic_alpha': 0.18176404786409991, 'elastic_l1_ratio': 0.7085931887315663, 'rf_n_estimators': 140, 'rf_max_depth': 16, 'rf_min_samples_split': 10, 'rf_min_samples_leaf': 10, 'rf_max_features': None, 'rf_random_state': 42, 'gbr_n_estimators': 115, 'gbr_max_depth': 16, 'gbr_learning_rate': 0.22951752749049986, 'svr_C': 3.77296433262724, 'svr_epsilon': 0.1567026538180402}. Best is trial 0 with value: 12.668818421658505.
[I 2024-10-28 21:25:15,906] Trial 1 finished with value: 12.640724457439282 and parameters: {'lasso_alpha': 0.2818801638962993, 'elastic_alpha': 0.08222816071645095, 'elastic_l1_ratio': 0.21768531184775386, 'rf_n_estimators': 237, 'rf_max_depth': 9, 'rf_min_samples_split': 3, 'rf_min_samples_leaf': 6, 'rf_max_features': 'sqrt', 'rf_

Optimized Voting Regressor - Train MAE: 11.779678761381117, Test MAE: 12.448669792907314


In [None]:
study.best_params

{'lasso_alpha': 0.5766799274859804,
 'elastic_alpha': 0.8221525283117354,
 'elastic_l1_ratio': 0.874529953459651,
 'rf_n_estimators': 50,
 'rf_max_depth': 14,
 'rf_min_samples_split': 5,
 'rf_min_samples_leaf': 1,
 'rf_max_features': 'log2',
 'rf_random_state': 64,
 'gbr_n_estimators': 238,
 'gbr_max_depth': 3,
 'gbr_learning_rate': 0.010172253051238662,
 'svr_C': 7.195062927565365,
 'svr_epsilon': 0.3590899010236214}

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 로드
train = pd.read_csv('./train.csv')

# yymm 컬럼을 날짜 형식으로 변환 (연도는 임의로 설정)
train['yymm'] = pd.to_datetime('2024' + train['yymm'], format='%Y%m%d %H:%M')

# day, hour, minute 컬럼 생성
train['day'] = train['yymm'].dt.day         # 일
train['hour'] = train['yymm'].dt.hour       # 시
train['minute'] = train['yymm'].dt.minute

# weekday 컬럼 생성
train['weekday'] = train['day'] % 7         # 요일 (0: 월요일, 1: 화요일, ..., 6: 일요일)

# yymm 컬럼 삭제
train.drop('yymm', axis=1, inplace=True)


# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
# {'lasso_alpha': 0.8422730237975117, 'elastic_alpha': 0.9491303377885107, 'elastic_l1_ratio': 0.7173804553247742, 'rf_n_estimators': 158, 'rf_max_depth': 9, 'rf_min_samples_split': 8, 'rf_min_samples_leaf': 4, 'rf_max_features': 'sqrt', 'rf_random_state': 45, 'gbr_n_estimators': 102, 'gbr_max_depth': 4, 'gbr_learning_rate': 0.018268910533320307, 'svr_C': 5.906266623527182, 'svr_epsilon': 0.6564267398355691}
lasso = Lasso(alpha=0.5766799274859804, random_state=64)
elastic = ElasticNet(alpha=0.8221525283117354, l1_ratio=0.874529953459651, random_state=64)
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=14,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='log2',
    random_state=64
)
gbr = GradientBoostingRegressor(
    n_estimators=238,
    max_depth=3,
    learning_rate=0.010172253051238662,
    random_state=64
)
svr = SVR(C=7.195062927565365, epsilon=0.3590899010236214)

voting = VotingRegressor([
    ('lasso', lasso), 
    ('elastic', elastic), 
    ('rf', rf), 
    ('gbr', gbr), 
    ('svr', svr)
])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
scores = cross_val_score(voting, X, y, cv=5, scoring=mae_scorer)

voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)

train_score = mean_absolute_error(y_train, voting.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'{scores.mean()} / {train_score} / {test_score}')

12.543629280213713 / 11.779678761381117 / 12.448669792907314


In [None]:
study.best_params

{'lasso_alpha': 0.8422730237975117,
 'elastic_alpha': 0.9491303377885107,
 'elastic_l1_ratio': 0.7173804553247742,
 'rf_n_estimators': 158,
 'rf_max_depth': 9,
 'rf_min_samples_split': 8,
 'rf_min_samples_leaf': 4,
 'rf_max_features': 'sqrt',
 'rf_random_state': 45,
 'gbr_n_estimators': 102,
 'gbr_max_depth': 4,
 'gbr_learning_rate': 0.018268910533320307,
 'svr_C': 5.906266623527182,
 'svr_epsilon': 0.6564267398355691}

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 로드
train = pd.read_csv('./train.csv')

# yymm 컬럼을 날짜 형식으로 변환 (연도는 임의로 설정)
train['yymm'] = pd.to_datetime('2024' + train['yymm'], format='%Y%m%d %H:%M')

# day, hour, minute 컬럼 생성
train['day'] = train['yymm'].dt.day         # 일
train['hour'] = train['yymm'].dt.hour       # 시
train['minute'] = train['yymm'].dt.minute

# weekday 컬럼 생성
train['weekday'] = train['day'] % 7         # 요일 (0: 월요일, 1: 화요일, ..., 6: 일요일)

# yymm 컬럼 삭제
train.drop('yymm', axis=1, inplace=True)


# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
# {'lasso_alpha': 0.8422730237975117, 'elastic_alpha': 0.9491303377885107, 'elastic_l1_ratio': 0.7173804553247742, 'rf_n_estimators': 158, 'rf_max_depth': 9, 'rf_min_samples_split': 8, 'rf_min_samples_leaf': 4, 'rf_max_features': 'sqrt', 'rf_random_state': 45, 'gbr_n_estimators': 102, 'gbr_max_depth': 4, 'gbr_learning_rate': 0.018268910533320307, 'svr_C': 5.906266623527182, 'svr_epsilon': 0.6564267398355691}

lasso = Lasso(alpha=0.8422730237975117, random_state=45)
elastic = ElasticNet(alpha=0.9491303377885107, l1_ratio=0.7173804553247742, random_state=45)
rf = RandomForestRegressor(
    n_estimators=158,
    max_depth=9,
    min_samples_split=8,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=45
)
gbr = GradientBoostingRegressor(
    n_estimators=102,
    max_depth=4,
    learning_rate=0.018268910533320307,
    random_state=45
)
svr = SVR(C=5.906266623527182, epsilon=0.6564267398355691)

voting = VotingRegressor([
    ('lasso', lasso), 
    ('elastic', elastic), 
    ('rf', rf), 
    ('gbr', gbr), 
    ('svr', svr)
])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
scores = cross_val_score(voting, X, y, cv=5, scoring=mae_scorer)

voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)

train_score = mean_absolute_error(y_train, voting.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'{scores.mean()} / {train_score} / {test_score}')

12.54172378378728 / 12.12218583237527 / 12.458393952008992


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
lr = LinearRegression()
ridge = Ridge(random_state=42)
lasso = Lasso(random_state=42)
elastic = ElasticNet(random_state=42)
svr = SVR()
gb = GradientBoostingRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
lgbm = LGBMRegressor(random_state=42)
dt = DecisionTreeRegressor(random_state=42)

voting1 = VotingRegressor([
    ('lr', lr),
    ('lasso', lasso),
    ('elastic', elastic),
    ('random_forest', rf),
    ('xgb', xgb),
    ('lgbm', lgbm)
])

voting2 = VotingRegressor([
    ('lasso', lasso),
    ('random_forest', rf),
    ('xgb', xgb),
    ('lgbm', lgbm)
])

voting3 = VotingRegressor([
    ('svr', svr),
    ('random_forest', rf),
    ('xgb', xgb),
])

models = {
    'Voting1': voting1,
    'Voting2': voting2,
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
for model_name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring=mae_scorer)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    train_score = mean_absolute_error(y_train, model.predict(X_train))
    test_score = mean_absolute_error(y_test, y_pred)

    print(f'{model_name}: {scores.mean()} / {train_score} / {test_score}')

Voting1: 12.60688779355655 / 9.123350168815875 / 12.59916436989187
Voting2: 12.671473568583895 / 7.484227059334714 / 12.807629368722674


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
lr = LinearRegression()
ridge = Ridge(random_state=42)
lasso = Lasso(random_state=42)
elastic = ElasticNet(random_state=42)
svr = SVR()
gb = GradientBoostingRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
lgbm = LGBMRegressor(random_state=42)
dt = DecisionTreeRegressor(random_state=42)

voting1 = VotingRegressor([
    ('linear', lr),
    ('ridge', ridge),
    ('lasso', lasso)
])

voting2 = VotingRegressor([
    ('linear', lr),
    ('lasso', lasso),
    ('elastic', elastic)
])

voting3 = VotingRegressor([
    ('ridge', ridge),
    ('lasso', lasso),
    ('elastic', elastic)
])

models = {
    'Voting1': voting1,
    'Voting2': voting2,
    'Voiting3': voting3
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
for model_name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring=mae_scorer)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    train_score = mean_absolute_error(y_train, model.predict(X_train))
    test_score = mean_absolute_error(y_test, y_pred)

    print(f'{model_name}: {scores.mean()} / {train_score} / {test_score}')

Voting1: 12.577860675814023 / 12.520992571007 / 12.46450337287879
Voting2: 12.556600719190508 / 12.531942285375514 / 12.466188836570359
Voiting3: 12.556628895007307 / 12.531972435513197 / 12.466226516010302


In [None]:
import optuna
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error
from optuna.samplers import TPESampler

# 데이터 분할
X = train.drop('Target', axis=1)  # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']               # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# Optuna 목적 함수 정의
def objective(trial):
    # 하이퍼파라미터 선택
    lasso_alpha = trial.suggest_float('lasso_alpha', 0.001, 1.0)
    elastic_alpha = trial.suggest_float('elastic_alpha', 0.001, 1.0)
    elastic_l1_ratio = trial.suggest_float('elastic_l1_ratio', 0.1, 0.9)
    
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 300)
    rf_max_depth = trial.suggest_int('rf_max_depth', 3, 20)
    rf_min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 10)
    rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 1, 10)
    rf_max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None])
    rf_random_state = trial.suggest_int('rf_random_state', 1, 100)

    # 모델 정의
    lasso = Lasso(alpha=lasso_alpha, random_state=rf_random_state)
    elastic = ElasticNet(alpha=elastic_alpha, l1_ratio=elastic_l1_ratio, random_state=rf_random_state)
    rf = RandomForestRegressor(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        min_samples_leaf=rf_min_samples_leaf,
        max_features=rf_max_features,
        random_state=rf_random_state
    )
    
    # VotingRegressor 정의
    voting = VotingRegressor([('lasso', lasso), ('elastic', elastic), ('rf', rf)])
    
    # 5-fold 교차 검증으로 평가
    scores = cross_val_score(voting, X_train, y_train, cv=5, scoring=mae_scorer)
    return scores.mean()

# Optuna 최적화 실행
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(objective, n_trials=50)

# 최적 하이퍼파라미터 출력
print("Best parameters:", study.best_params)

# 최적 하이퍼파라미터로 VotingRegressor 학습
best_lasso = Lasso(alpha=study.best_params['lasso_alpha'], random_state=study.best_params['rf_random_state'])
best_elastic = ElasticNet(alpha=study.best_params['elastic_alpha'], l1_ratio=study.best_params['elastic_l1_ratio'], random_state=study.best_params['rf_random_state'])
best_rf = RandomForestRegressor(
    n_estimators=study.best_params['rf_n_estimators'],
    max_depth=study.best_params['rf_max_depth'],
    min_samples_split=study.best_params['rf_min_samples_split'],
    min_samples_leaf=study.best_params['rf_min_samples_leaf'],
    max_features=study.best_params['rf_max_features'],
    random_state=study.best_params['rf_random_state']
)

voting_best = VotingRegressor([('lasso', best_lasso), ('elastic', best_elastic), ('rf', best_rf)])

# 최적화된 모델 학습 및 평가
voting_best.fit(X_train, y_train)
y_pred = voting_best.predict(X_test)

train_score = mean_absolute_error(y_train, voting_best.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'Optimized Voting Regressor - Train MAE: {train_score}, Test MAE: {test_score}')

[I 2024-10-28 13:23:35,227] A new study created in memory with name: no-name-55bbf7b2-8107-4134-a95d-4407507fe174
[I 2024-10-28 13:23:39,012] Trial 0 finished with value: 12.586220220865986 and parameters: {'lasso_alpha': 0.04951247206224487, 'elastic_alpha': 0.6863197688524071, 'elastic_l1_ratio': 0.7362861256359688, 'rf_n_estimators': 288, 'rf_max_depth': 6, 'rf_min_samples_split': 8, 'rf_min_samples_leaf': 5, 'rf_max_features': 'log2', 'rf_random_state': 81}. Best is trial 0 with value: 12.586220220865986.
[I 2024-10-28 13:23:50,850] Trial 1 finished with value: 12.617340948074608 and parameters: {'lasso_alpha': 0.17673415289192013, 'elastic_alpha': 0.05975465535802682, 'elastic_l1_ratio': 0.3249854222921561, 'rf_n_estimators': 100, 'rf_max_depth': 12, 'rf_min_samples_split': 5, 'rf_min_samples_leaf': 3, 'rf_max_features': None, 'rf_random_state': 54}. Best is trial 0 with value: 12.586220220865986.
[I 2024-10-28 13:23:53,483] Trial 2 finished with value: 12.617915116161118 and para

Best parameters: {'lasso_alpha': 0.7000364112801918, 'elastic_alpha': 0.9737082848325345, 'elastic_l1_ratio': 0.8968219831995436, 'rf_n_estimators': 50, 'rf_max_depth': 8, 'rf_min_samples_split': 4, 'rf_min_samples_leaf': 10, 'rf_max_features': None, 'rf_random_state': 31}
Optimized Voting Regressor - Train MAE: 12.19373132878114, Test MAE: 12.494546429601435


In [None]:
import optuna
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error
from optuna.samplers import TPESampler

# 데이터 분할
X = train.drop('Target', axis=1)  # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']               # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# Optuna 목적 함수 정의
def objective(trial):
    # 하이퍼파라미터 선택
    lasso_alpha = trial.suggest_float('lasso_alpha', 0.001, 1.0)
    elastic_alpha = trial.suggest_float('elastic_alpha', 0.001, 1.0)
    elastic_l1_ratio = trial.suggest_float('elastic_l1_ratio', 0.1, 0.9)
    
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 300)
    rf_max_depth = trial.suggest_int('rf_max_depth', 3, 20)
    rf_min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 10)
    rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 1, 10)
    rf_max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None])
    rf_random_state = trial.suggest_int('rf_random_state', 1, 100)

    # 모델 정의
    lasso = Lasso(alpha=lasso_alpha, random_state=rf_random_state)
    elastic = ElasticNet(alpha=elastic_alpha, l1_ratio=elastic_l1_ratio, random_state=rf_random_state)
    rf = RandomForestRegressor(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        min_samples_leaf=rf_min_samples_leaf,
        max_features=rf_max_features,
        random_state=rf_random_state
    )
    
    # VotingRegressor 정의
    voting = VotingRegressor([('lasso', lasso), ('elastic', elastic), ('rf', rf)])
    
    # 5-fold 교차 검증으로 평가
    scores = cross_val_score(voting, X_train, y_train, cv=5, scoring=mae_scorer)
    return scores.mean()

# Optuna 최적화 실행
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(objective, n_trials=100)

# 최적 하이퍼파라미터 출력
print("Best parameters:", study.best_params)

# 최적 하이퍼파라미터로 VotingRegressor 학습
best_lasso = Lasso(alpha=study.best_params['lasso_alpha'], random_state=study.best_params['rf_random_state'])
best_elastic = ElasticNet(alpha=study.best_params['elastic_alpha'], l1_ratio=study.best_params['elastic_l1_ratio'], random_state=study.best_params['rf_random_state'])
best_rf = RandomForestRegressor(
    n_estimators=study.best_params['rf_n_estimators'],
    max_depth=study.best_params['rf_max_depth'],
    min_samples_split=study.best_params['rf_min_samples_split'],
    min_samples_leaf=study.best_params['rf_min_samples_leaf'],
    max_features=study.best_params['rf_max_features'],
    random_state=study.best_params['rf_random_state']
)

voting_best = VotingRegressor([('lasso', best_lasso), ('elastic', best_elastic), ('rf', best_rf)])

# 최적화된 모델 학습 및 평가
voting_best.fit(X_train, y_train)
y_pred = voting_best.predict(X_test)

train_score = mean_absolute_error(y_train, voting_best.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'Optimized Voting Regressor - Train MAE: {train_score}, Test MAE: {test_score}')

[I 2024-10-28 22:21:53,611] A new study created in memory with name: no-name-2990cbde-0f07-43ff-b785-99d17b876074
[I 2024-10-28 22:22:13,997] Trial 0 finished with value: 12.590844731075222 and parameters: {'lasso_alpha': 0.8757630761685458, 'elastic_alpha': 0.12894514214567518, 'elastic_l1_ratio': 0.38016502778259376, 'rf_n_estimators': 160, 'rf_max_depth': 12, 'rf_min_samples_split': 6, 'rf_min_samples_leaf': 5, 'rf_max_features': None, 'rf_random_state': 17}. Best is trial 0 with value: 12.590844731075222.
[I 2024-10-28 22:22:17,664] Trial 1 finished with value: 12.60707082642242 and parameters: {'lasso_alpha': 0.510540943071692, 'elastic_alpha': 0.251619491346435, 'elastic_l1_ratio': 0.464051899651561, 'rf_n_estimators': 139, 'rf_max_depth': 19, 'rf_min_samples_split': 8, 'rf_min_samples_leaf': 5, 'rf_max_features': 'log2', 'rf_random_state': 6}. Best is trial 0 with value: 12.590844731075222.
[I 2024-10-28 22:22:34,206] Trial 2 finished with value: 12.58499047145801 and parameters

Best parameters: {'lasso_alpha': 0.8972848387122385, 'elastic_alpha': 0.9927458081241716, 'elastic_l1_ratio': 0.7864933943439321, 'rf_n_estimators': 201, 'rf_max_depth': 8, 'rf_min_samples_split': 10, 'rf_min_samples_leaf': 1, 'rf_max_features': 'sqrt', 'rf_random_state': 28}
Optimized Voting Regressor - Train MAE: 12.187420114063967, Test MAE: 12.47436399418055


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
# {'lasso_alpha': 0.8972848387122385, 'elastic_alpha': 0.9927458081241716, 'elastic_l1_ratio': 0.7864933943439321, 'rf_n_estimators': 201, 'rf_max_depth': 8, 'rf_min_samples_split': 10, 'rf_min_samples_leaf': 1, 'rf_max_features': 'sqrt', 'rf_random_state': 28}
lasso = Lasso(alpha=0.8972848387122385, random_state=28)
elastic = ElasticNet(alpha=0.9927458081241716, l1_ratio=0.7864933943439321, random_state=28)
rf = RandomForestRegressor(
    n_estimators=201,
    max_depth=8,
    min_samples_split=10,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=28
)

voting = VotingRegressor([('lasso', lasso), ('elastic', elastic), ('rf', rf)])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
scores = cross_val_score(voting, X, y, cv=5, scoring=mae_scorer)

voting.fit(X_train, y_train)
y_pred = model.predict(X_test)

train_score = mean_absolute_error(y_train, voting.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'{scores.mean()} / {train_score} / {test_score}')

12.544782819689578 / 12.187420114063967 / 12.807629368722674


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
lasso = Lasso(alpha=0.7000364112801918, random_state=31)
elastic = ElasticNet(alpha=0.9737082848325345, l1_ratio=0.8968219831995436, random_state=31)
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=8,
    min_samples_split=4,
    min_samples_leaf=10,
    max_features=None,
    random_state=31
)

voting = VotingRegressor([('lasso', lasso), ('elastic', elastic), ('rf', rf)])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
scores = cross_val_score(voting, X, y, cv=5, scoring=mae_scorer)

voting.fit(X_train, y_train)
y_pred = model.predict(X_test)

train_score = mean_absolute_error(y_train, voting.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'{scores.mean()} / {train_score} / {test_score}')

12.541153673973747 / 12.19373132878114 / 12.466226516010302


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
# {'lasso_alpha': 0.6779122520600722, 'elastic_alpha': 0.9971090599987921, 'elastic_l1_ratio': 0.44498452438450353, 'rf_n_estimators': 168, 'rf_max_depth': 6, 'rf_min_samples_split': 2, 'rf_min_samples_leaf': 1, 'rf_max_features': None, 'rf_random_state': 30}
lasso = Lasso(alpha=0.6779122520600722, random_state=30)
elastic = ElasticNet(alpha=0.9971090599987921, l1_ratio=0.44498452438450353, random_state=30)
rf = RandomForestRegressor(
    n_estimators=168,
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    random_state=30
)

voting = VotingRegressor([
    ('lasso', lasso),
    ('elastic', elastic),
    ('rf', rf)
])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
scores = cross_val_score(voting, X, y, cv=5, scoring=mae_scorer)

voting.fit(X_train, y_train)
y_pred = model.predict(X_test)

train_score = mean_absolute_error(y_train, voting.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'{scores.mean()} / {train_score} / {test_score}')

12.539876272275201 / 12.27021820379983 / 12.466226516010302


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error
import pandas as pd

# 데이터 분할
X = train[['V7', 'V17', 'V20', 'V21', 'V24']]    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
lasso = Lasso(alpha=0.6779122520600722, random_state=30)
elastic = ElasticNet(alpha=0.9971090599987921, l1_ratio=0.44498452438450353, random_state=30)
rf = RandomForestRegressor(
    n_estimators=168,
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    random_state=30
)

voting = VotingRegressor([
    ('lasso', lasso),
    ('elastic', elastic),
    ('rf', rf)
])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
scores = cross_val_score(voting, X, y, cv=5, scoring=mae_scorer)

# VotingRegressor 학습
voting.fit(X_train, y_train)

# 테스트 데이터에 대해 예측 수행
y_pred = voting.predict(X_test)

# 훈련 및 테스트 데이터의 MAE 계산
train_score = mean_absolute_error(y_train, voting.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

# 결과 출력
print(f'Cross-validated MAE: {scores.mean()}, Train MAE: {train_score}, Test MAE: {test_score}')


Cross-validated MAE: 12.53065715635401, Train MAE: 12.306957102252456, Test MAE: 12.46857108565719


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
# {'lasso_alpha': 0.9858784552601979, 'elastic_alpha': 0.9755023294378308, 'elastic_l1_ratio': 0.8955218082984115, 'rf_n_estimators': 51, 'rf_max_depth': 7, 'rf_min_samples_split': 9, 'rf_min_samples_leaf': 3, 'rf_max_features': None, 'rf_random_state': 49}

lasso = Lasso(alpha=0.9858784552601979, random_state=49)
elastic = ElasticNet(alpha=0.9755023294378308, l1_ratio=0.8955218082984115, random_state=49)
rf = RandomForestRegressor(
    n_estimators=51,
    max_depth=7,
    min_samples_split=9,
    min_samples_leaf=3,
    max_features=None,
    random_state=49
)


voting = VotingRegressor([
    ('lasso', lasso),
    ('elastic', elastic),
    ('rf', rf)
])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
scores = cross_val_score(voting, X, y, cv=5, scoring=mae_scorer)

voting.fit(X_train, y_train)
y_pred = model.predict(X_test)

train_score = mean_absolute_error(y_train, voting.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'{scores.mean()} / {train_score} / {test_score}')

12.544548093688368 / 12.202605499643731 / 12.466226516010302


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
{'lasso_alpha': 0.87975180300022, 'elastic_alpha': 0.9583534291840334, 'elastic_l1_ratio': 0.7329161446530024, 'ridge_alpha': 0.583133995768023, 'gb_learning_rate': 0.05843394170060385, 'gb_n_estimators': 66, 'rf_n_estimators': 285, 'rf_max_depth': 6, 'rf_min_samples_split': 8, 'rf_min_samples_leaf': 10, 'rf_max_features': None, 'rf_random_state': 49}

lr = LinearRegression()
lasso = Lasso(alpha=0.87975180300022, random_state=49)
elastic = ElasticNet(alpha=0.9583534291840334, l1_ratio=0.7329161446530024, random_state=49)
ridge = Ridge(alpha=0.583133995768023, random_state=49)
rf = RandomForestRegressor(
    n_estimators=285,
    max_depth=6,
    min_samples_split=8,
    min_samples_leaf=10,
    max_features=None,
    random_state=49
)
gb = GradientBoostingRegressor(
    n_estimators=66,
    learning_rate=0.05843394170060385,
    random_state=49
)

voting_best = VotingRegressor([
    ('lr', lr),
    ('lasso', lasso), 
    ('elastic', elastic), 
    ('rf', rf),
    ('ridge', ridge),
    ('gb', gb)
])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
scores = cross_val_score(voting, X, y, cv=5, scoring=mae_scorer)

voting.fit(X_train, y_train)
y_pred = model.predict(X_test)

train_score = mean_absolute_error(y_train, voting.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'{scores.mean()} / {train_score} / {test_score}')

12.543629280213713 / 11.779678761381117 / 12.807629368722674


In [None]:
import optuna
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, ElasticNet, Ridge, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error
from optuna.samplers import TPESampler
import pandas as pd
import numpy as np  

# 데이터 로드
train = pd.read_csv('./train.csv')

# yymm 컬럼을 날짜 형식으로 변환 (연도는 임의로 설정)
train['yymm'] = pd.to_datetime('2024' + train['yymm'], format='%Y%m%d %H:%M')

# day, hour, minute 컬럼 생성
train['day'] = train['yymm'].dt.day         # 일
train['hour'] = train['yymm'].dt.hour       # 시
train['minute'] = train['yymm'].dt.minute   # 분

# weekday 컬럼 생성
train['weekday'] = train['day'] % 7         # 요일 (0: 월요일, 1: 화요일, ..., 6: 일요일)

# yymm 컬럼 삭제
train.drop('yymm', axis=1, inplace=True)

# 데이터 분할
X = train.drop('Target', axis=1)  # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']               # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# Optuna 목적 함수 정의
def objective(trial):
    # 하이퍼파라미터 선택
    lasso_alpha = trial.suggest_float('lasso_alpha', 0.001, 1.0)
    elastic_alpha = trial.suggest_float('elastic_alpha', 0.001, 1.0)
    elastic_l1_ratio = trial.suggest_float('elastic_l1_ratio', 0.1, 0.9)
    gb_learning_rate = trial.suggest_float('gb_learning_rate', 0.01, 0.3)
    gb_n_estimators = trial.suggest_int('gb_n_estimators', 50, 300)

    rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 300)
    rf_max_depth = trial.suggest_int('rf_max_depth', 3, 20)
    rf_min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 10)
    rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 1, 10)
    rf_max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None])
    rf_random_state = trial.suggest_int('rf_random_state', 1, 100)

    # 모델 정의
    lasso = Lasso(alpha=lasso_alpha, random_state=rf_random_state)
    elastic = ElasticNet(alpha=elastic_alpha, l1_ratio=elastic_l1_ratio, random_state=rf_random_state)
    rf = RandomForestRegressor(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        min_samples_leaf=rf_min_samples_leaf,
        max_features=rf_max_features,
        random_state=rf_random_state
    )
    gb = GradientBoostingRegressor(
        learning_rate=gb_learning_rate,
        n_estimators=gb_n_estimators,
        random_state=rf_random_state
    )

    # VotingRegressor 정의
    voting = VotingRegressor([
        ('lasso', lasso), 
        ('elastic', elastic), 
        ('rf', rf),
        ('gb', gb)
    ])

    # 5-fold 교차 검증으로 평가
    scores = cross_val_score(voting, X_train, y_train, cv=5, scoring=mae_scorer)
    return scores.mean()

# Optuna 최적화 실행
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(objective, n_trials=50)

# 최적 하이퍼파라미터 출력
print("Best parameters:", study.best_params)

# 최적 하이퍼파라미터로 VotingRegressor 학습
best_lasso = Lasso(alpha=study.best_params['lasso_alpha'], random_state=study.best_params['rf_random_state'])
best_elastic = ElasticNet(alpha=study.best_params['elastic_alpha'], l1_ratio=study.best_params['elastic_l1_ratio'], random_state=study.best_params['rf_random_state'])
best_rf = RandomForestRegressor(
    n_estimators=study.best_params['rf_n_estimators'],
    max_depth=study.best_params['rf_max_depth'],
    min_samples_split=study.best_params['rf_min_samples_split'],
    min_samples_leaf=study.best_params['rf_min_samples_leaf'],
    max_features=study.best_params['rf_max_features'],
    random_state=study.best_params['rf_random_state']
)
best_gb = GradientBoostingRegressor(
    learning_rate=study.best_params['gb_learning_rate'],
    n_estimators=study.best_params['gb_n_estimators'],
    random_state=study.best_params['rf_random_state']
)

voting_best = VotingRegressor([
    ('lasso', best_lasso), 
    ('elastic', best_elastic), 
    ('rf', best_rf),
    ('gb', best_gb)
])

# 최적화된 모델 학습 및 평가
voting_best.fit(X_train, y_train)
y_pred = voting_best.predict(X_test)

train_score = mean_absolute_error(y_train, voting_best.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'Optimized Voting Regressor - Train MAE: {train_score}, Test MAE: {test_score}')


[I 2024-10-28 22:52:58,442] A new study created in memory with name: no-name-9fff95a7-2366-4a48-830b-e9ee27d8a1dc
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-10-28 22:54:14,329] Trial 0 finished with value: 12.633878099651655 and parameters: {'lasso_alpha': 0.007675941735660297, 'elastic_alpha': 0.10412828008128282, 'elastic_l1_ratio': 0.2885626358797505, 'gb_learning_rate': 0.09807596150569996, 'gb_n_estimators': 123, 'rf_n_estimators': 300, 'rf_max_depth': 18, 'rf_min_samples_split': 9, 'rf_min_samples_leaf': 1, 'rf_max_features': None, 'rf_random_state': 91}. Best is trial 0 with value: 12.633878099651655.
[I 2024-10-28 22:54:27,310] Trial 1 finished with value: 12.593232421483597 and parameters: {'lasso_alpha': 0.8327691687821464, 'elastic_alpha': 0.3527374710046569, 'elastic_l1_ratio': 0.47849157781935536, 'gb_learning_rate': 0.118394272542097, 'g

Best parameters: {'lasso_alpha': 0.8047578403183432, 'elastic_alpha': 0.7797332527857485, 'elastic_l1_ratio': 0.4439560000993694, 'gb_learning_rate': 0.011161577541366336, 'gb_n_estimators': 194, 'rf_n_estimators': 187, 'rf_max_depth': 7, 'rf_min_samples_split': 10, 'rf_min_samples_leaf': 8, 'rf_max_features': 'sqrt', 'rf_random_state': 73}
Optimized Voting Regressor - Train MAE: 12.281636966341262, Test MAE: 12.478376207295785


In [None]:
import optuna
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, ElasticNet, Ridge, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error
from optuna.samplers import TPESampler
import pandas as pd
import numpy as np

# 데이터 로드
train = pd.read_csv('./train.csv')

# yymm 컬럼을 날짜 형식으로 변환 (연도는 임의로 설정)
train['yymm'] = pd.to_datetime('2024' + train['yymm'], format='%Y%m%d %H:%M')

# day, hour, minute 컬럼 생성
train['day'] = train['yymm'].dt.day         # 일
train['hour'] = train['yymm'].dt.hour       # 시
train['minute'] = train['yymm'].dt.minute   # 분

# weekday 컬럼 생성
train['weekday'] = train['day'] % 7         # 요일 (0: 월요일, 1: 화요일, ..., 6: 일요일)

# yymm 컬럼 삭제
train.drop('yymm', axis=1, inplace=True)

# 데이터 분할
X = train.drop('Target', axis=1)  # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']               # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# Optuna 목적 함수 정의
def objective(trial):
    # 하이퍼파라미터 선택
    lasso_alpha = trial.suggest_float('lasso_alpha', 0.001, 1.0)
    elastic_alpha = trial.suggest_float('elastic_alpha', 0.001, 1.0)
    elastic_l1_ratio = trial.suggest_float('elastic_l1_ratio', 0.1, 0.9)
    ridge_alpha = trial.suggest_float('ridge_alpha', 0.001, 1.0)
    gb_learning_rate = trial.suggest_float('gb_learning_rate', 0.01, 0.3)
    gb_n_estimators = trial.suggest_int('gb_n_estimators', 50, 300)

    rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 300)
    rf_max_depth = trial.suggest_int('rf_max_depth', 3, 20)
    rf_min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 10)
    rf_min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 1, 10)
    rf_max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None])
    rf_random_state = trial.suggest_int('rf_random_state', 1, 100)

    # 모델 정의
    lasso = Lasso(alpha=lasso_alpha, random_state=rf_random_state)
    elastic = ElasticNet(alpha=elastic_alpha, l1_ratio=elastic_l1_ratio, random_state=rf_random_state)
    rf = RandomForestRegressor(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        min_samples_split=rf_min_samples_split,
        min_samples_leaf=rf_min_samples_leaf,
        max_features=rf_max_features,
        random_state=rf_random_state
    )
    ridge = Ridge(alpha=ridge_alpha, random_state=rf_random_state)
    linear = LinearRegression()
    gb = GradientBoostingRegressor(
        learning_rate=gb_learning_rate,
        n_estimators=gb_n_estimators,
        random_state=rf_random_state
    )

    # VotingRegressor 정의
    voting = VotingRegressor([
        ('lasso', lasso),
        ('elastic', elastic),
        ('rf', rf),
        ('ridge', ridge),
        ('linear', linear),
        ('gb', gb)
    ])

    # 5-fold 교차 검증으로 평가
    scores = cross_val_score(voting, X_train, y_train, cv=5, scoring=mae_scorer)
    return scores.mean()

# Optuna 최적화 실행
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(objective, n_trials=100)

# 최적 하이퍼파라미터 출력
print("Best parameters:", study.best_params)

# 최적 하이퍼파라미터로 VotingRegressor 학습
best_lasso = Lasso(alpha=study.best_params['lasso_alpha'], random_state=study.best_params['rf_random_state'])
best_elastic = ElasticNet(alpha=study.best_params['elastic_alpha'], l1_ratio=study.best_params['elastic_l1_ratio'], random_state=study.best_params['rf_random_state'])
best_rf = RandomForestRegressor(
    n_estimators=study.best_params['rf_n_estimators'],
    max_depth=study.best_params['rf_max_depth'],
    min_samples_split=study.best_params['rf_min_samples_split'],
    min_samples_leaf=study.best_params['rf_min_samples_leaf'],
    max_features=study.best_params['rf_max_features'],
    random_state=study.best_params['rf_random_state']
)
best_ridge = Ridge(alpha=study.best_params['ridge_alpha'], random_state=study.best_params['rf_random_state'])
best_linear = LinearRegression()
best_gb = GradientBoostingRegressor(
    learning_rate=study.best_params['gb_learning_rate'],
    n_estimators=study.best_params['gb_n_estimators'],
    random_state=study.best_params['rf_random_state']
)

voting_best = VotingRegressor([
    ('lasso', best_lasso),
    ('elastic', best_elastic),
    ('rf', best_rf),
    ('ridge', best_ridge),
    ('linear', best_linear),
    ('gb', best_gb)
])

# 최적화된 모델 학습 및 평가
voting_best.fit(X_train, y_train)
y_pred = voting_best.predict(X_test)

train_score = mean_absolute_error(y_train, voting_best.predict(X_train))
test_score = mean_absolute_error(y_test, y_pred)

print(f'Optimized Voting Regressor - Train MAE: {train_score}, Test MAE: {test_score}')


[I 2024-10-28 12:11:49,251] A new study created in memory with name: no-name-0546f345-4536-4881-b545-324ce11e5053
[I 2024-10-28 12:12:20,382] Trial 0 finished with value: 12.626233293414014 and parameters: {'lasso_alpha': 0.3146695602448273, 'elastic_alpha': 0.5293019322114868, 'elastic_l1_ratio': 0.8176425539673525, 'ridge_alpha': 0.2962320642151984, 'gb_learning_rate': 0.29314543313780117, 'gb_n_estimators': 280, 'rf_n_estimators': 100, 'rf_max_depth': 8, 'rf_min_samples_split': 10, 'rf_min_samples_leaf': 2, 'rf_max_features': 'sqrt', 'rf_random_state': 21}. Best is trial 0 with value: 12.626233293414014.
[I 2024-10-28 12:12:38,672] Trial 1 finished with value: 12.609652298942104 and parameters: {'lasso_alpha': 0.9912171900047738, 'elastic_alpha': 0.5178158367809479, 'elastic_l1_ratio': 0.3772104572016993, 'ridge_alpha': 0.8449731157994221, 'gb_learning_rate': 0.27506844882298337, 'gb_n_estimators': 70, 'rf_n_estimators': 250, 'rf_max_depth': 18, 'rf_min_samples_split': 10, 'rf_min_s

Best parameters: {'lasso_alpha': 0.87975180300022, 'elastic_alpha': 0.9583534291840334, 'elastic_l1_ratio': 0.7329161446530024, 'ridge_alpha': 0.583133995768023, 'gb_learning_rate': 0.05843394170060385, 'gb_n_estimators': 66, 'rf_n_estimators': 285, 'rf_max_depth': 6, 'rf_min_samples_split': 8, 'rf_min_samples_leaf': 10, 'rf_max_features': None, 'rf_random_state': 49}
Optimized Voting Regressor - Train MAE: 12.32637182447851, Test MAE: 12.471958708717917


## One-Hot + PCA Data

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# 데이터 로드
train = pd.read_csv('./train.csv')

# yymm 컬럼을 날짜 형식으로 변환 (연도는 임의로 설정)
train['yymm'] = pd.to_datetime('2024' + train['yymm'], format='%Y%m%d %H:%M')

# day, hour, minute, weekday 컬럼 생성
day = train['yymm'].dt.day
day_dummies = pd.get_dummies(day, prefix='day')

hour = train['yymm'].dt.hour
hour_dummies = pd.get_dummies(hour, prefix='hour')

minute = train['yymm'].dt.minute
minute_dummies = pd.get_dummies(minute, prefix='minute')

# weekday 컬럼 생성
weekday = day % 7
weekday = weekday.map({0:'Mon', 1:'Tue', 2:'Wed', 3:'Thu', 4:'Fri', 5:'Sat', 6:'Sun'})
weekday_dummies = pd.get_dummies(weekday)

# PCA 피처 생성
features = train.loc[:, 'V1':'V26'] # V1 ~ V26 컬럼 선택

scaler = StandardScaler()
features = scaler.fit_transform(features) # 피처 표준화

pca = PCA(n_components=3)
pca_features = pca.fit_transform(features) # PCA 피처 생성

# 생성된 주성분을 DataFrame으로 변환
pca_columns = ['PCA1', 'PCA2', 'PCA3']
pca_df = pd.DataFrame(pca_features, columns=pca_columns)

# 원본 데이터와 PCA 피처 결합
train = pd.concat([train, day_dummies, hour_dummies, minute_dummies, weekday_dummies, pca_df], axis=1)

# yymm 컬럼 삭제
train.drop('yymm', axis=1, inplace=True)

# 결과 출력
train.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,Target,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,...,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,minute_0,minute_10,minute_20,minute_30,minute_40,minute_50,Fri,Mon,Sat,Sun,Thu,Tue,Wed,PCA1,PCA2,PCA3
0,-5.327,12.25,-3.294,-7.855,-1.196,13.824,-10.249,-3.04,-5.17,8.077,16.198,-1.101,-0.067,-8.412,-0.592,-4.153,23.669,0.103,1.001,-5.861,-27.695,-9.978,-2.689,-0.951,-3.873,0.471,44.521,True,False,False,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,-2.616525,-2.341176,-3.016174
1,-5.267,12.916,-3.22,-7.788,-1.196,14.424,-10.249,-3.04,-4.97,8.027,16.198,-1.168,-0.067,-8.532,-0.592,-4.079,21.669,0.073,0.935,-5.881,-37.695,-10.038,-2.652,-1.018,-3.503,0.361,35.027,True,False,False,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,-2.691167,-2.187795,-3.000616
2,-5.127,13.583,-3.13,-7.658,-1.196,15.081,-10.359,-3.04,-4.83,7.977,16.198,-1.168,-0.067,-8.642,-0.665,-3.953,19.669,0.013,0.905,-5.891,-37.695,-10.001,-2.652,-1.051,-3.436,0.361,13.92,True,False,False,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,-2.724164,-2.076864,-2.954282
3,-5.06,14.25,-3.13,-7.532,-1.196,14.961,-10.359,-3.04,-4.83,7.927,26.198,-1.168,-0.067,-8.762,-0.592,-3.953,17.669,-0.02,0.845,-5.911,-37.695,-10.028,-2.552,-1.111,-3.346,0.261,28.41,True,False,False,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,-2.744811,-2.076954,-2.864929
4,-4.967,14.916,-3.094,-7.462,-1.196,15.454,-10.359,-3.04,-4.97,7.877,16.198,-1.168,-0.067,-8.882,-0.629,-3.916,15.669,-0.087,0.811,-5.931,-37.695,-10.111,-2.619,-1.141,-3.346,0.261,1.647,True,False,False,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,-2.827777,-1.960836,-2.942832
5,-4.967,15.583,-3.02,-7.388,-1.196,15.284,-10.419,-3.04,-4.86,7.827,16.198,-1.168,-0.134,-8.992,-0.702,-3.916,13.669,-0.087,0.745,-5.941,-37.695,-10.111,-2.689,-1.208,-3.346,0.171,6.36,True,False,False,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,-2.914471,-1.90683,-2.889545
6,-4.827,16.25,-2.92,-7.288,-1.196,15.351,-10.449,-3.04,-4.933,7.777,16.198,-1.268,-0.167,-9.112,-0.702,-3.953,11.669,-0.153,0.695,-5.961,-47.695,-10.171,-2.762,-1.275,-3.346,0.171,34.535,True,False,False,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,-3.008483,-1.854571,-2.893859
7,-4.797,16.25,-2.92,-7.222,-1.196,14.188,-10.516,-3.04,-4.86,7.727,26.198,-1.268,-0.167,-9.242,-0.769,-3.983,9.169,-0.187,0.645,-6.111,-37.695,-10.478,-2.689,-1.341,-3.206,0.071,21.335,True,False,False,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,-3.088857,-1.917965,-2.752764
8,-4.737,16.25,-2.83,-7.188,-1.196,14.048,-10.659,-3.04,-4.933,7.677,16.198,-1.268,-0.167,-9.382,-0.802,-4.043,6.669,-0.26,0.645,-6.261,-37.695,-10.744,-2.689,-1.451,-3.073,0.004,34.687,True,False,False,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,-3.212438,-1.820224,-2.834904
9,-4.9,16.25,-2.89,-7.188,-1.196,14.014,-10.659,-3.04,-4.43,7.627,6.198,-1.268,-0.234,-9.512,-0.802,-4.013,4.169,-0.297,0.535,-6.411,-37.695,-10.941,-2.792,-1.551,-2.706,0.038,34.136,True,False,False,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,-3.306377,-1.653918,-2.872736


In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 생성
lr = LinearRegression()
ridge = Ridge(random_state=42)
lasso = Lasso(random_state=42)
elastic = ElasticNet(random_state=42)
svr = SVR()
gb = GradientBoostingRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
lgbm = LGBMRegressor(random_state=42)
dt = DecisionTreeRegressor(random_state=42)

voting1 = VotingRegressor([
    ('lasso', lasso),
    ('svr', svr),
    ('random_forest', rf)
])

voting2 = VotingRegressor([
    ('lasso', lasso),
    ('elastic', elastic),
    ('svr', svr),
])

voting3 = VotingRegressor([
    ('svr', svr),
    ('random_forest', rf),
    ('xgb', xgb),
])

models = {
    'Voting1': voting1,
    'Voting2': voting2,
    'Voiting3': voting3
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
for model_name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring=mae_scorer)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    train_score = mean_absolute_error(y_train, model.predict(X_train))
    test_score = mean_absolute_error(y_test, y_pred)

    print(f'{model_name}: {scores.mean()} / {train_score} / {test_score}')

Voting1: 12.539015789720931 / 9.88133180297008 / 12.490045280103026
Voting2: 12.537690856924806 / 12.513861512090774 / 12.457365725409833
Voiting3: 12.686431825484089 / 7.133621336898445 / 12.708426967205904
