## Voting 최적 모델 feature selection

In [1]:
import pandas as pd

# 데이터 로드
train = pd.read_csv('./train.csv')

# yymm 컬럼을 날짜 형식으로 변환 (연도는 임의로 설정)
train['yymm'] = pd.to_datetime('2024' + train['yymm'], format='%Y%m%d %H:%M')

# day, hour, minute 컬럼 생성
train['day'] = train['yymm'].dt.day         # 일
train['hour'] = train['yymm'].dt.hour       # 시
train['minute'] = train['yymm'].dt.minute   # 분

# weekday 컬럼 생성
train['weekday'] = train['day'] % 7         # 요일 (0: 월요일, 1: 화요일, ..., 6: 일요일)

# yymm 컬럼 삭제
train.drop('yymm', axis=1, inplace=True)

# 결과 출력
train.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,Target,day,hour,minute,weekday
0,-5.327,12.25,-3.294,-7.855,-1.196,13.824,-10.249,-3.04,-5.17,8.077,...,-9.978,-2.689,-0.951,-3.873,0.471,44.521,1,0,0,1
1,-5.267,12.916,-3.22,-7.788,-1.196,14.424,-10.249,-3.04,-4.97,8.027,...,-10.038,-2.652,-1.018,-3.503,0.361,35.027,1,0,10,1
2,-5.127,13.583,-3.13,-7.658,-1.196,15.081,-10.359,-3.04,-4.83,7.977,...,-10.001,-2.652,-1.051,-3.436,0.361,13.92,1,0,20,1
3,-5.06,14.25,-3.13,-7.532,-1.196,14.961,-10.359,-3.04,-4.83,7.927,...,-10.028,-2.552,-1.111,-3.346,0.261,28.41,1,0,30,1
4,-4.967,14.916,-3.094,-7.462,-1.196,15.454,-10.359,-3.04,-4.97,7.877,...,-10.111,-2.619,-1.141,-3.346,0.261,1.647,1,0,40,1
5,-4.967,15.583,-3.02,-7.388,-1.196,15.284,-10.419,-3.04,-4.86,7.827,...,-10.111,-2.689,-1.208,-3.346,0.171,6.36,1,0,50,1
6,-4.827,16.25,-2.92,-7.288,-1.196,15.351,-10.449,-3.04,-4.933,7.777,...,-10.171,-2.762,-1.275,-3.346,0.171,34.535,1,1,0,1
7,-4.797,16.25,-2.92,-7.222,-1.196,14.188,-10.516,-3.04,-4.86,7.727,...,-10.478,-2.689,-1.341,-3.206,0.071,21.335,1,1,10,1
8,-4.737,16.25,-2.83,-7.188,-1.196,14.048,-10.659,-3.04,-4.933,7.677,...,-10.744,-2.689,-1.451,-3.073,0.004,34.687,1,1,20,1
9,-4.9,16.25,-2.89,-7.188,-1.196,14.014,-10.659,-3.04,-4.43,7.627,...,-10.941,-2.792,-1.551,-2.706,0.038,34.136,1,1,30,1


### Filter Method

In [3]:
import numpy as np
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
import pandas as pd

# 데이터 분할
X = train.drop('Target', axis=1)  # Target 컬럼 제외
y = train['Target']  # Target 컬럼

# 모델 생성
lasso = Lasso(alpha=0.6779122520600722, random_state=30)
elastic = ElasticNet(alpha=0.9971090599987921, l1_ratio=0.44498452438450353, random_state=30)
rf = RandomForestRegressor(
    n_estimators=168,
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    random_state=30
)

model = VotingRegressor([
    ('lasso', lasso),
    ('elastic', elastic),
    ('rf', rf)
])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# SelectKBest로 K-최고 특성 선택
test = SelectKBest(score_func=f_regression, k=X.shape[1])
fit = test.fit(X, y)

# 선택된 특성들의 인덱스를 내림차순으로 정렬
sorted_columns = np.argsort(fit.scores_)[::-1]

# 각 모델에 대해 최적의 특성 선택
best_score = float('inf')
best_features = []

# 최적의 특성 선택
for i in range(1, X.shape[1] + 1):
    # 선택된 feature들의 인덱스
    fs = sorted_columns[:i]

    # 선택된 feature만 선택 (Pandas DataFrame에서 iloc 사용)
    X_selected = X.iloc[:, fs]
    
    # 선택된 feature들의 이름
    selected_feature_names = X.columns[fs].tolist()
    
    # 교차 검증
    mae = cross_val_score(model, X_selected, y, cv=5, scoring=mae_scorer).mean()

    # 가장 성능이 좋은 MAE 및 feature를 저장
    if mae < best_score:
        best_score = mae
        best_features = selected_feature_names

# 결과 출력
print(f'best score: {best_score}, num_features: {len(best_features)}, best features: {best_features}')

best score: 12.528921875845228, num_features: 5, best features: ['V7', 'V17', 'V10', 'V4', 'V25']


### Forward Selection

In [4]:
import numpy as np
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.feature_selection import SequentialFeatureSelector
import pandas as pd

# 데이터 분할
X = train.drop('Target', axis=1)  # Target 컬럼 제외
y = train['Target']  # Target 컬럼

# 모델 생성
lasso = Lasso(alpha=0.6779122520600722, random_state=30)
elastic = ElasticNet(alpha=0.9971090599987921, l1_ratio=0.44498452438450353, random_state=30)
rf = RandomForestRegressor(
    n_estimators=168,
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    random_state=30
)

model = VotingRegressor([
    ('lasso', lasso),
    ('elastic', elastic),
    ('rf', rf)
])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 특성 개수
n_features = [5, 10, 15, 20]

# 최적의 특성 선택 및 교차 검증
for n_feature in n_features:
    # SequentialFeatureSelector를 사용하여 최적의 특성 선택
    sfs = SequentialFeatureSelector(model, n_features_to_select=n_feature, direction='forward')
    fit = sfs.fit(X, y)
    
    # 선택된 특성들의 이름
    selected_feature_names = X.columns[sfs.get_support()].tolist()
    
    # 교차 검증
    mae = cross_val_score(model, X[selected_feature_names], y, cv=5, scoring=mae_scorer).mean()
    
    # 결과 출력
    print(f'n_features: {n_feature}, MAE: {mae}, features: {selected_feature_names}')

n_features: 5, MAE: 12.53065715635401, features: ['V7', 'V17', 'V20', 'V21', 'V24']
n_features: 10, MAE: 12.528183843942037, features: ['V2', 'V3', 'V7', 'V17', 'V20', 'V21', 'V23', 'V24', 'day', 'weekday']
n_features: 15, MAE: 12.532695979225863, features: ['V2', 'V3', 'V5', 'V7', 'V10', 'V15', 'V17', 'V20', 'V21', 'V23', 'V24', 'day', 'hour', 'minute', 'weekday']
n_features: 20, MAE: 12.52874715493348, features: ['V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V10', 'V15', 'V17', 'V18', 'V20', 'V21', 'V23', 'V24', 'V26', 'day', 'hour', 'minute', 'weekday']


In [6]:
import numpy as np
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error
import pandas as pd

# 데이터 분할
features = ['V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V10', 'V15', 'V17', 'V18', 'V20', 'V21', 'V23', 'V24', 'V26', 'day', 'hour', 'minute', 'weekday']
X = train[features]  # 선택된 특성
y = train['Target']  # Target 컬럼

# 모델 생성
lasso = Lasso(alpha=0.6779122520600722, random_state=30)
elastic = ElasticNet(alpha=0.9971090599987921, l1_ratio=0.44498452438450353, random_state=30)
rf = RandomForestRegressor(
    n_estimators=168,
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    random_state=30
)

model = VotingRegressor([
    ('lasso', lasso),
    ('elastic', elastic),
    ('rf', rf)
])

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 교차 검증
mae = cross_val_score(model, X, y, cv=5, scoring=mae_scorer).mean()

# 결과 출력
print(f'MAE: {mae}')

MAE: 12.52874715493348
