## Basic Data Comparison

- 시계열 데이터를 제외한 데이터셋

In [2]:
import pandas as pd

# 데이터 로드
train = pd.read_csv('./train.csv')

# yymm 컬럼 삭제
train.drop('yymm', axis=1, inplace=True)

# 결과 출력
train.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,Target
0,-5.327,12.25,-3.294,-7.855,-1.196,13.824,-10.249,-3.04,-5.17,8.077,16.198,-1.101,-0.067,-8.412,-0.592,-4.153,23.669,0.103,1.001,-5.861,-27.695,-9.978,-2.689,-0.951,-3.873,0.471,44.521
1,-5.267,12.916,-3.22,-7.788,-1.196,14.424,-10.249,-3.04,-4.97,8.027,16.198,-1.168,-0.067,-8.532,-0.592,-4.079,21.669,0.073,0.935,-5.881,-37.695,-10.038,-2.652,-1.018,-3.503,0.361,35.027
2,-5.127,13.583,-3.13,-7.658,-1.196,15.081,-10.359,-3.04,-4.83,7.977,16.198,-1.168,-0.067,-8.642,-0.665,-3.953,19.669,0.013,0.905,-5.891,-37.695,-10.001,-2.652,-1.051,-3.436,0.361,13.92
3,-5.06,14.25,-3.13,-7.532,-1.196,14.961,-10.359,-3.04,-4.83,7.927,26.198,-1.168,-0.067,-8.762,-0.592,-3.953,17.669,-0.02,0.845,-5.911,-37.695,-10.028,-2.552,-1.111,-3.346,0.261,28.41
4,-4.967,14.916,-3.094,-7.462,-1.196,15.454,-10.359,-3.04,-4.97,7.877,16.198,-1.168,-0.067,-8.882,-0.629,-3.916,15.669,-0.087,0.811,-5.931,-37.695,-10.111,-2.619,-1.141,-3.346,0.261,1.647
5,-4.967,15.583,-3.02,-7.388,-1.196,15.284,-10.419,-3.04,-4.86,7.827,16.198,-1.168,-0.134,-8.992,-0.702,-3.916,13.669,-0.087,0.745,-5.941,-37.695,-10.111,-2.689,-1.208,-3.346,0.171,6.36
6,-4.827,16.25,-2.92,-7.288,-1.196,15.351,-10.449,-3.04,-4.933,7.777,16.198,-1.268,-0.167,-9.112,-0.702,-3.953,11.669,-0.153,0.695,-5.961,-47.695,-10.171,-2.762,-1.275,-3.346,0.171,34.535
7,-4.797,16.25,-2.92,-7.222,-1.196,14.188,-10.516,-3.04,-4.86,7.727,26.198,-1.268,-0.167,-9.242,-0.769,-3.983,9.169,-0.187,0.645,-6.111,-37.695,-10.478,-2.689,-1.341,-3.206,0.071,21.335
8,-4.737,16.25,-2.83,-7.188,-1.196,14.048,-10.659,-3.04,-4.933,7.677,16.198,-1.268,-0.167,-9.382,-0.802,-4.043,6.669,-0.26,0.645,-6.261,-37.695,-10.744,-2.689,-1.451,-3.073,0.004,34.687
9,-4.9,16.25,-2.89,-7.188,-1.196,14.014,-10.659,-3.04,-4.43,7.627,6.198,-1.268,-0.234,-9.512,-0.802,-4.013,4.169,-0.297,0.535,-6.411,-37.695,-10.941,-2.792,-1.551,-2.706,0.038,34.136


## Base Model MAE

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)    # Target을 제외한 모든 컬럼을 X로 지정
y = train['Target']                 # Target 컬럼을 y로 지정

# train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 정의
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'ElasticNet': ElasticNet(random_state=42),
    'SVR': SVR(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42, verbose=-1),
    'Decision Tree': DecisionTreeRegressor(random_state=42)
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 학습 및 5-fold 교차검증 수행
for model_name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring=mae_scorer)
    print(f'{model_name}: {scores.mean()}')

Linear Regression: 12.626007154563748
Ridge: 12.625872446923035
Lasso: 12.580224346550702
ElasticNet: 12.582955143638385
SVR: 12.564188226545891
Gradient Boosting: 12.63337062841707
Random Forest: 13.353987621984615
XGBoost: 13.89784202256629
LightGBM: 13.157735041902736
Decision Tree: 16.959262534358977


## Featrue Selection MAE

### Filter Method

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)  # Target 컬럼 제외
y = train['Target']  # Target 컬럼

# 모델 정의
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'ElasticNet': ElasticNet(random_state=42),
    'SVR': SVR(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# SelectKBest로 K-최고 특성 선택
test = SelectKBest(score_func=f_regression, k=X.shape[1])
fit = test.fit(X, y)

# 선택된 특성들의 인덱스를 내림차순으로 정렬
sorted_columns = np.argsort(fit.scores_)[::-1]

# 각 모델에 대해 최적의 특성 선택
for model_name, model in models.items():

    # 최적의 특성을 찾기 위한 변수 초기화
    best_score = float('inf')
    best_features = []

    # 최적의 특성 선택
    for i in range(1, X.shape[1] + 1):
        # 선택된 feature들의 인덱스
        fs = sorted_columns[:i]

        # 선택된 feature만 선택 (Pandas DataFrame에서 iloc 사용)
        X_selected = X.iloc[:, fs]
        
        # 선택된 feature들의 이름
        selected_feature_names = X.columns[fs].tolist()
        
        # 교차 검증
        mae = cross_val_score(model, X_selected, y, cv=5, scoring=mae_scorer).mean()

        # 가장 성능이 좋은 MAE 및 feature를 저장
        if mae < best_score:
            best_score = mae
            best_features = selected_feature_names
    
    # 결과 출력
    print(f'{model_name} best score: {best_score}, num_features: {len(best_features)}, best features: {best_features}')

Linear Regression best score: 12.526764152572579, num_features: 5, best features: ['V7', 'V17', 'V10', 'V4', 'V25']
Ridge best score: 12.526763926952748, num_features: 5, best features: ['V7', 'V17', 'V10', 'V4', 'V25']
Lasso best score: 12.535082495159886, num_features: 4, best features: ['V7', 'V17', 'V10', 'V4']
ElasticNet best score: 12.52996843930851, num_features: 4, best features: ['V7', 'V17', 'V10', 'V4']
SVR best score: 12.536671680166233, num_features: 16, best features: ['V7', 'V17', 'V10', 'V4', 'V25', 'V8', 'V23', 'V21', 'V3', 'V5', 'V13', 'V11', 'V22', 'V16', 'V19', 'V6']
Gradient Boosting best score: 12.62843426586662, num_features: 1, best features: ['V7']


### Forward Selection

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)  # Target 컬럼 제외
y = train['Target']  # Target 컬럼

# 모델 정의
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'ElasticNet': ElasticNet(random_state=42),
    'SVR': SVR(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 최적의 특성 선택
for model_name, model in models.items():
    
    # 최적의 특성을 찾기 위한 변수 초기화
    best_score = float('inf')
    best_features = []

    # 최적의 특성 선택
    sfs = SequentialFeatureSelector(model, n_features_to_select=5, direction='forward')
    fit = sfs.fit(X, y)

    # 선택된 피처
    fs = X.columns[fit.support_].tolist()

    # 선택된 feature 데이터프레임 생성
    X_selected = X.iloc[:, fit.get_support()]

    # 교차 검증
    mae = cross_val_score(model, X_selected, y, cv=5, scoring=mae_scorer).mean()

    # 결과 출력
    print(f'{model_name} best score: {mae}, best features: {fs}')


Linear Regression best score: 12.524648835270032, best features: ['V3', 'V10', 'V17', 'V25', 'V26']
Ridge best score: 12.524648903536, best features: ['V3', 'V10', 'V17', 'V25', 'V26']
Lasso best score: 12.534578335004287, best features: ['V1', 'V3', 'V5', 'V10', 'V17']
ElasticNet best score: 12.528527107451348, best features: ['V1', 'V8', 'V10', 'V17', 'V25']
SVR best score: 12.540622387546296, best features: ['V3', 'V4', 'V16', 'V23', 'V25']
Gradient Boosting best score: 12.58342619636261, best features: ['V2', 'V8', 'V11', 'V16', 'V21']


In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)  # Target 컬럼 제외
y = train['Target']  # Target 컬럼

# 모델 정의
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'ElasticNet': ElasticNet(random_state=42),
    'SVR': SVR(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 최적의 특성 선택
for model_name, model in models.items():
    
    # 최적의 특성을 찾기 위한 변수 초기화
    best_score = float('inf')
    best_features = []

    # 최적의 특성 선택
    sfs = SequentialFeatureSelector(model, n_features_to_select=10, direction='forward')
    fit = sfs.fit(X, y)

    # 선택된 피처
    fs = X.columns[fit.support_].tolist()

    # 선택된 feature 데이터프레임 생성
    X_selected = X.iloc[:, fit.get_support()]

    # 교차 검증
    mae = cross_val_score(model, X_selected, y, cv=5, scoring=mae_scorer).mean()

    # 결과 출력
    print(f'{model_name} best score: {mae}, best features: {fs}')
    

Linear Regression best score: 12.528747341482214, best features: ['V3', 'V9', 'V10', 'V11', 'V12', 'V16', 'V17', 'V24', 'V25', 'V26']
Ridge best score: 12.528744524509014, best features: ['V3', 'V9', 'V10', 'V11', 'V12', 'V16', 'V17', 'V24', 'V25', 'V26']
Lasso best score: 12.534578335004287, best features: ['V1', 'V3', 'V5', 'V8', 'V9', 'V10', 'V12', 'V13', 'V14', 'V17']
ElasticNet best score: 12.528527107451348, best features: ['V1', 'V3', 'V5', 'V8', 'V9', 'V10', 'V12', 'V13', 'V17', 'V25']
SVR best score: 12.537583280828374, best features: ['V3', 'V4', 'V6', 'V8', 'V10', 'V16', 'V17', 'V21', 'V23', 'V25']
Gradient Boosting best score: 12.60997070922468, best features: ['V2', 'V4', 'V6', 'V8', 'V9', 'V11', 'V16', 'V21', 'V23', 'V25']


In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)  # Target 컬럼 제외
y = train['Target']  # Target 컬럼

# 모델 정의
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'ElasticNet': ElasticNet(random_state=42),
    'SVR': SVR(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# 각 모델에 대해 최적의 특성 선택
for model_name, model in models.items():
    
    # 최적의 특성을 찾기 위한 변수 초기화
    best_score = float('inf')
    best_features = []

    # 최적의 특성 선택
    sfs = SequentialFeatureSelector(model, n_features_to_select=15, direction='forward')
    fit = sfs.fit(X, y)

    # 선택된 피처
    fs = X.columns[fit.support_].tolist()

    # 선택된 feature 데이터프레임 생성
    X_selected = X.iloc[:, fit.get_support()]

    # 교차 검증
    mae = cross_val_score(model, X_selected, y, cv=5, scoring=mae_scorer).mean()

    # 결과 출력
    print(f'{model_name} best score: {mae}, best features: {fs}')


Linear Regression best score: 12.538622578923501, best features: ['V1', 'V3', 'V5', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V16', 'V17', 'V23', 'V24', 'V25', 'V26']
Ridge best score: 12.53861576920944, best features: ['V1', 'V3', 'V5', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V16', 'V17', 'V23', 'V24', 'V25', 'V26']
Lasso best score: 12.534578335004287, best features: ['V1', 'V3', 'V5', 'V8', 'V9', 'V10', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20']
ElasticNet best score: 12.528527107451348, best features: ['V1', 'V3', 'V5', 'V8', 'V9', 'V10', 'V12', 'V13', 'V15', 'V16', 'V17', 'V18', 'V19', 'V24', 'V25']
SVR best score: 12.535068148104605, best features: ['V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V10', 'V12', 'V15', 'V16', 'V17', 'V21', 'V23', 'V25']
Gradient Boosting best score: 12.582642430664299, best features: ['V2', 'V3', 'V4', 'V6', 'V7', 'V8', 'V9', 'V11', 'V16', 'V17', 'V20', 'V21', 'V23', 'V24', 'V25']


### Backward Elimination

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)  # Target 컬럼 제외
y = train['Target']  # Target 컬럼

# 모델 정의
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'ElasticNet': ElasticNet(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# RFE를 사용하여 feature selection
for model_name, model in models.items():
    rfe = RFE(model, n_features_to_select=5)
    fit = rfe.fit(X, y)

    fs = X.columns[fit.support_].tolist()
    X_selected = X.iloc[:, fit.get_support()]

    # 선택된 feature로 cross-validation 수행
    score = cross_val_score(model, X_selected, y, cv=5, scoring=mae_scorer)

    print(f'{model_name} score: {score.mean()}, selected features: {fs}')

Linear Regression score: 12.543496571231898, selected features: ['V5', 'V14', 'V18', 'V22', 'V24']
Ridge score: 12.543495934779815, selected features: ['V5', 'V14', 'V18', 'V22', 'V24']
Lasso score: 12.53489854284295, selected features: ['V2', 'V4', 'V7', 'V10', 'V17']
ElasticNet score: 12.530659655893988, selected features: ['V4', 'V7', 'V10', 'V17', 'V25']
Gradient Boosting score: 12.780808193079931, selected features: ['V1', 'V6', 'V16', 'V24', 'V25']


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)  # Target 컬럼 제외
y = train['Target']  # Target 컬럼

# 모델 정의
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'ElasticNet': ElasticNet(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# RFE를 사용하여 feature selection
for model_name, model in models.items():
    rfe = RFE(model, n_features_to_select=10)
    fit = rfe.fit(X, y)

    fs = X.columns[fit.support_].tolist()
    X_selected = X.iloc[:, fit.get_support()]

    # 선택된 feature로 cross-validation 수행
    score = cross_val_score(model, X_selected, y, cv=10, scoring=mae_scorer)

    print(f'{model_name} score: {score.mean()}, selected features: {fs}')

Linear Regression score: 12.533227292015251, selected features: ['V2', 'V3', 'V4', 'V5', 'V14', 'V16', 'V18', 'V20', 'V22', 'V24']
Ridge score: 12.533236313699147, selected features: ['V2', 'V3', 'V4', 'V5', 'V14', 'V16', 'V18', 'V20', 'V22', 'V24']
Lasso score: 12.54504045859278, selected features: ['V2', 'V4', 'V7', 'V10', 'V11', 'V17', 'V21', 'V24', 'V25', 'V26']
ElasticNet score: 12.538815977311199, selected features: ['V2', 'V4', 'V7', 'V10', 'V11', 'V17', 'V21', 'V24', 'V25', 'V26']
Gradient Boosting score: 12.576264802335583, selected features: ['V1', 'V3', 'V4', 'V6', 'V7', 'V16', 'V20', 'V22', 'V24', 'V25']


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import make_scorer, mean_absolute_error

# 데이터 분할
X = train.drop('Target', axis=1)  # Target 컬럼 제외
y = train['Target']  # Target 컬럼

# 모델 정의
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'ElasticNet': ElasticNet(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# MAE를 평가 기준으로 사용하기 위해 scorer 정의
mae_scorer = make_scorer(mean_absolute_error)

# RFE를 사용하여 feature selection
for model_name, model in models.items():
    rfe = RFE(model, n_features_to_select=15)
    fit = rfe.fit(X, y)

    fs = X.columns[fit.support_].tolist()
    X_selected = X.iloc[:, fit.get_support()]

    # 선택된 feature로 cross-validation 수행
    score = cross_val_score(model, X_selected, y, cv=15, scoring=mae_scorer)

    print(f'{model_name} score: {score.mean()}, selected features: {fs}')

Linear Regression score: 12.562041248351733, selected features: ['V1', 'V2', 'V3', 'V4', 'V5', 'V9', 'V14', 'V15', 'V16', 'V18', 'V20', 'V22', 'V23', 'V24', 'V25']
Ridge score: 12.562030683585256, selected features: ['V1', 'V2', 'V3', 'V4', 'V5', 'V9', 'V14', 'V15', 'V16', 'V18', 'V20', 'V22', 'V23', 'V24', 'V25']
Lasso score: 12.553925559758117, selected features: ['V2', 'V4', 'V7', 'V10', 'V11', 'V12', 'V17', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26']
ElasticNet score: 12.547704236357474, selected features: ['V2', 'V4', 'V7', 'V10', 'V11', 'V12', 'V17', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26']
Gradient Boosting score: 12.607488745875148, selected features: ['V1', 'V3', 'V4', 'V6', 'V7', 'V9', 'V15', 'V16', 'V17', 'V18', 'V20', 'V22', 'V23', 'V24', 'V25']
