# 선형상관계수의 함정
- 선형 상관 관계가 낮다고 해서 해당 피처가 모델 예측에 영향을 적게 준다고 할 수 없다.
- XGBoost가 만능은 아니다. (Linear Regression보다 성능이 떨어질 때도 있다.)

## 데이터 생성
- 100개 피처, 1000개 샘플, 5개 중요 피처의 회귀 데이터 생성
- 선형 회귀 모델 학습 결과 -> 회귀 계수(절댓값) 기준으로 7, 1, 25, 40, 38번 피처가 중요

In [1]:
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Seed reused by the later train/test split so the notebook is repeatable.
seed = 1

# Synthetic regression problem: 1000 samples, 100 features, 5 informative.
X, y = make_regression(
    n_samples=1000,
    n_features=100,
    n_informative=5,
    noise=1,
    random_state=seed,
)

# Fit ordinary least squares on the full data set and rank the features by
# the absolute value of their coefficients (ten largest are displayed).
lr = LinearRegression().fit(X, y)
coef = pd.Series(lr.coef_)
coef.abs().sort_values(ascending=False).iloc[:10]

7     71.389236
1     59.191261
25    53.331269
40    37.594780
38     3.364070
80     0.080267
36     0.073577
12     0.069053
23     0.066007
44     0.059059
dtype: float64

In [2]:
from sklearn.linear_model import Lasso

# Repeat the coefficient ranking with a default Lasso model.
# NOTE: this rebinds `lr`, replacing the LinearRegression fitted earlier.
lr = Lasso().fit(X, y)
pd.Series(lr.coef_).abs().sort_values(ascending=False).iloc[:10]

7     70.289678
1     58.059438
25    52.219298
40    36.433725
38     2.340127
73     0.000000
72     0.000000
71     0.000000
70     0.000000
69     0.000000
dtype: float64

## 선형 상관 계수 확인
- 회귀 계수 기준으로는 7, 1, 25, 40, 38번 피처가 중요하지만, y와의 상관 계수(절댓값) 상위 5개는 7, 1, 25, 40, 77번 피처로 나옴
    - 38번 피처는 y와의 상관 계수가 약 0.019로 매우 낮게 나옴

In [3]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Put the features and the target in one frame; the target is the last
# column, labelled 'y'.
df = pd.DataFrame(X).assign(y=y)

# Pairwise correlation of every column with every other column.
corr = df.corr()

# Ten largest absolute correlations with the target. The first entry is the
# target itself (correlation 1.0), so only nine features are listed.
corr.loc[:, 'y'].abs().sort_values(ascending=False).iloc[:10]

y     1.000000
7     0.619634
1     0.501967
25    0.454816
40    0.282569
77    0.113721
39    0.084726
0     0.082017
59    0.079138
13    0.076281
Name: y, dtype: float64

In [4]:
# Signed correlation with the target for the ten features that have the
# largest absolute regression coefficient, in coefficient order.
(
    corr['y']
    .loc[coef.abs().sort_values(ascending=False).index[:10]]
)

7     0.619634
1     0.501967
25    0.454816
40    0.282569
38    0.018636
80    0.017778
36   -0.026675
12   -0.014458
23   -0.037519
44    0.031747
Name: y, dtype: float64

## LinearRegression

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation, using the notebook-wide seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

In [6]:
# Linear model fitted on the training split only (kept separate from `lr`).
lr_ = LinearRegression().fit(X=X_train, y=y_train)

In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, accuracy_score, precision_score, recall_score, f1_score


def evaluate_model(y_real, y_pred, tolerance=0.05):
    """Score predictions with classification or regression metrics.

    The task type is inferred from ``y_pred``: one-dimensional integer
    predictions are scored as classification, everything else as regression.

    Args:
        y_real: Ground-truth targets (array-like or ``pd.Series``).
        y_pred: Predicted targets, aligned positionally with ``y_real``.
        tolerance: Regression only. When not ``None``, an extra ``accuracy``
            row reports the fraction of samples whose absolute error is
            strictly below this value.

    Returns:
        A ``pd.DataFrame`` indexed by metric name with one ``value`` column.
        Classification rows: ``accuracy``, ``precision``, ``recall``, ``f1``
        (the last three macro-averaged). Regression rows: ``rmse``, ``mae``,
        ``mape`` and, if ``tolerance`` is given, ``accuracy``.
    """
    # Work on plain arrays. The previous guard tested ``y_pred`` for being a
    # Series but converted ``y_real``, so pandas label-based indexing could
    # still break ``y_pred[0]`` and an ndarray ``y_real`` would hit ``.values``.
    y_real = np.asarray(y_real)
    y_pred = np.asarray(y_pred)

    # Decide by dtype rather than by the Python type of the first element, so
    # a mixed input such as [1, 2.5] is treated as regression. Multi-output
    # predictions stay on the regression path, as they did before.
    is_classification = (
        y_pred.ndim == 1 and np.issubdtype(y_pred.dtype, np.integer)
    )

    if is_classification:
        metrics = {
            'accuracy': accuracy_score(y_real, y_pred),
            'precision': precision_score(y_real, y_pred, average='macro'),
            'recall': recall_score(y_real, y_pred, average='macro'),
            'f1': f1_score(y_real, y_pred, average='macro'),
        }
    else:
        # ``squared=False`` is deprecated in scikit-learn 1.4 and removed in
        # 1.6. Taking the root of the per-output MSE and averaging gives the
        # same number on every version.
        per_output_mse = mean_squared_error(
            y_real, y_pred, multioutput='raw_values'
        )
        metrics = {
            'rmse': float(np.mean(np.sqrt(per_output_mse))),
            'mae': mean_absolute_error(y_real, y_pred),
            'mape': mean_absolute_percentage_error(y_real, y_pred),
        }
        if tolerance is not None:
            # Share of predictions that land within ``tolerance`` of the truth.
            within = np.abs(y_real - y_pred) < tolerance
            metrics['accuracy'] = np.count_nonzero(within) / len(y_real)

    # One row per metric, single 'value' column.
    return pd.DataFrame.from_dict(metrics, orient='index', columns=['value'])

In [8]:
# Regression metrics of the linear model on the held-out split
# (tolerance=None skips the tolerance-based accuracy row).
evaluate_model(y_real=y_test, y_pred=lr_.predict(X_test), tolerance=None)

Unnamed: 0,value
rmse,1.041445
mae,0.82863
mape,0.06921


## XGBoost

In [9]:
from xgboost import XGBRegressor

# XGBoost regressor with default hyperparameters, trained on all 100 features.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

# Same metrics and held-out split as the linear model, for comparison.
evaluate_model(y_real=y_test, y_pred=xgb.predict(X_test), tolerance=None)

Unnamed: 0,value
rmse,30.921693
mae,23.456919
mape,2.14701


In [10]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted XGBoost model, scored by negative MAE
# on the held-out split. The shuffling is random, so the notebook-wide seed is
# passed to make the ranking reproducible, as the data generation and the
# train/test split already are.
# NOTE(review): importances are computed on X_test, which later cells also use
# for evaluation after selecting features from this ranking — confirm that
# this reuse is intended.
r = permutation_importance(
    xgb,
    X_test,
    y_test,
    scoring='neg_mean_absolute_error',
    random_state=seed,
)
feature_importance = pd.Series(r['importances_mean'])

# Twenty features with the highest mean importance.
feature_importance.sort_values(ascending=False).head(20)

7     51.620799
1     43.064514
25    39.519991
40    17.613404
26     0.330367
92     0.238676
24     0.165128
0      0.149194
27     0.128085
64     0.106856
4      0.099855
96     0.094412
21     0.087440
34     0.083394
41     0.073344
94     0.063999
2      0.060210
38     0.055103
3      0.054310
32     0.053940
dtype: float64

In [11]:
# Per-feature mean and standard deviation of the permutation importances,
# one row per feature.
pd.DataFrame({'mean': r['importances_mean'], 'std': r['importances_std']})

Unnamed: 0,mean,std
0,0.149194,0.065956
1,43.064514,1.032178
2,0.060210,0.032951
3,0.054310,0.094158
4,0.099855,0.059469
...,...,...
95,-0.090190,0.015596
96,0.094412,0.031956
97,0.014474,0.068817
98,0.002933,0.046068


### 상위 피처 선택
- 피처 중요도 기준 상위 20개
- 선형 회귀 계수(절댓값) 기준 상위 20개

In [12]:
# Column positions of the 20 features with the highest permutation importance.
high_features = feature_importance.sort_values(ascending=False).index[:20].tolist()

# Retrain XGBoost on those columns only (this rebinds `xgb`).
xgb = XGBRegressor()
xgb.fit(X_train[:, high_features], y_train)

evaluate_model(y_real=y_test, y_pred=xgb.predict(X_test[:, high_features]), tolerance=None)

Unnamed: 0,value
rmse,27.327854
mae,20.321854
mape,2.22038


In [13]:
# Column positions of the 20 features with the largest absolute linear
# regression coefficient (`coef` comes from the LinearRegression fit).
high_features = coef.abs().sort_values(ascending=False).index[:20].tolist()

# Retrain XGBoost on those columns only (this rebinds `xgb`).
xgb = XGBRegressor()
xgb.fit(X_train[:, high_features], y_train)

evaluate_model(y_real=y_test, y_pred=xgb.predict(X_test[:, high_features]), tolerance=None)

Unnamed: 0,value
rmse,26.746387
mae,20.106369
mape,1.512563
