### 반복측정 분산분석

#### statsmodels를 이용한 반복측정 분산분석

In [14]:
# Load the small repeated-measures example (columns: sub_id, y, x) and show it.
data = pd.read_csv(filepath_or_buffer="AnovaRM_예제.csv")
display(data)

Unnamed: 0,sub_id,y,x
0,1,10,a
1,2,20,a
2,3,30,a
3,1,20,b
4,2,40,b
5,3,60,b
6,1,30,c
7,2,60,c
8,3,90,c


In [15]:
from statsmodels.stats.anova import AnovaRM

# One-way repeated-measures ANOVA: 'y' is the dependent variable, 'sub_id'
# identifies the subject, and 'x' is the within-subject factor.
aovrm = AnovaRM(data=data, depvar='y', subject='sub_id', within=['x'])
res = aovrm.fit()
print(res)

            Anova
  F Value Num DF Den DF Pr > F
------------------------------
x 12.0000 2.0000 4.0000 0.0204



In [25]:
# Show the ANOVA table as a DataFrame through the result's public
# `anova_table` attribute instead of reaching into the instance __dict__.
display(res.anova_table)

Unnamed: 0,F Value,Num DF,Den DF,Pr > F
x,12.0,2.0,4.0,0.020408


#### 실험 데이터 준비

In [1]:
import pandas as pd

# Load the mortgage regression data and split it into the target column 'y'
# and the remaining feature columns.
df = pd.read_csv("../../data/regression/mortgage.csv")
y = df['y']
X = df.drop(columns='y')

In [5]:
from lightgbm import LGBMRegressor as LGB
from sklearn.model_selection import ParameterGrid, cross_val_score

# Full factorial grid over the LightGBM hyperparameters under study.
grid = ParameterGrid(dict(
    n_estimators=[50, 100, 150, 200],
    learning_rate=[0.01, 0.05, 0.1, 0.3],
    num_leaves=[8, 16, 32, 64],
    boosting_type=["gbdt", "goss"],
    random_state=[2020, 2021, 2022, 2023],
    metric=["mae", "mse"],
    reg_alpha=[0, 0.1, 1.0, 10],
    colsample_bytree=[0.5, 0.7, 0.9],
))

# Score every combination by its mean cross-validated MAE; the scorer
# returns negated MAE, hence the sign flip before averaging.
score_list = []
for param in grid:
    fold_scores = cross_val_score(LGB(**param), X, y,
                                  scoring="neg_mean_absolute_error")
    score = (-fold_scores).mean()
    score_list.append(score)

In [7]:
# Pair each parameter combination with its score, preview the first rows,
# and persist the experiment data as CSV (without the index).
grid_search_data = pd.DataFrame(grid).assign(score=score_list)
display(grid_search_data.head())
grid_search_data.to_csv("LightGBM_하이퍼파라미터선택_실험데이터.csv", index=False)

Unnamed: 0,boosting_type,colsample_bytree,learning_rate,metric,n_estimators,num_leaves,random_state,reg_alpha,score
0,gbdt,0.5,0.01,mae,50,8,2020,0.0,1.499215
1,gbdt,0.5,0.01,mae,50,8,2020,0.1,1.499427
2,gbdt,0.5,0.01,mae,50,8,2020,1.0,1.502761
3,gbdt,0.5,0.01,mae,50,8,2020,10.0,1.52336
4,gbdt,0.5,0.01,mae,50,8,2021,0.0,1.499254


#### 주요 하이퍼 파라미터 식별

In [37]:
# Sort keys: every column except the last one, with boosting_type moved to
# the front so the rows form one contiguous group per boosting type.
hyper_params = grid_search_data.columns[:-1].tolist()
hyper_params.insert(0, hyper_params.pop(hyper_params.index("boosting_type")))
grid_search_data.sort_values(by=hyper_params, inplace=True)

# Number the rows 0..num_ID-1 inside each group; rows at the same position
# in different groups receive the same subject ID.
boosting_type_size = grid_search_data['boosting_type'].nunique(dropna=False)
num_ID = len(grid_search_data) // boosting_type_size
grid_search_data['subject_ID'] = list(range(num_ID)) * boosting_type_size

In [45]:
# Repeated-measures ANOVA of the score with boosting_type as the
# within-subject factor and subject_ID as the subject identifier.
aovrm = AnovaRM(grid_search_data, 'score', 'subject_ID', within=["boosting_type"])
# Keep only the ANOVA table (a DataFrame), read through the public
# attribute rather than the instance __dict__.
res = aovrm.fit().anova_table
display(res)

Unnamed: 0,F Value,Num DF,Den DF,Pr > F
boosting_type,6568.24873,1.0,6143.0,0.0


In [55]:
# Run a one-way repeated-measures ANOVA for each hyperparameter in turn and
# stack the single-row result tables into AnovaRM_result.

# Hyperparameter columns: everything except the response ('score') and the
# subject identifier, wherever those columns sit in the frame.
param_cols = [col for col in grid_search_data.columns
              if col not in ('score', 'subject_ID')]

anova_tables = []  # one ANOVA table per hyperparameter
for param in param_cols:
    # Rearrange the data: sort with the tested parameter as the leading key
    # so the rows form one contiguous, identically ordered group per level.
    hyper_params = [param] + [col for col in param_cols if col != param]
    grid_search_data.sort_values(by=hyper_params, inplace=True)
    param_size = grid_search_data[param].nunique()
    num_ID = len(grid_search_data) // param_size
    # For a full factorial grid, rows at the same position within each group
    # share all other hyperparameter values, so they get the same subject ID.
    grid_search_data['subject_ID'] = list(range(num_ID)) * param_size

    # Collect the analysis result for this parameter.
    aovrm = AnovaRM(grid_search_data, 'score', 'subject_ID', within=[param])
    res = aovrm.fit().anova_table
    anova_tables.append(res)

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
# Fall back to an empty frame when there was nothing to analyse.
AnovaRM_result = pd.concat(anova_tables) if anova_tables else pd.DataFrame()

In [56]:
# Rank the hyperparameters from the largest to the smallest F statistic.
display(AnovaRM_result.sort_values("F Value", ascending=False))

Unnamed: 0,F Value,Num DF,Den DF,Pr > F
reg_alpha,64666.215872,3.0,9213.0,0.0
learning_rate,9682.902945,3.0,9213.0,0.0
boosting_type,6568.24873,1.0,6143.0,0.0
num_leaves,4943.618015,3.0,9213.0,0.0
colsample_bytree,2143.730367,2.0,8190.0,0.0
n_estimators,1477.375094,3.0,9213.0,0.0
metric,819.48595,1.0,6143.0,2.7034790000000004e-169
random_state,346.890102,3.0,9213.0,1.930288e-213


In [61]:
# Print the mean score for each reg_alpha level, in order of first appearance.
reg_alpha_col = grid_search_data['reg_alpha']
for level in reg_alpha_col.unique():
    level_mean = grid_search_data.loc[reg_alpha_col == level, 'score'].mean()
    print(f"{level}:{level_mean}")

0.0:0.2895331565457736
0.1:0.2902266120608595
1.0:0.2979066980451965
10.0:0.3392285961577812


In [63]:
# Print the mean score for each learning_rate level, in order of first appearance.
learning_rate_col = grid_search_data['learning_rate']
for level in learning_rate_col.unique():
    level_mean = grid_search_data.loc[learning_rate_col == level, 'score'].mean()
    print(f"{level}:{level_mean}")

0.01:0.8465915564287093
0.05:0.14289935759632696
0.1:0.1085783767317755
0.3:0.11882577205279798


In [64]:
# Print the mean score for each random_state level, in order of first appearance.
random_state_col = grid_search_data['random_state']
for level in random_state_col.unique():
    level_mean = grid_search_data.loc[random_state_col == level, 'score'].mean()
    print(f"{level}:{level_mean}")

2020:0.3042900489902465
2021:0.30343115815837834
2022:0.3042139604741169
2023:0.30495989518687033


### 결정 나무를 이용한 하이퍼 파라미터 범위 설정

#### 실험 데이터 준비

In [None]:
import numpy as np  # needed for np.arange below

# Grid over learning_rate, num_leaves and boosting_type for the
# range-setting experiment.
# NOTE: np.arange with a float step excludes the stop value and may yield
# values that are not exact decimals; they are passed to LightGBM as-is.
grid = ParameterGrid({"learning_rate": np.arange(0.05, 0.3, 0.01),
                      "num_leaves": [2**2, 2**3, 2**4, 2**5, 2**6, 2**7, 2**8],
                      "boosting_type": ["gbdt", "goss"]})

# Score every combination by its mean cross-validated MAE; the scorer
# returns negated MAE, hence the sign flip.
score_list = []
for param in grid:
    score = (-cross_val_score(LGB(**param), X, y,
                              scoring="neg_mean_absolute_error")).mean()
    score_list.append(score)

# Pair each combination with its score, preview, and save the data as CSV.
grid_search_data = pd.DataFrame(grid)
grid_search_data['score'] = score_list
display(grid_search_data.head())
grid_search_data.to_csv("LightGBM_하이퍼파라미터범위설정_실험데이터.csv", index=False)