In [1]:
import pandas as pd
# Folder holding the regression CSV files that serve as meta-training sources.
data_path = "../../data/regression"
meta_file_list = [
    "abalone.csv",
    "autoMPG6.csv",
    "baseball.csv",
    "friedman.csv",
    "stock.csv",
    "wankara.csv",
]

# Read every file and split it into the feature columns and the 'y' column.
# The (X, y) pairs are stored in the same order as meta_file_list.
meta_data_list = []
for file in meta_file_list:
    df = pd.read_csv(f"{data_path}/{file}")
    X, y = df.drop(columns = 'y'), df['y']
    meta_data_list.append((X, y))

In [2]:
def extract_meta_features(X, y):
    """Summarise a dataset as a fixed-length list of meta-features.

    Parameters
    ----------
    X : object exposing ``.shape`` and ``.corr()`` (a pandas DataFrame in this notebook)
        Feature table. Its row/column counts and pairwise column correlations
        are used.
    y : object exposing ``.max()``, ``.min()``, ``.mean()``, ``.std()``
        (a pandas Series in this notebook)
        Label values.

    Returns
    -------
    list
        ``[num_samples, num_features, label_max, label_min, label_mean,
        label_std, corr_mean, corr_max, corr_min]`` in exactly that order.
    """
    num_samples, num_features = X.shape

    # Compute the absolute correlation matrix once and reuse it for all three
    # statistics instead of recomputing X.corr() for each of them.
    abs_corr = X.corr().abs().values

    # NOTE: the statistics run over the whole matrix, diagonal included, so
    # each column's self-correlation contributes to corr_mean and corr_max.
    meta_features = [num_samples,
                     num_features,
                     y.max(),
                     y.min(),
                     y.mean(),
                     y.std(),
                     abs_corr.mean(),
                     abs_corr.max(),
                     abs_corr.min()]

    return meta_features

In [3]:
# Column labels for the values returned by extract_meta_features, in the same order.
meta_col_names = [
    "num_samples",
    "num_features",
    "label_max",
    "label_min",
    "label_mean",
    "label_std",
    "corr_mean",
    "corr_max",
    "corr_min",
]

# One row of meta-features per loaded dataset, tagged with its source file name.
meta_data = pd.DataFrame(
    [extract_meta_features(X, y) for X, y in meta_data_list],
    columns = meta_col_names,
)
meta_data['data_name'] = meta_file_list

In [4]:
import numpy as np
def hyperparameter_sampling():
    """Draw one random hyperparameter set from the global NumPy RNG.

    Returns
    -------
    tuple
        ``(h1, h2, h3, h4, max_iter, random_state)``.
        ``h1`` comes from ``np.random.randint(5, 15)`` and ``h2``..``h4`` from
        ``np.random.randint(0, 10, 3)``. Once one of ``h2``/``h3`` is zero,
        every later width is forced to zero too. ``max_iter`` and
        ``random_state`` are picked from fixed candidate lists.
    """
    h1 = np.random.randint(5, 15)
    deeper_widths = np.random.randint(0, 10, 3)
    h2, h3, h4 = deeper_widths

    # A zero width propagates to every later width.
    if h2 == 0:
        h3 = 0
    if h3 == 0:
        h4 = 0

    iteration_options = [100, 200, 1000, 2000]
    seed_options = [2020, 2021, 2022]
    max_iter = np.random.choice(iteration_options)
    random_state = np.random.choice(seed_options)
    return h1, h2, h3, h4, max_iter, random_state

In [5]:
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor as MLP
import warnings
warnings.filterwarnings("ignore")

# Random search: for every dataset, score 1000 sampled configurations with
# 5-fold cross-validation and keep one record per configuration.
experiment_data = []
for data_name, (X, y) in zip(meta_file_list, meta_data_list):
    for _ in range(1000):
        h1, h2, h3, h4, max_iter, random_state = hyperparameter_sampling()

        # Hidden layers are h1 followed by h2, h3, h4 up to (but excluding)
        # the first zero width.
        layers = [h1]
        for width in (h2, h3, h4):
            if width == 0:
                break
            layers.append(width)
        layers = tuple(layers)

        model = MLP(hidden_layer_sizes = layers,
                    max_iter = max_iter,
                    random_state = random_state)

        # The scorer reports negated MAE, so the sign is flipped back.
        fold_scores = -cross_val_score(model, X, y, cv = 5,
                                       scoring = "neg_mean_absolute_error")
        experiment_data.append(
            [data_name, h1, h2, h3, h4, max_iter, random_state, fold_scores.mean()]
        )

hyper_param_cols = ["h1", "h2", "h3", "h4", "max_iter", "random_state"]
experiment_data = pd.DataFrame(
    experiment_data,
    columns = ["data_name", *hyper_param_cols, "score"],
)

In [6]:
# Attach every experiment record to its dataset's meta-features (join key: data_name).
# NOTE: meta_data is rebound to the merged table here, so this cell is meant to
# run once per kernel session.
meta_data = meta_data.merge(experiment_data, on = "data_name")

# Persist the combined table, then preview its first rows.
meta_data.to_csv("../../data/메타데이터_예제.csv", index = False)
display(meta_data.head())

Unnamed: 0,num_samples,num_features,label_max,label_min,label_mean,label_std,corr_mean,corr_max,corr_min,data_name,h1,h2,h3,h4,max_iter,random_state,score
0,4176,8,29.0,1.0,9.931034,3.220003,0.807719,1.0,0.418048,abalone.csv,11,4,9,2,200,2022,1.522659
1,4176,8,29.0,1.0,9.931034,3.220003,0.807719,1.0,0.418048,abalone.csv,9,8,3,8,2000,2020,1.537957
2,4176,8,29.0,1.0,9.931034,3.220003,0.807719,1.0,0.418048,abalone.csv,13,5,3,1,100,2020,1.765142
3,4176,8,29.0,1.0,9.931034,3.220003,0.807719,1.0,0.418048,abalone.csv,12,3,2,9,2000,2022,2.360753
4,4176,8,29.0,1.0,9.931034,3.220003,0.807719,1.0,0.418048,abalone.csv,7,2,4,2,100,2021,1.692493


#### 메타 모델 학습

In [36]:
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.preprocessing import MinMaxScaler
# Build the meta-model inputs: dataset meta-features plus hyperparameters.
# 'data_name' is a string identifier and 'score' is the regression target,
# so neither may appear among the features.
meta_X = meta_data.drop(['data_name', 'score'], axis = 1)

# Encode random_state with two indicator columns; a row with both indicators
# at 0 is the remaining seed value.
meta_X['random_state_2020'] = (meta_X['random_state'] == 2020).astype(int)
meta_X['random_state_2021'] = (meta_X['random_state'] == 2021).astype(int)

# Drop the raw seed from meta_X itself. Dropping from meta_data instead would
# discard the indicator columns and bring 'data_name' and 'score' back in.
meta_X = meta_X.drop(['random_state'], axis = 1)
meta_y = meta_data['score']

# Scale every feature to [0, 1] before fitting the KNN regressor.
scaler = MinMaxScaler().fit(meta_X)
meta_X = scaler.transform(meta_X)
meta_model = KNN().fit(meta_X, meta_y)

#### 메타 모델 활용

In [37]:
# Dataset for which hyperparameter candidates are ranked by the meta-model.
df = pd.read_csv(data_path + "/autoMPG8.csv")
X = df.drop('y', axis = 1)
y = df['y']
meta_features = extract_meta_features(X, y)

# 100 random candidates, each row being this dataset's meta-features followed
# by the six sampled hyperparameters.
candidate_rows = []
for _ in range(100):
    h1, h2, h3, h4, max_iter, random_state = hyperparameter_sampling()
    candidate_rows.append(meta_features + [h1, h2, h3, h4, max_iter, random_state])

sample_list = pd.DataFrame(candidate_rows,
                           columns = meta_col_names + hyper_param_cols)

# Build the model input in a separate frame: the raw random_state column is
# replaced by its two indicator columns. sample_list itself is left untouched
# (and is no longer overwritten with the training table meta_data), because
# later cells read the hyperparameters from its last six columns.
sample_X = sample_list.drop(['random_state'], axis = 1)
sample_X['random_state_2020'] = (sample_list['random_state'] == 2020).astype(int)
sample_X['random_state_2021'] = (sample_list['random_state'] == 2021).astype(int)
y_pred = meta_model.predict(scaler.transform(sample_X))

In [42]:
# Keep the three candidates with the lowest predicted score.
# np.argsort returns positions, so rows are selected positionally with .iloc
# rather than by index label with .loc.
init_sample = sample_list.iloc[np.argsort(y_pred)[:3]]
display(init_sample)

Unnamed: 0,num_samples,num_features,label_max,label_min,label_mean,label_std,corr_mean,corr_max,corr_min,h1,h2,h3,h4,max_iter,random_state
52,392,7,46.6,9.0,23.445918,7.805007,0.628158,1.0,0.181528,9,8,7,9,1000,2021
84,392,7,46.6,9.0,23.445918,7.805007,0.628158,1.0,0.181528,9,9,5,8,1000,2021
21,392,7,46.6,9.0,23.445918,7.805007,0.628158,1.0,0.181528,6,6,2,7,2000,2021


In [39]:
# Measure the real cross-validated MAE of every candidate so it can be
# compared with the meta-model's prediction.
y_actual = []
for sample in sample_list.values:
    # Assumes the last six columns of sample_list are
    # h1, h2, h3, h4, max_iter, random_state, in that order.
    h1, h2, h3, h4, max_iter, random_state = sample[-6:].astype(int)

    # Hidden layers are h1 followed by h2, h3, h4 up to (but excluding)
    # the first zero width.
    layers = [h1]
    for width in (h2, h3, h4):
        if width == 0:
            break
        layers.append(width)
    layers = tuple(layers)

    model = MLP(hidden_layer_sizes = layers,
                max_iter = max_iter,
                random_state = random_state)

    # The scorer reports negated MAE, so the sign is flipped back.
    fold_scores = -cross_val_score(model, X, y, cv = 5,
                                   scoring = "neg_mean_absolute_error")
    y_actual.append(fold_scores.mean())

In [40]:
from scipy.stats import spearmanr, rankdata
# Rank agreement between the measured and the predicted scores.
rank_agreement = spearmanr(y_actual, y_pred)
print(rank_agreement)

# Actual ranks (1 = lowest measured score) of the three candidates the
# meta-model predicted to be best.
predicted_top3 = np.argsort(y_pred)[:3]
print(rankdata(y_actual)[predicted_top3])

SpearmanrResult(correlation=0.6190514766137369, pvalue=6.652410770978535e-12)
[13. 26.  7.]
