## Import

In [1]:
from sklearn.preprocessing import StandardScaler
from glob import glob

import tensorflow as tf
import pandas as pd
import numpy as np
import os, warnings

warnings.filterwarnings(action='ignore')

## Load data

* url: https://www.openml.org/search?type=study

In [2]:
# Show the current working directory; the dataset path used below is relative to it.
os.getcwd()

'C:\\Users\\PC0\\Documents\\GitHub\\AutoFE\\ipynb'

In [3]:
# Dataset directory, expressed relative to the working directory.
data_path = "../datasets/"
# Every entry found directly inside the dataset directory.
file_list = glob("{}*".format(data_path))

In [4]:
# File-name component of the first matched path.
# os.path.basename handles the platform's path separators; splitting on "\\"
# only works for Windows-style paths and raises IndexError otherwise.
file_name = os.path.basename(file_list[0])
file_name

'openml_586.csv'

In [5]:
# Load every file under data_path into a module-level DataFrame named after the
# file (e.g. "openml_586.csv" -> openml_586) and rename its label column to "target".
for file_path in glob(data_path + "*"):
    # os.path.basename is separator-agnostic; splitting on "\\" only works on Windows paths.
    file_name = os.path.basename(file_path).split(".csv")[0]
    dataset = pd.read_csv(file_path)

    if "rmftsa_ladata" in file_name:
        # This dataset's label is a named column.
        label_column = "Respiratory_Mortality"
    else:
        # For every other dataset the last column is taken as the label.
        label_column = dataset.columns[-1]

    # Later cells refer to each dataset by its bare name, so publish it as a global.
    globals()[file_name] = dataset.rename(columns={label_column: "target"})
    print(file_name)

openml_586
openml_589
openml_607
openml_616
openml_618
openml_620
openml_637
rmftsa_ladata
steel_plate
wine_quality_red
wine_quality_white


## Baseline performance

In [6]:
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import make_scorer, SCORERS, classification_report
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from ngboost import NGBRegressor, NGBClassifier
from tqdm import tqdm_notebook, tqdm

In [7]:
# Regression baselines, all with library-default hyperparameters.
model_xgb_reg = XGBRegressor()
model_lgbm_reg = LGBMRegressor()
model_rf_reg = RandomForestRegressor()
model_ngb_reg = NGBRegressor()
model_ridge = Ridge()

# Classification baselines.
# "mlogloss" is the name of an evaluation metric, not a learning objective;
# the multiclass probability objective is "multi:softprob".
model_xgb_clf = XGBClassifier(objective="multi:softprob")
model_lgbm_clf = LGBMClassifier()
model_rf_clf = RandomForestClassifier()

In [8]:
# Preview the first rows of one of the loaded datasets.
wine_quality_white.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,target
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,4
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,4
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,4
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,4
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,4


* train/test split

In [9]:
from sklearn.preprocessing import minmax_scale, StandardScaler
from sklearn.metrics import f1_score, accuracy_score

In [10]:
def split_function(data, target_col="target", test_size=0.25, random_state=42):
    """Split a dataset into train/test features and labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns together with the label column.
    target_col : str, default "target"
        Name of the label column; every other column is treated as a feature.
    test_size : float, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The label parts are
        single-column DataFrames (2-D), not Series.
    """
    # Boolean mask over the columns keeps the label part 2-D, as before.
    is_target = data.columns.isin([target_col])
    data_x = data.loc[:, ~is_target]
    data_y = data.loc[:, is_target]

    X_train, X_test, y_train, y_test = train_test_split(
        data_x, data_y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [11]:
# Re-base the labels so the smallest class id becomes 0.
# Subtracting the current minimum instead of a fixed 1 keeps this cell safe to
# re-run: once the labels start at 0, running it again subtracts 0.
wine_quality_white["target"] -= wine_quality_white["target"].min()

In [12]:
# Hold out a test split of the white-wine data (split settings live in split_function).
X_train, X_test, y_train, y_test = split_function(wine_quality_white)

In [13]:
# Learn the standardisation statistics from the training split only,
# then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

In [14]:
# Shuffled 5-fold cross-validation of the LightGBM classifier on the scaled training split.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
results = cross_val_score(
    estimator=model_lgbm_clf,
    X=X_train_sc,
    y=y_train,
    cv=kfold,
)

In [15]:
# Report the mean score across the cross-validation folds.
print("cv_results:", np.mean(results))

cv_results: 0.6411625794732061


In [16]:
# Fit the baseline classifier on the scaled training split and predict the held-out split.
model_lgbm_clf.fit(X_train_sc, y_train)
pred_lgbm = model_lgbm_clf.predict(X_test_sc)

In [17]:
# Per-class precision/recall/F1 of the baseline on the held-out split.
print("test_results \n", classification_report(y_test, pred_lgbm))

test_results 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.44      0.22      0.29        32
           2       0.66      0.65      0.65       358
           3       0.64      0.74      0.68       544
           4       0.70      0.59      0.64       241
           5       0.79      0.35      0.48        43
           6       0.00      0.00      0.00         0

    accuracy                           0.65      1225
   macro avg       0.46      0.36      0.39      1225
weighted avg       0.65      0.65      0.64      1225



### AutoFE

In [121]:
# One-hot encode the labels into 7 columns by picking rows of an identity matrix
# with the integer class ids.
train_label_ids = np.squeeze(np.array(y_train))
test_label_ids = np.squeeze(np.array(y_test))
y_train_one = np.eye(7)[train_label_ids]
y_test_one = np.eye(7)[test_label_ids]

In [122]:
# Feature-generating network: maps the scaled inputs to a single sigmoid output,
# which the training loop appends to the data as an extra feature column.
# `shape` must be a tuple; `(n)` without the trailing comma is just the integer n.
input_layer = tf.keras.Input(shape=(X_train_sc.shape[1],))
hidden_x = tf.keras.layers.Dense(32, activation = "selu")(input_layer)
hidden_x = tf.keras.layers.BatchNormalization()(hidden_x)
hidden_x = tf.keras.layers.Dense(16, activation = "selu")(hidden_x)
hidden_x = tf.keras.layers.BatchNormalization()(hidden_x)
feature_x = tf.keras.layers.Dense(1, activation = "sigmoid")(hidden_x)

model_fe = tf.keras.Model(inputs = input_layer, outputs = feature_x)
model_fe.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 11)]              0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                384       
_________________________________________________________________
batch_normalization (BatchNo (None, 32)                128       
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
batch_normalization_1 (Batch (None, 16)                64        
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 1,121
Trainable params: 1,025
Non-trainable params: 96
________________________________________________________

In [123]:
# Smoke test: run the untrained network on the first two training rows.
model_fe(X_train_sc[:2])

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[0.7327358 ],
       [0.42176852]], dtype=float32)>

In [124]:
# Adam with its default settings; the training loop uses it to update model_fe.
optimizer = tf.keras.optimizers.Adam()

In [125]:
# Upper bound on the number of training episodes (the training loops iterate range(num_epi)).
num_epi = 500000

In [126]:
# Per-episode history: mean reward, mean/std of the generated feature,
# and accuracy / macro-F1. Five independent empty lists.
reward_set, mean_set, std_set, acc_set, f1_set = ([] for _ in range(5))

In [127]:
# Shuffled tf.data pipelines over (features, labels); the shuffle buffer covers the whole split.
# Training split is served in batches of 32.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train_sc, np.array(y_train)))
    .shuffle(buffer_size=X_train_sc.shape[0])
    .batch(32)
)
# Test split is served one sample at a time.
test_ds = (
    tf.data.Dataset.from_tensor_slices((X_test_sc, np.array(y_test)))
    .shuffle(buffer_size=X_test_sc.shape[0])
    .batch(1)
)

In [128]:
# Each episode (1) scores LightGBM on the data augmented with the generated feature and
# (2) updates model_fe one test sample at a time with a reward-weighted log loss.
# NOTE(review): the rewards are derived from the test labels and acc/f1 are reported on the
# same test split, so the printed scores are optimistic — consider drawing the reward
# signal from a separate validation split.
for episode in range(num_epi):
    # Generate the extra feature for both splits with the current network.
    features = model_fe.predict(X_train_sc)
    features_test = model_fe.predict(X_test_sc)

    data = np.hstack([X_train_sc, features])
    data_test = np.hstack([X_test_sc, features_test])

    # Fit the downstream classifier on the augmented training data and score it.
    model_lgbm_clf.fit(data, np.array(y_train))
    predict_test = model_lgbm_clf.predict(data_test)

    acc = accuracy_score(y_test, predict_test)
    f1 = f1_score(y_test, predict_test, average = "macro")

    acc_set.append(acc)
    f1_set.append(f1)

    reward_set_epoch = []
    probs_set_epoch = []

    for idx, (x, y) in enumerate(test_ds):
        # Refit the classifier every 128 updates so the rewards follow the changing feature.
        # idx == 0 is skipped: model_fe has not been updated since the fit above, so the
        # refit would be trained on identical inputs.
        if idx > 0 and idx % 128 == 0:
            features = model_fe.predict(X_train_sc)
            data = np.hstack([X_train_sc, features])
            model_lgbm_clf.fit(data, np.array(y_train))

        with tf.GradientTape() as tape:
            # Derive the action: the network output for this sample.
            fe_probs = model_fe(x)

            # Append the action to the sample as an extra feature column.
            # .numpy() detaches it, so no gradient flows through the classifier.
            data_val = np.hstack([x, fe_probs.numpy()])

            # Reward: classifier probability in column y_idx, shifted by 0.5.
            # NOTE(review): using the label as a column index assumes the classifier's
            # classes are exactly 0..K-1 — confirm.
            lgbm_predict_probs = model_lgbm_clf.predict_proba(data_val)
            y_idx = y[0][0].numpy()
            rewards = lgbm_predict_probs[0, y_idx] - 0.5
            rewards = tf.constant(np.array(rewards), dtype = tf.float32)

            # Loss: negative log of the network output, weighted by the reward.
            cross_entropy_actions = tf.math.log(fe_probs + 1e-5)
            fe_loss_set = - cross_entropy_actions * rewards

        # Update the model in the direction that reduces the loss. The forward pass is
        # complete, so the gradient step runs after leaving the tape context.
        grads = tape.gradient(fe_loss_set, model_fe.trainable_variables)
        optimizer.apply_gradients(zip(grads, model_fe.trainable_variables))

        reward_set_epoch.append(np.round(rewards.numpy(), 5))
        probs_set_epoch.append(np.round(fe_probs.numpy()[0][0], 5))

    # Episode summaries, computed once and reused for both the history and the log line.
    mean_reward = np.mean(reward_set_epoch)
    mean_prob = np.mean(probs_set_epoch)
    std_prob = np.std(probs_set_epoch)

    reward_set.append(mean_reward)
    mean_set.append(mean_prob)
    std_set.append(std_prob)

    print("episode:", episode,
          "rewards:", mean_reward,
          "probs_mean:", mean_prob,
          "probs_std:", std_prob,
          "acc:", acc,
          "f1:", f1)

episode: 0 rewards: 0.06426617 probs_mean: 0.6915288 probs_std: 0.2969272 acc: 0.6587755102040816 f1: 0.4752877576417515
episode: 1 rewards: 0.07057245 probs_mean: 0.8043469 probs_std: 0.2872442 acc: 0.6628571428571428 f1: 0.467696040283717
episode: 2 rewards: 0.074771866 probs_mean: 0.72074425 probs_std: 0.3873794 acc: 0.6710204081632654 f1: 0.478186472864256
episode: 3 rewards: 0.07632658 probs_mean: 0.76692444 probs_std: 0.3774135 acc: 0.673469387755102 f1: 0.4782222710281217
episode: 4 rewards: 0.07730506 probs_mean: 0.7683305 probs_std: 0.3891697 acc: 0.6661224489795918 f1: 0.47836390525369316
episode: 5 rewards: 0.07528742 probs_mean: 0.7691711 probs_std: 0.392894 acc: 0.6742857142857143 f1: 0.4807328546366349
episode: 6 rewards: 0.075325415 probs_mean: 0.77858406 probs_std: 0.39431837 acc: 0.6824489795918367 f1: 0.416696554867598
episode: 7 rewards: 0.075950354 probs_mean: 0.7566542 probs_std: 0.41174415 acc: 0.6636734693877551 f1: 0.3993135715424666
episode: 8 rewards: 0.072215

KeyboardInterrupt: 

* past: earlier full-batch version of the training loop (one gradient update per episode)

In [None]:
# Start from empty per-episode histories (reward, feature mean/std, accuracy, macro-F1).
reward_set, mean_set, std_set, acc_set, f1_set = [], [], [], [], []

In [25]:
# Full-batch variant: one classifier refit and one gradient step per episode,
# with the whole test split treated as a single batch.
for episode in range(num_epi):
    # Refit LightGBM on the training data augmented with the current generated feature.
    features = model_fe.predict(X_train_sc)
    data = np.hstack([X_train_sc, features])
    model_lgbm_clf.fit(data, np.array(y_train))

    with tf.GradientTape() as tape:
        # Derive the action
        fe_probs = model_fe(X_test_sc)

        # Add the derived action to the data as a feature column
        data_val = np.hstack([X_test_sc, fe_probs.numpy()])

        # Score with the ML model and derive the rewards:
        # probability in each row's label column, shifted by 0.5
        lgbm_predict_probs = model_lgbm_clf.predict_proba(data_val)
        label_ids = np.array(y_test)[:, 0]
        row_ids = np.arange(label_ids.shape[0])
        rewards = lgbm_predict_probs[row_ids, label_ids] - 0.5
        rewards = tf.constant(rewards[:, np.newaxis], dtype = tf.float32)

        # Reward-weighted negative log of the network output
        cross_entropy_actions = tf.math.log(fe_probs + 1e-5)
        fe_loss_set = - (cross_entropy_actions * rewards)

    # Update the model in the direction that reduces the loss
    grads = tape.gradient(fe_loss_set, model_fe.trainable_variables)
    optimizer.apply_gradients(zip(grads, model_fe.trainable_variables))

    predict_val = np.argmax(lgbm_predict_probs, axis = 1)

    # Episode summaries
    m_reward = np.mean(rewards)
    m_probs = np.mean(fe_probs.numpy())
    std_probs = np.std(fe_probs.numpy())
    acc = accuracy_score(y_test, predict_val)
    f1 = f1_score(y_test, predict_val, average ="macro")

    for history, value in (
        (reward_set, m_reward),
        (mean_set, m_probs),
        (std_set, std_probs),
        (acc_set, acc),
        (f1_set, f1),
    ):
        history.append(np.round(value, 5))

    print(episode, m_reward, m_probs, np.round(std_probs, 5), np.round(acc, 5), np.round(f1,5))

0 0.074824415 0.51126164 0.22318 0.66204 0.51009
1 0.0766955 0.5138964 0.21813 0.65224 0.45769
2 0.073644154 0.5165759 0.21325 0.65878 0.46324
3 0.07871637 0.51927423 0.20889 0.65878 0.48279
4 0.080483206 0.52203816 0.20486 0.66694 0.43371


KeyboardInterrupt: 

In [None]:
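# 1. Estimate the new feature for the whole dataset at once
# 2. Measure the reward with 5-fold CV -> the reward must differ for each transition / since this is classification, it can be measured by how much closer the predicted probability gets to the correct answer
# 3. Measure the target according to the action -> value based approach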