## Import

In [1]:
from sklearn.preprocessing import StandardScaler
from glob import glob

import tensorflow as tf
import pandas as pd
import numpy as np
import os

## Load data

* Data source: OpenML studies — https://www.openml.org/search?type=study

In [2]:
# Show the kernel's working directory; the relative data path used below is resolved against it.
os.getcwd()

'C:\\Users\\PC0\\Documents\\GitHub\\AutoFE\\ipynb'

In [3]:
# Folder holding the benchmark data files, relative to the kernel's working directory.
data_path = "../datasets/"
# glob() yields matches in arbitrary, OS-dependent order; sort them so that
# file_list[0] refers to the same file on every machine and every run.
file_list = sorted(glob(data_path + "*"))

In [4]:
# File name of the first data file, without its directory part.
# os.path.basename understands the platform's path separators, whereas
# splitting on "\\" only works for Windows-style paths.
file_name = os.path.basename(file_list[0])
file_name

'openml_586.csv'

In [6]:
# Load every file in the data folder and expose it as a notebook-level variable
# named after the file (e.g. "openml_586.csv" -> openml_586). Later cells refer
# to the frames by these names, so the globals() registration is kept.
for file_path in sorted(glob(data_path + "*")):
    # Portable replacement for splitting on "\\": strip the directory and the extension.
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    frame = pd.read_csv(file_path)
    # The last column of each file is used as the prediction target; give it a
    # uniform name so downstream code can select it as "target".
    frame = frame.rename(columns={frame.columns[-1]: "target"})
    globals()[file_name] = frame
    print(file_name)

openml_586
openml_589
openml_607
openml_616
openml_618
openml_620
openml_637
steel_plate
wine_quality_red
wine_quality_white


## Baseline performance

In [24]:
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import make_scorer, SCORERS
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from ngboost import NGBRegressor, NGBClassifier
from tqdm import tqdm_notebook, tqdm

In [25]:
# Baseline regressors with library-default hyper-parameters.
# A fixed seed is passed so that repeated runs of the notebook give the same
# scores; without it the stochastic estimators differ from run to run.
MODEL_SEED = 42

model_xgb_reg = XGBRegressor(random_state=MODEL_SEED)
model_lgbm_reg = LGBMRegressor(random_state=MODEL_SEED)
model_rf_reg = RandomForestRegressor(random_state=MODEL_SEED)
model_ngb_reg = NGBRegressor(random_state=MODEL_SEED)

In [26]:
def split_function(data, target_col="target", test_size=0.25, random_state=42):
    """Split a frame into train/test feature and target parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns plus the target column.
    target_col : str, default "target"
        Name of the column to predict.
    test_size : float, default 0.25
        Share of rows put into the test split (forwarded to train_test_split).
    random_state : int, default 42
        Seed forwarded to train_test_split so the split is reproducible.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test). The y parts are single-column
        DataFrames, not Series, exactly as selected from ``data``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``data``.
    """
    if target_col not in data.columns:
        # Without this check the target selection would silently be empty.
        raise KeyError(f"column {target_col!r} not found in data")

    is_target = data.columns.isin([target_col])
    data_x = data.loc[:, ~is_target]
    data_y = data.loc[:, is_target]

    X_train, X_test, y_train, y_test = train_test_split(
        data_x, data_y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [54]:
# Benchmarks evaluated below, identified by the variable names created while loading.
# wine_quality_red and wine_quality_white are left out of this run.
data_keys = [
    "openml_586",
    "openml_589",
    "openml_607",
    "openml_616",
    "openml_618",
    "openml_620",
    "openml_637",
]
# Look each frame up by its name so the keys and the frames cannot drift apart.
data_sets = [globals()[key] for key in data_keys]

In [55]:
def total_cv_results(model, data_keys, datasets, n_splits=5):
    """Print and return the mean cross-validation score of ``model`` per dataset.

    Each dataset is first split with ``split_function``; only the training part
    is cross-validated, the held-out test part is not touched here.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score`` (scored with its default scorer).
    data_keys : iterable of str
        Display names, paired positionally with ``datasets``.
    datasets : iterable of pandas.DataFrame
        Frames accepted by ``split_function``.
    n_splits : int, default 5
        Number of shuffled K-fold splits.

    Returns
    -------
    dict
        Mapping of dataset name to mean cross-validation score.
    """
    # The splitter does not depend on the dataset, so build it once.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    scores = {}
    for keys, data in zip(data_keys, datasets):
        X_train, _X_test, y_train, _y_test = split_function(data)
        # split_function returns y as a single-column frame; flatten it to 1-D so
        # estimators that expect a vector do not warn about a column-vector y.
        results = cross_val_score(model, X_train, np.ravel(y_train), cv=kfold)
        scores[keys] = np.mean(results)
        print(keys, scores[keys])
    return scores

In [56]:
# Baseline: cross-validated score of the default XGBoost regressor on every selected dataset.
total_cv_results(model_xgb_reg, data_keys, data_sets)

openml_586 0.8901435860726974
openml_589 0.8901550101733576
openml_607 0.8909622699507083
openml_616 0.8178318490655471
openml_618 0.8784853992316893
openml_620 0.8816393012336887
openml_637 0.800049487707651


### Make states

In [59]:
# List the names of the scorers registered in scikit-learn.
# NOTE(review): SCORERS appears to have been removed in newer scikit-learn releases
# in favour of sklearn.metrics.get_scorer_names() — confirm against the installed version.
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [58]:
scaler = StandardScaler()
# The splitter is the same for every dataset, so create it once.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for keys, data in zip(data_keys, data_sets):
    X_train, X_test, y_train, y_test = split_function(data)

    # Learn the scaling statistics (mean / std) from the training rows only ...
    train_x_sc = scaler.fit_transform(X_train)
    # ... and apply those same statistics to the held-out rows. Calling
    # fit_transform here would refit the scaler on the test data, so the two
    # splits would end up on different scales.
    test_x_sc = scaler.transform(X_test)

    # NOTE(review): scaling happens before cross-validation, so every fold sees
    # statistics computed on the whole training split. The recorded scores equal
    # the unscaled baseline, presumably because the tree-based model is
    # insensitive to feature scaling — confirm before swapping in another model.
    results = cross_val_score(model_xgb_reg, train_x_sc, y_train, cv=kfold)

    print(keys, np.mean(results))

openml_586 0.8901435860726974
openml_589 0.8901550101733576
openml_607 0.8909622699507083
openml_616 0.8178318490655471
openml_618 0.8784853992316893
openml_620 0.8816393012336887
openml_637 0.800049487707651


In [None]:
# NOTE(review): scaled_x, total_data_x and class_vector are not defined in the
# visible cells — they must exist in the kernel before this cell runs.
# Wrap the scaled values in a frame, label it with the source frame's column
# names, and append the class label as an extra column.
source_columns = total_data_x.columns
scaled_x_pd = pd.DataFrame(scaled_x)
scaled_x_pd.columns = source_columns
scaled_x_pd["class_vector"] = class_vector

In [None]:
# For every feature: average it within each class, then measure how far those
# class means spread apart (np.std over the per-class means).
class_groups = scaled_x_pd.groupby("class_vector")
value_columns = [c for c in scaled_x_pd.columns if c != "class_vector"]
std_vector = {c: np.std(class_groups[c].mean()) for c in value_columns}

In [None]:
# One row per feature: the dict keys become the "features" column and the
# spread of the class means becomes "values". Rows keep the order of
# std_vector; no sorting is applied here.
std_df = pd.DataFrame.from_dict(std_vector, orient="index")
std_df = std_df.reset_index()
std_df.columns = ["features", "values"]
# Plain 0..n-1 integer labels for the rows.
std_df.index = list(range(len(std_df)))

In [None]:
# Reshape std_df from "one row per feature" into a single row whose columns are
# the feature names.
feature_names = std_df.features
# Column labels for the transposed frame: a placeholder for the column produced
# by reset_index(), followed by the feature names.
change_col = ["features", *feature_names]
# Transposing puts "features" in row 0 and "values" in row 1; keep the values row only.
total_df = std_df.T.iloc[1:, :].reset_index()
total_df.columns = change_col
# Drop the placeholder column so only the per-feature values remain.
total_df = total_df.iloc[:, 1:]

In [None]:
## Standard deviation / mean of the original data (one value per column).
# Use the DataFrame's own column-wise reductions: np.std / np.mean applied to a
# whole DataFrame depend on how pandas interprets axis=None, which differs
# between pandas versions.
column_std = total_data_x.std(axis=0, ddof=0)  # ddof=0 matches np.std's default
column_mean = total_data_x.mean(axis=0)
state_2 = (column_std / column_mean).to_frame().T
# Remove the '최종수율' column from the state row
# (presumably the target / final-yield column — verify).
del state_2['최종수율']
state_2

In [None]:
from scipy.stats import pearsonr

In [None]:
# Start state_3 as an independent copy of state_2 (same columns, one row).
# DataFrame.copy(deep=True) is used because the `copy` module is not imported
# in the visible cells, so copy.deepcopy would fail on a fresh kernel.
state_3 = state_2.copy(deep=True)

In [None]:
# Overwrite every column of state_3 with the Pearson correlation between that
# feature and the target.
# NOTE(review): total_data_x / total_data_y are not defined in the visible cells.
target_values = np.array(total_data_y).squeeze()
for col in state_3.columns:
    feature_values = np.array(total_data_x[col]).squeeze()
    # pearsonr returns (correlation, p-value); only the correlation is kept.
    state_3[col] = pearsonr(feature_values, target_values)[0]

In [None]:
# Display the row of per-feature correlations computed above.
state_3