In [29]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split, cross_val_score

In [30]:
def load_config_files(directory_path):
    knob_list = glob.glob(os.path.join(directory_path, "my_*.cnf"))
    cnt = 0
    A_config = None

    for xx in range(len(knob_list)):
        path = os.path.join(directory_path, "my_{}.cnf".format(xx))
        a_all = pd.read_csv(path, sep="=", names=['Sample', 'value'], header=5)
        a_all = a_all.set_index("Sample").T
        cur_all_df = a_all

        if cnt == 0:
            A_config = cur_all_df
        else:
            A_config = pd.concat([A_config, cur_all_df], axis=0)
        cnt += 1

    A_config = A_config.reset_index(drop=True)
    
    # 전처리: 문자열이나 비정상적인 값을 NaN으로 처리하고 나중에 채움
    A_config.replace([np.inf, -np.inf], np.nan, inplace=True)
    A_config = A_config.apply(pd.to_numeric, errors='coerce')

    return A_config

In [31]:
def evaluate_model(model, X, y):
    scores = cross_val_score(model, X, y, cv=5) # 교차검증
    return np.mean(scores), np.std(scores)

# 모델의 평균 점수와, 표준편차를 반홤함 -> 모델의 성능 검증에 사용

In [32]:
def process_single_dataset(directory_path, external_metrics_path):
    # 설정 파일 로드
    configs_df = load_config_files(directory_path)

    # 외부 성능 메트릭 로드
    external_metrics = pd.read_csv(external_metrics_path, index_col=0)

    # 결측값 대체 (IterativeImputer 사용)
    imputer = IterativeImputer(random_state=42)
    configs_imputed = pd.DataFrame(imputer.fit_transform(configs_df), columns=configs_df.columns)
    external_metrics_imputed = pd.DataFrame(imputer.fit_transform(external_metrics), columns=external_metrics.columns)

    # 특징(X)와 타겟(y) 분리
    X = configs_imputed
    y = external_metrics_imputed[['tps', 'latency']]

    # 각 타겟에 대해 별도로 Lasso 모델 훈련 및 중요도 계산
    importance = pd.DataFrame()
    for target in y.columns:
        model = LassoCV(cv=5, random_state=42).fit(X, y[target])

        # 모델 평가
        mean_score, std_score = evaluate_model(model, X, y[target])
        print(f'Model performance for {target}: Mean score = {mean_score}, Std dev = {std_score}')

        target_importance = pd.DataFrame({
            'feature': X.columns,
            f'importance_{target}': np.abs(model.coef_)
        })

        if importance.empty:
            importance = target_importance
        else:
            importance = pd.merge(importance, target_importance, on='feature')

    # 각 타겟의 중요도를 합산하여 전체 중요도 계산
    importance['importance'] = importance[[f'importance_{target}' for target in y.columns]].mean(axis=1)
    
    return importance

In [33]:
# 모든 워크로드를 처리하여 결과를 취합
root_directory = '/Users/yoonji_kim/delab/tuning/mysql_dataset'

workload_paths = {
    'AA': {
        'config_path': f'{root_directory}/ycsb_AA/configs',
        'metrics_path': f'{root_directory}/ycsb_AA/results/external_metrics_AA.csv'
    },
    'BB': {
        'config_path': f'{root_directory}/ycsb_BB/configs',
        'metrics_path': f'{root_directory}/ycsb_BB/results/external_metrics_BB.csv'
    },
    'EE': {
        'config_path': f'{root_directory}/ycsb_EE/configs',
        'metrics_path': f'{root_directory}/ycsb_EE/results/external_metrics_EE.csv'
    },
    'FF': {
        'config_path': f'{root_directory}/ycsb_FF/configs',
        'metrics_path': f'{root_directory}/ycsb_FF/results/external_metrics_FF.csv'
    }
}

In [34]:
# 중요도를 합산할 데이터프레임 초기화
total_importance = pd.DataFrame()

# 각 워크로드에 대해 처리
for workload, paths in workload_paths.items():
    try:
        importance = process_single_dataset(paths['config_path'], paths['metrics_path'])
        
        if total_importance.empty:
            total_importance = importance
        else:
            total_importance = pd.merge(total_importance, importance, on='feature', how='outer')
            total_importance['importance'] = total_importance.filter(like='importance_').sum(axis=1)

    except Exception as e:
        print(f"Error processing workload {workload}: {e}")
        continue

Model performance for tps: Mean score = 0.29186278548145417, Std dev = 0.05075943650015996
Model performance for latency: Mean score = 0.2848290481575623, Std dev = 0.04391697999627951
Error processing workload BB: Input X contains infinity or a value too large for dtype('float64').
Error processing workload EE: Input X contains infinity or a value too large for dtype('float64').
Error processing workload FF: Input X contains infinity or a value too large for dtype('float64').


In [36]:
n = 30  # 상위 30개의 Knob 추출
final_top_n_knobs = total_importance.sort_values(by='importance', ascending=False).head(n)

# 결과 출력
print("Top 30 important knobs across all workloads:")
print(final_top_n_knobs)

# CSV 파일로 저장
output_file = '../csv/top_30_knobs.csv'
final_top_n_knobs.to_csv(output_file, index=False)

print(f"Top 30 knobs have been saved to {output_file}")

Top 30 important knobs across all workloads:
                               feature  importance_tps  importance_latency  \
119            range_alloc_block_size     2.924906e-05            0.112549   
113            query_alloc_block_size     1.330935e-06            0.008539   
126                  sort_buffer_size     6.367625e-07            0.006673   
46               innodb_log_file_size     2.651263e-09            0.000597   
114                 query_cache_limit     9.039792e-08            0.000431   
133                    tmp_table_size     7.332440e-08            0.000363   
85                    key_buffer_size     2.904449e-08            0.000359   
40       innodb_ft_result_cache_limit     8.748058e-09            0.000271   
116                  query_cache_size     9.950875e-09            0.000207   
100               max_heap_table_size     9.855458e-08            0.000205   
95              max_binlog_cache_size     5.996286e-08            0.000203   
97         max_binl