In [5]:
import csv
import re
import os

log_dir = "./log_data"
csv_output_path = "./h3_summary.csv"

filename_pattern = r"t(\d+)_hot_(\w+)_cold_(\w+)\.log"

# 기존 stat key (단순 COUNT만 추출)
target_keys = [
    # WAF
    'rocksdb.flush.write.bytes',
    'rocksdb.compact.write.bytes',
    'rocksdb.bytes.written',

    # RAF
    'rocksdb.number.keys.read', # read count
    'rocksdb.bytes.read',       # read bytes
    

    # Cache Hit/Miss
    'rocksdb.block.cache.hit',
    'rocksdb.block.cache.miss',
    "rocksdb.memtable.hit",
    "rocksdb.memtable.miss",

    # Latency
    'rocksdb.db.get.micros',
    'rocksdb.db.write.micros',
    'rocksdb.db.seek.micros',    
]

# 헤더 구성
latency_fields = [
    'get.P50', 'get.P95', 'get.P99', 'get.P100', 'get.COUNT', 'get.SUM', 'get.AVG',
    'write.P50', 'write.P95', 'write.P99', 'write.P100', 'write.COUNT', 'write.SUM', 'write.AVG',
]
header = [
    'trial', 'hot_compaction', 'cold_compaction',
    'time(s)', 'hot_column_key', 'default_column_key'
] + target_keys + latency_fields

rows = []

for log_file_name in os.listdir(log_dir):
    if not log_file_name.endswith('.log'):
        continue

    match = re.match(filename_pattern, log_file_name)
    if not match:
        print(f"Filename {log_file_name} does not match the expected pattern.")
        continue

    trial = match.group(1)
    hot_compaction = match.group(2)
    cold_compaction = match.group(3)

    stats_dict = {key: 0 for key in target_keys}
    latency_dict = {key: 0 for key in latency_fields}
    time_sec = 0
    hot_col_keys = 0
    default_col_keys = 0

    log_file_path = os.path.join(log_dir, log_file_name)

    with open(log_file_path, "r") as f:
        for line in f:
            line = line.strip()

            time_match = re.match(r"총 소요시간: (\d+(?:\.\d+)?)초", line)
            if time_match:
                time_sec = float(time_match.group(1))
                continue

            hot_match = re.match(r"hot 컬럼에 저장된 키 수: (\d+)", line)
            if hot_match:
                hot_col_keys = int(hot_match.group(1))
                continue

            default_match = re.match(r"default 컬럼에 저장된 키 수: (\d+)", line)
            if default_match:
                default_col_keys = int(default_match.group(1))
                continue

            # 일반 stat 추출
            stat_match = re.match(r"(rocksdb\.[\w\.]+)\s+COUNT\s*:\s*(\d+)", line)
            if stat_match:
                key = stat_match.group(1)
                value = int(stat_match.group(2))
                if key in stats_dict:
                    stats_dict[key] = value

            # Latency: get
            get_match = re.match(r"rocksdb\.db\.get\.micros\s+P50\s*:\s*([\d\.]+)\s+P95\s*:\s*([\d\.]+)\s+P99\s*:\s*([\d\.]+)\s+P100\s*:\s*([\d\.]+)\s+COUNT\s*:\s*(\d+)\s+SUM\s*:\s*(\d+)", line)
            if get_match:
                p50, p95, p99, p100, count, total = map(float, get_match.groups())
                latency_dict.update({
                    'get.P50': p50,
                    'get.P95': p95,
                    'get.P99': p99,
                    'get.P100': p100,
                    'get.COUNT': int(count),
                    'get.SUM': int(total),
                    'get.AVG': total / count if count > 0 else 0
                })

            # Latency: write
            write_match = re.match(r"rocksdb\.db\.write\.micros\s+P50\s*:\s*([\d\.]+)\s+P95\s*:\s*([\d\.]+)\s+P99\s*:\s*([\d\.]+)\s+P100\s*:\s*([\d\.]+)\s+COUNT\s*:\s*(\d+)\s+SUM\s*:\s*(\d+)", line)
            if write_match:
                p50, p95, p99, p100, count, total = map(float, write_match.groups())
                latency_dict.update({
                    'write.P50': p50,
                    'write.P95': p95,
                    'write.P99': p99,
                    'write.P100': p100,
                    'write.COUNT': int(count),
                    'write.SUM': int(total),
                    'write.AVG': total / count if count > 0 else 0
                })

    row = [
        trial, work, hot_compaction, cold_compaction,
        time_sec, hot_col_keys, default_col_keys
    ] + [stats_dict[key] for key in target_keys] + [latency_dict[key] for key in latency_fields]
    rows.append(row)

with open(csv_output_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(rows)

print(f"CSV summary saved to {csv_output_path}")

CSV summary saved to ./h3_summary.csv


In [6]:
import pandas as pd

# CSV 파일 읽기
df = pd.read_csv("./h3_summary.csv")

df

Unnamed: 0,trial,hot_compaction,cold_compaction,time(s),hot_column_key,default_column_key,rocksdb.flush.write.bytes,rocksdb.compact.write.bytes,rocksdb.bytes.written,rocksdb.number.keys.read,...,get.COUNT,get.SUM,get.AVG,write.P50,write.P95,write.P99,write.P100,write.COUNT,write.SUM,write.AVG
1,universal,universal,universal,215.646,88822,543645,811590816,3768128779,16407028699,1911178,...,1911178,13803129,7.222315,8.622239,14.985785,44.959654,3166763.0,1000000,213881233,213.881233
2,universal,universal,level,309.711,88818,543256,811597848,7295480689,16407029052,1911182,...,1911182,13627992,7.130662,8.507494,14.442705,38.848749,3797310.0,1000000,307948584,307.948584
2,universal,universal,universal,252.733,88592,543566,811652440,3886721728,16407028784,1911408,...,1911408,13552759,7.090458,8.421768,14.199999,37.564776,3734166.0,1000000,251054057,251.054057
2,universal,level,level,267.389,88948,543385,811595301,7041300012,16407028895,1911052,...,1911052,13818065,7.230606,8.414508,14.064281,33.004418,3503366.0,1000000,265712666,265.712666
2,universal,level,universal,273.964,89077,543107,811540036,3873078173,16407028477,1910923,...,1910923,13387882,7.005977,8.489377,14.323994,32.665415,3458730.0,1000000,272312499,272.312499
3,universal,level,level,303.68,88748,543769,811622172,7209420559,16407028746,1911252,...,1911252,13111027,6.859915,8.455859,14.541652,35.754415,3650556.0,1000000,302098691,302.098691
1,universal,level,universal,435.653,88794,543040,811629101,4180097864,16407029090,1911206,...,1911206,13903951,7.274962,8.750211,16.880403,46.817891,7401714.0,1000000,433860850,433.86085
3,universal,universal,universal,196.579,88893,543244,811621772,3779265860,16407028558,1911107,...,1911107,13628706,7.131315,8.425454,14.077325,32.014338,2135283.0,1000000,194934750,194.93475
3,universal,level,universal,254.691,88746,543708,811725984,3938471118,16407028018,1911254,...,1911254,13753392,7.196004,8.430955,14.073417,31.423313,3721982.0,1000000,253108289,253.108289
1,universal,universal,level,334.687,89125,542997,808394069,6990263479,16407029204,1910875,...,1910875,13624950,7.130215,8.466371,14.63971,41.330087,4920635.0,1000000,332933497,332.933497


In [7]:
import pandas as pd

# case 분류 함수
def classify_case(row):
    hot = row.get('hot_compaction', '').lower()
    cold = row.get('cold_compaction', '').lower()

    if hot == 'level' and cold == 'level':
        return 0
    elif hot == 'universal' and cold == 'universal':
        return 1
    elif hot == 'level' and cold == 'universal':
        return 2
    elif hot == 'universal' and cold == 'level':
        return 3
    else:
        return -1  # 알 수 없는 조합 (예외 처리용)

df_waf = df.copy()

# WAF 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['WAF'] = df.apply(
    lambda row: (row.get('rocksdb.flush.write.bytes', 0) + row.get('rocksdb.compact.write.bytes', 0)) / row['rocksdb.bytes.written']
    if row['rocksdb.bytes.written'] != 0 else 0,
    axis=1
)

# RAF 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['RAF'] = df.apply(
    lambda row: (
        (row.get('rocksdb.number.keys.read', 0) * 16 * 1024) / row['rocksdb.bytes.read']
        if row['rocksdb.bytes.read'] != 0 else 0
    ),
    axis=1
)


# Cache Hit Ratio 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['cache_hit_ratio'] = df.apply(
    lambda row: row.get('rocksdb.block.cache.hit', 0) / (row.get('rocksdb.block.cache.hit', 0) + row.get('rocksdb.block.cache.miss', 0))
    if (row.get('rocksdb.block.cache.hit', 0) + row.get('rocksdb.block.cache.miss', 0)) != 0 else 0,
    axis=1
)

# Throughput 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['throughput'] = df.apply(
    lambda row: row.get('rocksdb.bytes.written', 0) / row['time(s)']
    if row['time(s)'] != 0 else 0,
    axis=1
)

# case 분류
df_waf['case'] = df.apply(classify_case, axis=1)

# NaN 값이 있을 경우 최종적으로 0으로 채움
df_waf.fillna(0, inplace=True)

# 저장
df_waf.to_csv('modified_h3_summary.csv', index=False)

In [8]:
df_waf

Unnamed: 0,trial,hot_compaction,cold_compaction,time(s),hot_column_key,default_column_key,rocksdb.flush.write.bytes,rocksdb.compact.write.bytes,rocksdb.bytes.written,rocksdb.number.keys.read,...,write.P99,write.P100,write.COUNT,write.SUM,write.AVG,WAF,RAF,cache_hit_ratio,throughput,case
1,universal,universal,universal,215.646,88822,543645,811590816,3768128779,16407028699,1911178,...,44.959654,3166763.0,1000000,213881233,213.881233,0.279132,3.021783,0.553418,76083160.0,1
2,universal,universal,level,309.711,88818,543256,811597848,7295480689,16407029052,1911182,...,38.848749,3797310.0,1000000,307948584,307.948584,0.494122,3.023668,0.441664,52975290.0,3
2,universal,universal,universal,252.733,88592,543566,811652440,3886721728,16407028784,1911408,...,37.564776,3734166.0,1000000,251054057,251.054057,0.286363,3.023624,0.552143,64918430.0,1
2,universal,level,level,267.389,88948,543385,811595301,7041300012,16407028895,1911052,...,33.004418,3503366.0,1000000,265712666,265.712666,0.47863,3.022224,0.454006,61360150.0,0
2,universal,level,universal,273.964,89077,543107,811540036,3873078173,16407028477,1910923,...,32.665415,3458730.0,1000000,272312499,272.312499,0.285525,3.022732,0.552765,59887530.0,2
3,universal,level,level,303.68,88748,543769,811622172,7209420559,16407028746,1911252,...,35.754415,3650556.0,1000000,302098691,302.098691,0.488878,3.021661,0.428351,54027360.0,0
1,universal,level,universal,435.653,88794,543040,811629101,4180097864,16407029090,1911206,...,46.817891,7401714.0,1000000,433860850,433.86085,0.304243,3.024855,0.531558,37660770.0,2
3,universal,universal,universal,196.579,88893,543244,811621772,3779265860,16407028558,1911107,...,32.014338,2135283.0,1000000,194934750,194.93475,0.279812,3.023248,0.553239,83462770.0,1
3,universal,level,universal,254.691,88746,543708,811725984,3938471118,16407028018,1911254,...,31.423313,3721982.0,1000000,253108289,253.108289,0.289522,3.021965,0.542208,64419350.0,2
1,universal,universal,level,334.687,89125,542997,808394069,6990263479,16407029204,1910875,...,41.330087,4920635.0,1000000,332933497,332.933497,0.475324,3.022953,0.44917,49022010.0,3
