In [36]:
import csv
import re
import os

log_dir = "./log_data_2"
csv_output_path = "./h3_summary_2.csv"

filename_pattern = r"(\d+)_(\w+)_hot_(\w+)_cold_(\w+)\.log"

# 기존 stat key (단순 COUNT만 추출)
target_keys = [
    # WAF
    'rocksdb.flush.write.bytes',
    'rocksdb.compact.write.bytes',
    'rocksdb.bytes.written',

    # RAF
    'rocksdb.number.keys.read', # read count
    'rocksdb.bytes.read',       # read bytes
    

    # Cache Hit/Miss
    'rocksdb.block.cache.hit',
    'rocksdb.block.cache.miss',
    "rocksdb.memtable.hit",
    "rocksdb.memtable.miss",

    # Latency
    'rocksdb.db.get.micros',
    'rocksdb.db.write.micros',
    'rocksdb.db.seek.micros',    
]

# 헤더 구성
latency_fields = [
    'get.P50', 'get.P95', 'get.P99', 'get.P100', 'get.COUNT', 'get.SUM', 'get.AVG',
    'write.P50', 'write.P95', 'write.P99', 'write.P100', 'write.COUNT', 'write.SUM', 'write.AVG',
]
header = [
    'trial', 'work', 'hot_compaction', 'cold_compaction',
    'time(s)', 'hot_column_key', 'default_column_key'
] + target_keys + latency_fields

rows = []

for log_file_name in os.listdir(log_dir):
    if not log_file_name.endswith('.log'):
        continue

    match = re.match(filename_pattern, log_file_name)
    if not match:
        print(f"Filename {log_file_name} does not match the expected pattern.")
        continue

    trial = match.group(1)
    work = match.group(2)
    hot_compaction = match.group(3)
    cold_compaction = match.group(4)

    stats_dict = {key: 0 for key in target_keys}
    latency_dict = {key: 0 for key in latency_fields}
    time_sec = 0
    hot_col_keys = 0
    default_col_keys = 0

    log_file_path = os.path.join(log_dir, log_file_name)

    with open(log_file_path, "r") as f:
        for line in f:
            line = line.strip()

            time_match = re.match(r"총 소요시간: (\d+(?:\.\d+)?)초", line)
            if time_match:
                time_sec = float(time_match.group(1))
                continue

            hot_match = re.match(r"hot 컬럼에 저장된 키 수: (\d+)", line)
            if hot_match:
                hot_col_keys = int(hot_match.group(1))
                continue

            default_match = re.match(r"default 컬럼에 저장된 키 수: (\d+)", line)
            if default_match:
                default_col_keys = int(default_match.group(1))
                continue

            # 일반 stat 추출
            stat_match = re.match(r"(rocksdb\.[\w\.]+)\s+COUNT\s*:\s*(\d+)", line)
            if stat_match:
                key = stat_match.group(1)
                value = int(stat_match.group(2))
                if key in stats_dict:
                    stats_dict[key] = value

            # Latency: get
            get_match = re.match(r"rocksdb\.db\.get\.micros\s+P50\s*:\s*([\d\.]+)\s+P95\s*:\s*([\d\.]+)\s+P99\s*:\s*([\d\.]+)\s+P100\s*:\s*([\d\.]+)\s+COUNT\s*:\s*(\d+)\s+SUM\s*:\s*(\d+)", line)
            if get_match:
                p50, p95, p99, p100, count, total = map(float, get_match.groups())
                latency_dict.update({
                    'get.P50': p50,
                    'get.P95': p95,
                    'get.P99': p99,
                    'get.P100': p100,
                    'get.COUNT': int(count),
                    'get.SUM': int(total),
                    'get.AVG': total / count if count > 0 else 0
                })

            # Latency: write
            write_match = re.match(r"rocksdb\.db\.write\.micros\s+P50\s*:\s*([\d\.]+)\s+P95\s*:\s*([\d\.]+)\s+P99\s*:\s*([\d\.]+)\s+P100\s*:\s*([\d\.]+)\s+COUNT\s*:\s*(\d+)\s+SUM\s*:\s*(\d+)", line)
            if write_match:
                p50, p95, p99, p100, count, total = map(float, write_match.groups())
                latency_dict.update({
                    'write.P50': p50,
                    'write.P95': p95,
                    'write.P99': p99,
                    'write.P100': p100,
                    'write.COUNT': int(count),
                    'write.SUM': int(total),
                    'write.AVG': total / count if count > 0 else 0
                })

    row = [
        trial, work, hot_compaction, cold_compaction,
        time_sec, hot_col_keys, default_col_keys
    ] + [stats_dict[key] for key in target_keys] + [latency_dict[key] for key in latency_fields]
    rows.append(row)

with open(csv_output_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(rows)

print(f"CSV summary saved to {csv_output_path}")

CSV summary saved to ./h3_summary_2.csv


In [37]:
import pandas as pd

pd.set_option('display.max_columns', None)  # 모든 열 표시
pd.set_option('display.max_rows', None)  # 모든 행 표시

# CSV 파일 읽기
df = pd.read_csv("./h3_summary_2.csv")

df

Unnamed: 0,trial,work,hot_compaction,cold_compaction,time(s),hot_column_key,default_column_key,rocksdb.flush.write.bytes,rocksdb.compact.write.bytes,rocksdb.bytes.written,rocksdb.number.keys.read,rocksdb.bytes.read,rocksdb.block.cache.hit,rocksdb.block.cache.miss,rocksdb.memtable.hit,rocksdb.memtable.miss,rocksdb.db.get.micros,rocksdb.db.write.micros,rocksdb.db.seek.micros,get.P50,get.P95,get.P99,get.P100,get.COUNT,get.SUM,get.AVG,write.P50,write.P95,write.P99,write.P100,write.COUNT,write.SUM,write.AVG
0,3,read,universal,universal,205.176,0,0,0,3070759,0,1000000,10347282432,1321139,2462473,0,1000000,0,0,0,47.947409,1199.967626,3433.098316,94264.0,1000000,200131914,200.131914,0.0,0.0,0.0,0.0,0,0,0.0
1,3,read,level,universal,325.054,0,0,0,6365182,0,1000000,10342875136,729748,4327180,0,1000000,0,0,0,86.963071,2124.00888,4371.571649,75500.0,1000000,318825792,318.825792,0.0,0.0,0.0,0.0,0,0,0.0
2,1,write,universal,level,376.56,88674,543638,812032160,3025443077,16407028307,1911326,10359799808,6267022,4942665,3751,1907575,0,0,0,9.189578,1077.542745,2738.6564,254117.0,1911326,286588101,149.94203,24.990465,195.160126,9434.827834,1417946.0,1000000,355701353,355.701353
3,1,write,universal,universal,369.122,88723,543583,811904197,1728629039,16407028576,1911277,10359701504,3557411,3353573,3737,1907540,0,0,0,5.971998,1149.962222,2891.181889,195343.0,1911277,315740018,165.198461,29.417361,168.001056,8140.919781,1611209.0,1000000,343088677,343.088677
4,1,read,universal,level,221.967,0,0,0,17890583,0,1000000,10339631104,1152497,2426696,0,1000000,0,0,0,44.190713,1294.47399,3689.36988,429383.0,1000000,217214343,217.214343,0.0,0.0,0.0,0.0,0,0,0.0
5,1,read,universal,universal,272.256,0,0,0,3058824,0,1000000,10329849856,1301229,3178043,0,1000000,0,0,0,66.497526,1743.978735,4341.108247,87072.0,1000000,265745456,265.745456,0.0,0.0,0.0,0.0,0,0,0.0
6,2,read,level,level,338.783,0,0,0,3082009,0,1000000,10336485376,1274655,2419833,0,1000000,0,0,0,53.249548,2320.700928,4368.324817,276028.0,1000000,333956024,333.956024,0.0,0.0,0.0,0.0,0,0,0.0
7,3,write,level,universal,328.092,89166,543019,812017703,1778731487,16407028915,1910834,10357719040,7738345,3382861,3732,1907102,0,0,0,9.234759,1183.528381,2805.634063,103851.0,1910834,281421927,147.277015,25.294651,157.283748,8066.856404,1313464.0,1000000,308830323,308.830323
8,1,write,level,level,89.41,88655,543998,812049935,2933129521,16407028810,1911345,10365386752,3797924,4821893,3746,1907599,0,0,0,1.758832,318.512141,592.456233,30969.0,1911345,77437216,40.514515,8.726167,32.669809,2003.152655,299756.0,1000000,85957553,85.957553
9,2,read,level,universal,236.639,0,0,0,3052094,0,1000000,10349887488,1334103,3348536,0,1000000,0,0,0,66.135427,1295.76757,3700.713681,60645.0,1000000,230762218,230.762218,0.0,0.0,0.0,0.0,0,0,0.0


In [38]:
import pandas as pd

# case 분류 함수
def classify_case(row):
    hot = row.get('hot_compaction', '').lower()
    cold = row.get('cold_compaction', '').lower()

    if hot == 'level' and cold == 'level':
        return 0
    elif hot == 'universal' and cold == 'universal':
        return 1
    elif hot == 'level' and cold == 'universal':
        return 2
    elif hot == 'universal' and cold == 'level':
        return 3
    else:
        return -1  # 알 수 없는 조합 (예외 처리용)

df_waf = df.copy()

# WAF 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['WAF'] = df.apply(
    lambda row: (row.get('rocksdb.flush.write.bytes', 0) + row.get('rocksdb.compact.write.bytes', 0)) / row['rocksdb.bytes.written']
    if row['rocksdb.bytes.written'] != 0 else 0,
    axis=1
)

# RAF 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['RAF'] = df.apply(
    lambda row: (
        (row.get('rocksdb.number.keys.read', 0) * 16 * 1024) / row['rocksdb.bytes.read']
        if row['rocksdb.bytes.read'] != 0 else 0
    ),
    axis=1
)


# Cache Hit Ratio 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['cache_hit_ratio'] = df.apply(
    lambda row: row.get('rocksdb.block.cache.hit', 0) / (row.get('rocksdb.block.cache.hit', 0) + row.get('rocksdb.block.cache.miss', 0))
    if (row.get('rocksdb.block.cache.hit', 0) + row.get('rocksdb.block.cache.miss', 0)) != 0 else 0,
    axis=1
)

# Throughput 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['throughput'] = df.apply(
    lambda row: row.get('rocksdb.bytes.written', 0) / row['time(s)']
    if row['time(s)'] != 0 else 0,
    axis=1
)

# case 분류
df_waf['case'] = df.apply(classify_case, axis=1)

# NaN 값이 있을 경우 최종적으로 0으로 채움
df_waf.fillna(0, inplace=True)

# 저장
df_waf.to_csv('modified_h3_summary_2.csv', index=False)

In [39]:
df_waf

Unnamed: 0,trial,work,hot_compaction,cold_compaction,time(s),hot_column_key,default_column_key,rocksdb.flush.write.bytes,rocksdb.compact.write.bytes,rocksdb.bytes.written,rocksdb.number.keys.read,rocksdb.bytes.read,rocksdb.block.cache.hit,rocksdb.block.cache.miss,rocksdb.memtable.hit,rocksdb.memtable.miss,rocksdb.db.get.micros,rocksdb.db.write.micros,rocksdb.db.seek.micros,get.P50,get.P95,get.P99,get.P100,get.COUNT,get.SUM,get.AVG,write.P50,write.P95,write.P99,write.P100,write.COUNT,write.SUM,write.AVG,WAF,RAF,cache_hit_ratio,throughput,case
0,3,read,universal,universal,205.176,0,0,0,3070759,0,1000000,10347282432,1321139,2462473,0,1000000,0,0,0,47.947409,1199.967626,3433.098316,94264.0,1000000,200131914,200.131914,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.583411,0.349174,0.0,1
1,3,read,level,universal,325.054,0,0,0,6365182,0,1000000,10342875136,729748,4327180,0,1000000,0,0,0,86.963071,2124.00888,4371.571649,75500.0,1000000,318825792,318.825792,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.584086,0.144307,0.0,2
2,1,write,universal,level,376.56,88674,543638,812032160,3025443077,16407028307,1911326,10359799808,6267022,4942665,3751,1907575,0,0,0,9.189578,1077.542745,2738.6564,254117.0,1911326,286588101,149.94203,24.990465,195.160126,9434.827834,1417946.0,1000000,355701353,355.701353,0.233892,3.022758,0.559072,43570820.0,3
3,1,write,universal,universal,369.122,88723,543583,811904197,1728629039,16407028576,1911277,10359701504,3557411,3353573,3737,1907540,0,0,0,5.971998,1149.962222,2891.181889,195343.0,1911277,315740018,165.198461,29.417361,168.001056,8140.919781,1611209.0,1000000,343088677,343.088677,0.154844,3.022709,0.514747,44448800.0,1
4,1,read,universal,level,221.967,0,0,0,17890583,0,1000000,10339631104,1152497,2426696,0,1000000,0,0,0,44.190713,1294.47399,3689.36988,429383.0,1000000,217214343,217.214343,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.584583,0.321999,0.0,3
5,1,read,universal,universal,272.256,0,0,0,3058824,0,1000000,10329849856,1301229,3178043,0,1000000,0,0,0,66.497526,1743.978735,4341.108247,87072.0,1000000,265745456,265.745456,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.586083,0.2905,0.0,1
6,2,read,level,level,338.783,0,0,0,3082009,0,1000000,10336485376,1274655,2419833,0,1000000,0,0,0,53.249548,2320.700928,4368.324817,276028.0,1000000,333956024,333.956024,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.585065,0.345015,0.0,0
7,3,write,level,universal,328.092,89166,543019,812017703,1778731487,16407028915,1910834,10357719040,7738345,3382861,3732,1907102,0,0,0,9.234759,1183.528381,2805.634063,103851.0,1910834,281421927,147.277015,25.294651,157.283748,8066.856404,1313464.0,1000000,308830323,308.830323,0.157905,3.022587,0.695819,50007400.0,2
8,1,write,level,level,89.41,88655,543998,812049935,2933129521,16407028810,1911345,10365386752,3797924,4821893,3746,1907599,0,0,0,1.758832,318.512141,592.456233,30969.0,1911345,77437216,40.514515,8.726167,32.669809,2003.152655,299756.0,1000000,85957553,85.957553,0.228267,3.021159,0.440604,183503300.0,0
9,2,read,level,universal,236.639,0,0,0,3052094,0,1000000,10349887488,1334103,3348536,0,1000000,0,0,0,66.135427,1295.76757,3700.713681,60645.0,1000000,230762218,230.762218,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.583012,0.284904,0.0,2
