In [1]:
import csv
import re
import os

log_dir = "./log_data"
csv_output_path = "./h4_summary.csv"

filename_pattern = r"(\w+)_hot_(\w+)_cold_(\w+)_run(\d+).log"

# 기존 stat key (단순 COUNT만 추출)
target_keys = [
    # WAF
    'rocksdb.flush.write.bytes',
    'rocksdb.compact.write.bytes',
    'rocksdb.bytes.written',

    # RAF
    'rocksdb.number.keys.read', # read count
    'rocksdb.bytes.read',       # read bytes
    

    # Cache Hit/Miss
    'rocksdb.block.cache.hit',
    'rocksdb.block.cache.miss',
    "rocksdb.memtable.hit",
    "rocksdb.memtable.miss",

    # Latency
    'rocksdb.db.get.micros',
    'rocksdb.db.write.micros',
    'rocksdb.db.seek.micros',  

    # time
    'rocksdb.compaction.total.time.cpu_micros',  

    # 압축률
    'rocksdb.bytes.compressed.from',
    'rocksdb.bytes.compressed.to'
]

# 헤더 구성
latency_fields = [
    'get.P50', 'get.P95', 'get.P99', 'get.P100', 'get.COUNT', 'get.SUM', 'get.AVG',
    'write.P50', 'write.P95', 'write.P99', 'write.P100', 'write.COUNT', 'write.SUM', 'write.AVG',
]
header = [
    'work', 'hot', 'cold', 'trial',
    'time(s)', 'hot_column_key', 'default_column_key'
] + target_keys + latency_fields

rows = []

for log_file_name in os.listdir(log_dir):
    if not log_file_name.endswith('.log'):
        continue

    match = re.match(filename_pattern, log_file_name)
    if not match:
        print(f"Filename {log_file_name} does not match the expected pattern.")
        continue

    work = match.group(1)
    hot = match.group(2)
    cold = match.group(3)
    trial = match.group(4)

    stats_dict = {key: 0 for key in target_keys}
    latency_dict = {key: 0 for key in latency_fields}
    time_sec = 0
    hot_col_keys = 0
    default_col_keys = 0

    log_file_path = os.path.join(log_dir, log_file_name)

    with open(log_file_path, "r") as f:
        for line in f:
            line = line.strip()

            time_match = re.match(r"총 소요시간: (\d+(?:\.\d+)?)초", line)
            if time_match:
                time_sec = float(time_match.group(1))
                continue

            hot_match = re.match(r"hot 컬럼에서 찾은 키 수: (\d+)", line)
            if hot_match:
                hot_col_keys = int(hot_match.group(1))
                continue

            default_match = re.match(r"default 컬럼에 저장된 키 수: (\d+)", line)
            if default_match:
                default_col_keys = int(default_match.group(1))
                continue

            # 일반 stat 추출
            stat_match = re.match(r"(rocksdb\.[\w\.]+)\s+COUNT\s*:\s*(\d+)", line)
            if stat_match:
                key = stat_match.group(1)
                value = int(stat_match.group(2))
                if key in stats_dict:
                    stats_dict[key] = value

            # Latency: get
            get_match = re.match(r"rocksdb\.db\.get\.micros\s+P50\s*:\s*([\d\.]+)\s+P95\s*:\s*([\d\.]+)\s+P99\s*:\s*([\d\.]+)\s+P100\s*:\s*([\d\.]+)\s+COUNT\s*:\s*(\d+)\s+SUM\s*:\s*(\d+)", line)
            if get_match:
                p50, p95, p99, p100, count, total = map(float, get_match.groups())
                latency_dict.update({
                    'get.P50': p50,
                    'get.P95': p95,
                    'get.P99': p99,
                    'get.P100': p100,
                    'get.COUNT': int(count),
                    'get.SUM': int(total),
                    'get.AVG': total / count if count > 0 else 0
                })

            # Latency: write
            write_match = re.match(r"rocksdb\.db\.write\.micros\s+P50\s*:\s*([\d\.]+)\s+P95\s*:\s*([\d\.]+)\s+P99\s*:\s*([\d\.]+)\s+P100\s*:\s*([\d\.]+)\s+COUNT\s*:\s*(\d+)\s+SUM\s*:\s*(\d+)", line)
            if write_match:
                p50, p95, p99, p100, count, total = map(float, write_match.groups())
                latency_dict.update({
                    'write.P50': p50,
                    'write.P95': p95,
                    'write.P99': p99,
                    'write.P100': p100,
                    'write.COUNT': int(count),
                    'write.SUM': int(total),
                    'write.AVG': total / count if count > 0 else 0
                })

        row = [
            work, hot, cold, trial,
            time_sec, hot_col_keys, default_col_keys
        ] + [stats_dict[key] for key in target_keys] + [latency_dict[key] for key in latency_fields]
        rows.append(row)


with open(csv_output_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(rows)

print(f"CSV summary saved to {csv_output_path}")

CSV summary saved to ./h4_summary.csv


In [2]:
import pandas as pd

pd.set_option('display.max_columns', None)  # 모든 열 표시
pd.set_option('display.max_rows', None)  # 모든 행 표시

# CSV 파일 읽기
df = pd.read_csv("./h4_summary.csv")

df

Unnamed: 0,work,hot,cold,trial,time(s),hot_column_key,default_column_key,rocksdb.flush.write.bytes,rocksdb.compact.write.bytes,rocksdb.bytes.written,rocksdb.number.keys.read,rocksdb.bytes.read,rocksdb.block.cache.hit,rocksdb.block.cache.miss,rocksdb.memtable.hit,rocksdb.memtable.miss,rocksdb.db.get.micros,rocksdb.db.write.micros,rocksdb.db.seek.micros,rocksdb.compaction.total.time.cpu_micros,rocksdb.bytes.compressed.from,rocksdb.bytes.compressed.to,get.P50,get.P95,get.P99,get.P100,get.COUNT,get.SUM,get.AVG,write.P50,write.P95,write.P99,write.P100,write.COUNT,write.SUM,write.AVG
0,read,LZ4,Zlib,2,24.314,679362,0,0,334523,0,1000000,12659818496,1141503,1696079,0,1000000,0,0,0,0,62018794,313573,8.688403,92.45293,107.570592,784.0,1000000,23208530,23.20853,0.0,0.0,0.0,0.0,0,0,0.0
1,read,Snappy,ZSTD,1,11.83,678638,0,0,853594,0,1000000,12656115712,1192057,1263358,0,1000000,0,0,0,0,61954404,832663,8.896087,18.084461,21.627374,768.0,1000000,10589710,10.58971,0.0,0.0,0.0,0.0,0,0,0.0
2,read,none,ZSTD,2,21.091,678608,0,0,2114885947,0,1000000,12643778560,658150,3730954,0,1000000,0,0,0,1719046,36232737,121925,16.50312,47.175926,66.397294,1768.0,1000000,19716978,19.716978,0.0,0.0,0.0,0.0,0,0,0.0
3,read,Snappy,Snappy,1,20.867,678600,0,0,201724887,0,1000000,12659474432,219891,2672206,0,1000000,0,0,0,2914179,4069768137,200479868,9.283244,50.832027,71.299575,796.0,1000000,19574484,19.574484,0.0,0.0,0.0,0.0,0,0,0.0
4,write,Snappy,Zlib,1,205.98,0,250961,579596782,2630024822,16407310193,1806052,7289389056,8307136,5395064,7755,1798297,0,0,0,104748457,85258050781,3183367964,2.013612,27.858383,44.965496,1053.0,1806052,14386003,7.965442,8.427429,13.872257,18.192436,3397636.0,1000000,204604063,204.604063
5,write,Snappy,Zlib,3,251.382,0,250647,582799400,2794161050,16407310576,1806047,7284326400,6582383,5639556,3778,1802269,0,0,0,109521071,89333834920,3349457628,1.960597,23.997822,45.440049,798.0,1806047,13506835,7.478673,8.403924,13.854018,20.905573,4207485.0,1000000,249943394,249.943394
6,write,Snappy,Zlib,2,187.347,0,250324,582735306,2636250077,16407311069,1806116,7277903872,5887314,5210088,3772,1802344,0,0,0,95773948,83380148594,3193305391,2.665865,27.745932,45.201586,793.0,1806116,12663682,7.011555,8.519034,14.146075,21.455537,3085047.0,1000000,185807323,185.807323
7,write,Snappy,ZSTD,3,226.802,0,250171,579075930,3064465116,16407311368,1805920,7278608384,5255702,5976683,3775,1802145,0,0,0,56868625,96023403936,3613994699,2.224221,13.405911,21.474886,783.0,1805920,7892066,4.370108,8.396168,14.062344,31.633969,3796554.0,1000000,225267342,225.267342
8,read,Snappy,Snappy,2,18.913,678674,0,0,3093595,0,1000000,12653805568,1162824,2025128,0,1000000,0,0,0,0,62056292,3072628,9.452199,46.592771,62.517001,787.0,1000000,17727709,17.727709,0.0,0.0,0.0,0.0,0,0,0.0
9,read,Snappy,Zlib,1,30.8,678559,0,0,3510695,0,1000000,12659523584,738565,2086566,0,1000000,0,0,0,0,127379361,3469842,18.750786,95.811856,108.554649,889.0,1000000,29503313,29.503313,0.0,0.0,0.0,0.0,0,0,0.0


In [5]:
import pandas as pd

# case 분류 함수
def classify_case(row):
    hot = row.get('hot', '').lower()
    cold = row.get('cold', '').lower()

    if hot == 'snappy' and cold == 'snappy':
        return 0
    elif hot == 'lz4' and cold == 'zlib':
        return 1
    elif hot == 'lz4' and cold == 'zstd':
        return 2
    elif hot == 'none' and cold == 'zlib':
        return 3
    elif hot == 'none' and cold == 'zstd':
        return 4
    elif hot == 'snappy' and cold == 'zlib':
        return 5
    elif hot == 'snappy' and cold == 'zstd':
        return 6
    else:
        return -1  # 알 수 없는 조합 (예외 처리용)

df_waf = df.copy()

# WAF 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['WAF'] = df.apply(
    lambda row: (row.get('rocksdb.flush.write.bytes', 0) + row.get('rocksdb.compact.write.bytes', 0)) / row['rocksdb.bytes.written']
    if row['rocksdb.bytes.written'] != 0 else 0,
    axis=1
)

# RAF 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['RAF'] = df.apply(
    lambda row: (
        (row.get('rocksdb.number.keys.read', 0) * 16 * 1024) / row['rocksdb.bytes.read']
        if row['rocksdb.bytes.read'] != 0 else 0
    ),
    axis=1
)

# Cache Hit Ratio 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['cache_hit_ratio'] = df.apply(
    lambda row: row.get('rocksdb.block.cache.hit', 0) / (row.get('rocksdb.block.cache.hit', 0) + row.get('rocksdb.block.cache.miss', 0))
    if (row.get('rocksdb.block.cache.hit', 0) + row.get('rocksdb.block.cache.miss', 0)) != 0 else 0,
    axis=1
)

df_waf['memtable_hit_ratio'] = df.apply(
    lambda row: row.get('rocksdb.memtable.hit', 0) / (row.get('rocksdb.memtable.hit', 0) + row.get('rocksdb.memtable.miss', 0))
    if (row.get('rocksdb.memtable.hit', 0) + row.get('rocksdb.memtable.miss', 0)) != 0 else 0,
    axis=1
)

# Throughput 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['throughput'] = df.apply(
    lambda row: row.get('rocksdb.bytes.written', 0) / row['time(s)']
    if row['time(s)'] != 0 else 0,
    axis=1
)

# 압축률 계산
df_waf['compression_ratio'] = df.apply(
    lambda row: row.get('rocksdb.bytes.compressed.from', 0) / row.get('rocksdb.bytes.compressed.to', 1)
    if row.get('rocksdb.bytes.compressed.to', 1) != 0 else 0,
    axis=1
)

# case 분류
df_waf['case'] = df.apply(classify_case, axis=1).astype(int)

# NaN 값이 있을 경우 최종적으로 0으로 채움
df_waf.fillna(0, inplace=True)

# 저장
df_waf.to_csv('modified_h4_summary.csv', index=False)

In [6]:
df_waf

Unnamed: 0,work,hot,cold,trial,time(s),hot_column_key,default_column_key,rocksdb.flush.write.bytes,rocksdb.compact.write.bytes,rocksdb.bytes.written,rocksdb.number.keys.read,rocksdb.bytes.read,rocksdb.block.cache.hit,rocksdb.block.cache.miss,rocksdb.memtable.hit,rocksdb.memtable.miss,rocksdb.db.get.micros,rocksdb.db.write.micros,rocksdb.db.seek.micros,rocksdb.compaction.total.time.cpu_micros,rocksdb.bytes.compressed.from,rocksdb.bytes.compressed.to,get.P50,get.P95,get.P99,get.P100,get.COUNT,get.SUM,get.AVG,write.P50,write.P95,write.P99,write.P100,write.COUNT,write.SUM,write.AVG,WAF,RAF,cache_hit_ratio,memtable_hit_ratio,throughput,compression_ratio,case
0,read,LZ4,Zlib,2,24.314,679362,0,0,334523,0,1000000,12659818496,1141503,1696079,0,1000000,0,0,0,0,62018794,313573,8.688403,92.45293,107.570592,784.0,1000000,23208530,23.20853,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.294173,0.40228,0.0,0.0,197.78104,1
1,read,Snappy,ZSTD,1,11.83,678638,0,0,853594,0,1000000,12656115712,1192057,1263358,0,1000000,0,0,0,0,61954404,832663,8.896087,18.084461,21.627374,768.0,1000000,10589710,10.58971,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.294552,0.485481,0.0,0.0,74.405136,6
2,read,none,ZSTD,2,21.091,678608,0,0,2114885947,0,1000000,12643778560,658150,3730954,0,1000000,0,0,0,1719046,36232737,121925,16.50312,47.175926,66.397294,1768.0,1000000,19716978,19.716978,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.295815,0.149951,0.0,0.0,297.172335,4
3,read,Snappy,Snappy,1,20.867,678600,0,0,201724887,0,1000000,12659474432,219891,2672206,0,1000000,0,0,0,2914179,4069768137,200479868,9.283244,50.832027,71.299575,796.0,1000000,19574484,19.574484,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.294209,0.076032,0.0,0.0,20.300134,0
4,write,Snappy,Zlib,1,205.98,0,250961,579596782,2630024822,16407310193,1806052,7289389056,8307136,5395064,7755,1798297,0,0,0,104748457,85258050781,3183367964,2.013612,27.858383,44.965496,1053.0,1806052,14386003,7.965442,8.427429,13.872257,18.192436,3397636.0,1000000,204604063,204.604063,0.195621,4.059374,0.606263,0.004294,79654870.0,26.782342,5
5,write,Snappy,Zlib,3,251.382,0,250647,582799400,2794161050,16407310576,1806047,7284326400,6582383,5639556,3778,1802269,0,0,0,109521071,89333834920,3349457628,1.960597,23.997822,45.440049,798.0,1806047,13506835,7.478673,8.403924,13.854018,20.905573,4207485.0,1000000,249943394,249.943394,0.20582,4.062184,0.538571,0.002092,65268440.0,26.671135,5
6,write,Snappy,Zlib,2,187.347,0,250324,582735306,2636250077,16407311069,1806116,7277903872,5887314,5210088,3772,1802344,0,0,0,95773948,83380148594,3193305391,2.665865,27.745932,45.201586,793.0,1806116,12663682,7.011555,8.519034,14.146075,21.455537,3085047.0,1000000,185807323,185.807323,0.196192,4.065924,0.530513,0.002088,87577120.0,26.110922,5
7,write,Snappy,ZSTD,3,226.802,0,250171,579075930,3064465116,16407311368,1805920,7278608384,5255702,5976683,3775,1802145,0,0,0,56868625,96023403936,3613994699,2.224221,13.405911,21.474886,783.0,1805920,7892066,4.370108,8.396168,14.062344,31.633969,3796554.0,1000000,225267342,225.267342,0.222068,4.065089,0.467906,0.00209,72342000.0,26.56988,6
8,read,Snappy,Snappy,2,18.913,678674,0,0,3093595,0,1000000,12653805568,1162824,2025128,0,1000000,0,0,0,0,62056292,3072628,9.452199,46.592771,62.517001,787.0,1000000,17727709,17.727709,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.294788,0.364756,0.0,0.0,20.196487,0
9,read,Snappy,Zlib,1,30.8,678559,0,0,3510695,0,1000000,12659523584,738565,2086566,0,1000000,0,0,0,0,127379361,3469842,18.750786,95.811856,108.554649,889.0,1000000,29503313,29.503313,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.294204,0.261427,0.0,0.0,36.710421,5
