In [1]:
import csv
import re
import os

log_dir = "./log_data_2"
csv_output_path = "./h4_summary_2.csv"

filename_pattern = r"(\w+)_hot_(\w+)_cold_(\w+)_run(\d+).log"

# 기존 stat key (단순 COUNT만 추출)
target_keys = [
    # WAF
    'rocksdb.flush.write.bytes',
    'rocksdb.compact.write.bytes',
    'rocksdb.bytes.written',

    # RAF
    'rocksdb.number.keys.read', # read count
    'rocksdb.bytes.read',       # read bytes
    

    # Cache Hit/Miss
    'rocksdb.block.cache.hit',
    'rocksdb.block.cache.miss',
    "rocksdb.memtable.hit",
    "rocksdb.memtable.miss",

    # Latency
    'rocksdb.db.get.micros',
    'rocksdb.db.write.micros',
    'rocksdb.db.seek.micros',  

    # time
    'rocksdb.compaction.total.time.cpu_micros',  

    # 압축률
    'rocksdb.bytes.compressed.from',
    'rocksdb.bytes.compressed.to'
]

# 헤더 구성
latency_fields = [
    'get.P50', 'get.P95', 'get.P99', 'get.P100', 'get.COUNT', 'get.SUM', 'get.AVG',
    'write.P50', 'write.P95', 'write.P99', 'write.P100', 'write.COUNT', 'write.SUM', 'write.AVG',
]
header = [
    'work', 'hot', 'cold', 'trial',
    'time(s)', 'hot_column_key', 'default_column_key'
] + target_keys + latency_fields

rows = []

for log_file_name in os.listdir(log_dir):
    if not log_file_name.endswith('.log'):
        continue

    match = re.match(filename_pattern, log_file_name)
    if not match:
        print(f"Filename {log_file_name} does not match the expected pattern.")
        continue

    work = match.group(1)
    hot = match.group(2)
    cold = match.group(3)
    trial = match.group(4)

    stats_dict = {key: 0 for key in target_keys}
    latency_dict = {key: 0 for key in latency_fields}
    time_sec = 0
    hot_col_keys = 0
    default_col_keys = 0

    log_file_path = os.path.join(log_dir, log_file_name)

    with open(log_file_path, "r") as f:
        for line in f:
            line = line.strip()

            time_match = re.match(r"총 소요시간: (\d+(?:\.\d+)?)초", line)
            if time_match:
                time_sec = float(time_match.group(1))
                continue

            hot_match = re.match(r"hot 컬럼에서 찾은 키 수: (\d+)", line)
            if hot_match:
                hot_col_keys = int(hot_match.group(1))
                continue

            default_match = re.match(r"default 컬럼에 저장된 키 수: (\d+)", line)
            if default_match:
                default_col_keys = int(default_match.group(1))
                continue

            # 일반 stat 추출
            stat_match = re.match(r"(rocksdb\.[\w\.]+)\s+COUNT\s*:\s*(\d+)", line)
            if stat_match:
                key = stat_match.group(1)
                value = int(stat_match.group(2))
                if key in stats_dict:
                    stats_dict[key] = value

            # Latency: get
            get_match = re.match(r"rocksdb\.db\.get\.micros\s+P50\s*:\s*([\d\.]+)\s+P95\s*:\s*([\d\.]+)\s+P99\s*:\s*([\d\.]+)\s+P100\s*:\s*([\d\.]+)\s+COUNT\s*:\s*(\d+)\s+SUM\s*:\s*(\d+)", line)
            if get_match:
                p50, p95, p99, p100, count, total = map(float, get_match.groups())
                latency_dict.update({
                    'get.P50': p50,
                    'get.P95': p95,
                    'get.P99': p99,
                    'get.P100': p100,
                    'get.COUNT': int(count),
                    'get.SUM': int(total),
                    'get.AVG': total / count if count > 0 else 0
                })

            # Latency: write
            write_match = re.match(r"rocksdb\.db\.write\.micros\s+P50\s*:\s*([\d\.]+)\s+P95\s*:\s*([\d\.]+)\s+P99\s*:\s*([\d\.]+)\s+P100\s*:\s*([\d\.]+)\s+COUNT\s*:\s*(\d+)\s+SUM\s*:\s*(\d+)", line)
            if write_match:
                p50, p95, p99, p100, count, total = map(float, write_match.groups())
                latency_dict.update({
                    'write.P50': p50,
                    'write.P95': p95,
                    'write.P99': p99,
                    'write.P100': p100,
                    'write.COUNT': int(count),
                    'write.SUM': int(total),
                    'write.AVG': total / count if count > 0 else 0
                })

        row = [
            work, hot, cold, trial,
            time_sec, hot_col_keys, default_col_keys
        ] + [stats_dict[key] for key in target_keys] + [latency_dict[key] for key in latency_fields]
        rows.append(row)


with open(csv_output_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(rows)

print(f"CSV summary saved to {csv_output_path}")

CSV summary saved to ./h4_summary_2.csv


In [2]:
import pandas as pd

pd.set_option('display.max_columns', None)  # 모든 열 표시
pd.set_option('display.max_rows', None)  # 모든 행 표시

# CSV 파일 읽기
df = pd.read_csv("./h4_summary_2.csv")

df

Unnamed: 0,work,hot,cold,trial,time(s),hot_column_key,default_column_key,rocksdb.flush.write.bytes,rocksdb.compact.write.bytes,rocksdb.bytes.written,rocksdb.number.keys.read,rocksdb.bytes.read,rocksdb.block.cache.hit,rocksdb.block.cache.miss,rocksdb.memtable.hit,rocksdb.memtable.miss,rocksdb.db.get.micros,rocksdb.db.write.micros,rocksdb.db.seek.micros,rocksdb.compaction.total.time.cpu_micros,rocksdb.bytes.compressed.from,rocksdb.bytes.compressed.to,get.P50,get.P95,get.P99,get.P100,get.COUNT,get.SUM,get.AVG,write.P50,write.P95,write.P99,write.P100,write.COUNT,write.SUM,write.AVG
0,write,ZSTD,ZSTD,1,215.989,0,250256,59110061,266389530,16407311946,1806121,7276707840,4620763,5853952,3777,1802344,0,0,0,46993481,94012927221,296601423,2.00461,9.808885,17.122193,800.0,1806121,6963421,3.855457,8.419907,14.090376,34.585866,3201283.0,1000000,214501770,214.50177
1,read,Snappy,Snappy,1,20.983,679106,0,0,11284192,0,1000000,12665978880,900816,2326121,0,1000000,0,0,0,117649,226474778,11212114,10.932704,49.788358,69.790461,830.0,1000000,19688107,19.688107,0.0,0.0,0.0,0.0,0,0,0.0
2,read,LZ4,LZ4,3,12.967,678509,0,0,442063,0,1000000,12658835456,1161771,2030310,0,1000000,0,0,0,0,61986262,421126,8.755086,29.7775,33.479661,845.0,1000000,11809587,11.809587,0.0,0.0,0.0,0.0,0,0,0.0
3,read,Zlib,Zlib,3,44.595,678529,0,0,3575178,0,1000000,12653854720,946497,2179057,0,1000000,0,0,0,2755123,813966804,3324276,20.250963,138.713836,164.138373,893.0,1000000,43271304,43.271304,0.0,0.0,0.0,0.0,0,0,0.0
4,read,Zlib,Zlib,1,59.309,679155,0,0,791502,0,1000000,12666486784,1251194,2883307,0,1000000,0,0,0,426540,179541720,733717,49.63483,136.344975,163.384182,936.0,1000000,58036462,58.036462,0.0,0.0,0.0,0.0,0,0,0.0
5,read,Snappy,Snappy,2,18.552,678309,0,0,3089355,0,1000000,12647743488,1206466,2021422,0,1000000,0,0,0,0,61973942,3068413,9.380357,45.6325,58.745987,831.0,1000000,17387750,17.38775,0.0,0.0,0.0,0.0,0,0,0.0
6,read,LZ4,LZ4,1,13.022,679645,0,0,442347,0,1000000,12671156224,1174693,2019593,0,1000000,0,0,0,0,62019153,421400,8.792951,29.873445,33.551465,798.0,1000000,11858636,11.858636,0.0,0.0,0.0,0.0,0,0,0.0
7,write,ZSTD,ZSTD,3,193.977,0,250372,59098541,266456545,16407310918,1806015,7280345088,4578971,5854806,3777,1802238,0,0,0,47262545,94029540110,296651831,1.992202,9.78896,16.279925,778.0,1806015,6951580,3.849126,8.492948,14.54595,43.080026,2757323.0,1000000,192408746,192.408746
8,read,ZSTD,ZSTD,3,13.242,678965,0,0,229370,0,1000000,12661014528,1162542,2024479,0,1000000,0,0,0,0,62035323,208416,8.830104,30.302283,33.800596,808.0,1000000,12021015,12.021015,0.0,0.0,0.0,0.0,0,0,0.0
9,write,LZ4,LZ4,3,230.968,0,250238,114564212,533629708,16407310709,1806039,7277756416,4558061,5853549,3774,1802265,0,0,0,41746725,94015582666,619295371,1.969052,9.721859,14.902624,830.0,1806039,6755277,3.740383,8.472808,14.385077,38.099007,3296950.0,1000000,229507332,229.507332


In [3]:
import pandas as pd

# case 분류 함수
def classify_case(row):
    hot = row.get('hot', '').lower()
    cold = row.get('cold', '').lower()

    if hot == 'snappy' and cold == 'snappy':
        return 'snappy'
    elif hot == 'lz4' and cold == 'lz4':
        return 'lz4'
    elif hot == 'zlib' and cold == 'zlib':
        return 'zlib'
    elif hot == 'zstd' and cold == 'zstd':
        return 'zstd'
    else:
        return -1  # 알 수 없는 조합 (예외 처리용)

df_waf = df.copy()

# WAF 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['WAF'] = df.apply(
    lambda row: (row.get('rocksdb.flush.write.bytes', 0) + row.get('rocksdb.compact.write.bytes', 0)) / row['rocksdb.bytes.written']
    if row['rocksdb.bytes.written'] != 0 else 0,
    axis=1
)

# RAF 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['RAF'] = df.apply(
    lambda row: (
        (row.get('rocksdb.number.keys.read', 0) * 16 * 1024) / row['rocksdb.bytes.read']
        if row['rocksdb.bytes.read'] != 0 else 0
    ),
    axis=1
)

# Cache Hit Ratio 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['cache_hit_ratio'] = df.apply(
    lambda row: row.get('rocksdb.block.cache.hit', 0) / (row.get('rocksdb.block.cache.hit', 0) + row.get('rocksdb.block.cache.miss', 0))
    if (row.get('rocksdb.block.cache.hit', 0) + row.get('rocksdb.block.cache.miss', 0)) != 0 else 0,
    axis=1
)

df_waf['memtable_hit_ratio'] = df.apply(
    lambda row: row.get('rocksdb.memtable.hit', 0) / (row.get('rocksdb.memtable.hit', 0) + row.get('rocksdb.memtable.miss', 0))
    if (row.get('rocksdb.memtable.hit', 0) + row.get('rocksdb.memtable.miss', 0)) != 0 else 0,
    axis=1
)

# Throughput 계산 (0으로 나눌 경우 방지 및 NaN은 0으로 처리)
df_waf['throughput'] = df.apply(
    lambda row: row.get('rocksdb.bytes.written', 0) / row['time(s)']
    if row['time(s)'] != 0 else 0,
    axis=1
)

# 압축률 계산
df_waf['compression_ratio'] = df.apply(
    lambda row: row.get('rocksdb.bytes.compressed.to', 0) / row.get('rocksdb.bytes.compressed.from', 1) * 100
    if row.get('rocksdb.bytes.compressed.from', 1) != 0 else 0,
    axis=1
)

# case 분류
df_waf['case'] = df.apply(classify_case, axis=1)

# NaN 값이 있을 경우 최종적으로 0으로 채움
df_waf.fillna(0, inplace=True)

# 저장
df_waf.to_csv('modified_h4_summary_2.csv', index=False)

In [4]:
df_waf

Unnamed: 0,work,hot,cold,trial,time(s),hot_column_key,default_column_key,rocksdb.flush.write.bytes,rocksdb.compact.write.bytes,rocksdb.bytes.written,rocksdb.number.keys.read,rocksdb.bytes.read,rocksdb.block.cache.hit,rocksdb.block.cache.miss,rocksdb.memtable.hit,rocksdb.memtable.miss,rocksdb.db.get.micros,rocksdb.db.write.micros,rocksdb.db.seek.micros,rocksdb.compaction.total.time.cpu_micros,rocksdb.bytes.compressed.from,rocksdb.bytes.compressed.to,get.P50,get.P95,get.P99,get.P100,get.COUNT,get.SUM,get.AVG,write.P50,write.P95,write.P99,write.P100,write.COUNT,write.SUM,write.AVG,WAF,RAF,cache_hit_ratio,memtable_hit_ratio,throughput,compression_ratio,case
0,write,ZSTD,ZSTD,1,215.989,0,250256,59110061,266389530,16407311946,1806121,7276707840,4620763,5853952,3777,1802344,0,0,0,46993481,94012927221,296601423,2.00461,9.808885,17.122193,800.0,1806121,6963421,3.855457,8.419907,14.090376,34.585866,3201283.0,1000000,214501770,214.50177,0.019839,4.066604,0.441135,0.002091,75963650.0,0.31549,zstd
1,read,Snappy,Snappy,1,20.983,679106,0,0,11284192,0,1000000,12665978880,900816,2326121,0,1000000,0,0,0,117649,226474778,11212114,10.932704,49.788358,69.790461,830.0,1000000,19688107,19.688107,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.293544,0.279155,0.0,0.0,4.950712,snappy
2,read,LZ4,LZ4,3,12.967,678509,0,0,442063,0,1000000,12658835456,1161771,2030310,0,1000000,0,0,0,0,61986262,421126,8.755086,29.7775,33.479661,845.0,1000000,11809587,11.809587,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.294274,0.363954,0.0,0.0,0.679386,lz4
3,read,Zlib,Zlib,3,44.595,678529,0,0,3575178,0,1000000,12653854720,946497,2179057,0,1000000,0,0,0,2755123,813966804,3324276,20.250963,138.713836,164.138373,893.0,1000000,43271304,43.271304,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.294783,0.302825,0.0,0.0,0.408404,zlib
4,read,Zlib,Zlib,1,59.309,679155,0,0,791502,0,1000000,12666486784,1251194,2883307,0,1000000,0,0,0,426540,179541720,733717,49.63483,136.344975,163.384182,936.0,1000000,58036462,58.036462,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.293492,0.302623,0.0,0.0,0.408661,zlib
5,read,Snappy,Snappy,2,18.552,678309,0,0,3089355,0,1000000,12647743488,1206466,2021422,0,1000000,0,0,0,0,61973942,3068413,9.380357,45.6325,58.745987,831.0,1000000,17387750,17.38775,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.295409,0.373763,0.0,0.0,4.951134,snappy
6,read,LZ4,LZ4,1,13.022,679645,0,0,442347,0,1000000,12671156224,1174693,2019593,0,1000000,0,0,0,0,62019153,421400,8.792951,29.873445,33.551465,798.0,1000000,11858636,11.858636,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.293015,0.367748,0.0,0.0,0.679468,lz4
7,write,ZSTD,ZSTD,3,193.977,0,250372,59098541,266456545,16407310918,1806015,7280345088,4578971,5854806,3777,1802238,0,0,0,47262545,94029540110,296651831,1.992202,9.78896,16.279925,778.0,1806015,6951580,3.849126,8.492948,14.54595,43.080026,2757323.0,1000000,192408746,192.408746,0.019842,4.064333,0.43886,0.002091,84583800.0,0.315488,zstd
8,read,ZSTD,ZSTD,3,13.242,678965,0,0,229370,0,1000000,12661014528,1162542,2024479,0,1000000,0,0,0,0,62035323,208416,8.830104,30.302283,33.800596,808.0,1000000,12021015,12.021015,0.0,0.0,0.0,0.0,0,0,0.0,0.0,1.294051,0.364774,0.0,0.0,0.335963,zstd
9,write,LZ4,LZ4,3,230.968,0,250238,114564212,533629708,16407310709,1806039,7277756416,4558061,5853549,3774,1802265,0,0,0,41746725,94015582666,619295371,1.969052,9.721859,14.902624,830.0,1806039,6755277,3.740383,8.472808,14.385077,38.099007,3296950.0,1000000,229507332,229.507332,0.039506,4.065833,0.437786,0.00209,71037160.0,0.658716,lz4
