## Level_base_log_data

In [13]:
import csv
import re
import os

log_dir = "./level_base_log_data"
csv_output_path = "./h1_base_output.csv"

filename_pattern = r"h1_(\w+)_cs(\d+)_size(\d+)_base(\d+).log"

target_keys = [
    "rocksdb.number.keys.written",
    "rocksdb.bytes.written",
    "rocksdb.compact.write.bytes",
    "rocksdb.flush.write.bytes",
    "rocksdb.wal.bytes",
    "rocksdb.write.wal",
]

target_workloads = ["fillrandom", "overwrite"]

with open(csv_output_path, mode="w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    header = ["workload", "compaction_style", "value_size", "max_bytes_for_level_base"] + target_keys

    for w in target_workloads:
        header += [
            f"{w}_latency_us", 
            f"{w}_throughput_ops", 
            f"{w}_throughput_bytes"
        ]

    writer.writerow(header)

    for log_file_name in os.listdir(log_dir):
        if not log_file_name.endswith(".log"):
            continue

        match = re.match(filename_pattern, log_file_name)
        if not match:
            print(f"파일 이름 형식이 올바르지 않아 건너뜀: {log_file_name}")
            continue

        workload = match.group(1)
        compaction_style = int(match.group(2))
        value_size = int(match.group(3))
        max_bytes_for_level_base = int(match.group(4))

        stats_dict = {key: None for key in target_keys}
        latency_throughput = {w: {"lat": None, "ops": None, "bytes": None} for w in target_workloads}
        log_file_path = os.path.join(log_dir, log_file_name)

        with open(log_file_path, "r") as f:
            for line in f:
                line = line.strip()

                match_count = re.match(r"(rocksdb\.[\w\.]+)\s+COUNT\s*:\s*(\d+)", line)
                if match_count:
                    key = match_count.group(1)
                    value = int(match_count.group(2))
                    if key in stats_dict:
                        stats_dict[key] = value
                    continue

                match_avg = re.match(r"(rocksdb\.[\w\.]+)\s+AVG\s*:\s*([\d\.]+)", line)
                if match_avg:
                    key = match_avg.group(1)
                    value = float(match_avg.group(2))
                    if key in stats_dict:
                        stats_dict[key] = value

                # Latency / Throughput 정보 추출
                for w in target_workloads:
                    if w in line:
                        match_perf = re.search(
                            r"([\d\.]+)\s+micros/op\s+([\d\.]+)\s+ops/sec.*?([\d\.]+)\s+MB/s", line
                        )
                        if match_perf:
                            latency_throughput[w]["lat"] = float(match_perf.group(1))
                            latency_throughput[w]["ops"] = float(match_perf.group(2))
                            latency_throughput[w]["bytes"] = float(match_perf.group(3))

        stat_values = [stats_dict[key] if stats_dict[key] is not None else "" for key in target_keys]

        # latency/throughput 값 추가
        latency_values = []
        for w in target_workloads:
            lt = latency_throughput[w]
            latency_values += [
                lt["lat"] if lt["lat"] is not None else "",
                lt["ops"] if lt["ops"] is not None else "",
                lt["bytes"] if lt["bytes"] is not None else "",
            ]

        row = [workload, compaction_style, value_size, max_bytes_for_level_base] + stat_values + latency_values
        writer.writerow(row)

print("CSV 저장이 완료되었습니다.")


CSV 저장이 완료되었습니다.


In [14]:
import pandas as pd

# CSV 파일 읽기
df = pd.read_csv("./h1_base_output.csv")

# workload, compaction_style 기준 정렬
df_sorted = df.sort_values(by=["workload", "compaction_style"])

# 정렬된 데이터프레임 출력
df_sorted

Unnamed: 0,workload,compaction_style,value_size,max_bytes_for_level_base,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,fillrandom_latency_us,fillrandom_throughput_ops,fillrandom_throughput_bytes,overwrite_latency_us,overwrite_throughput_ops,overwrite_throughput_bytes
2,fillrandom,0,16384,536870912,1000000,16417000000,36052092660,8996302229,16417000000,1000000,437.744,2284.0,35.7,,,
5,fillrandom,0,4096,268435456,1000000,4128000000,4003270522,2301632341,4128000000,1000000,57.195,17484.0,68.6,,,
6,fillrandom,0,1024,67108864,1000000,1056000000,657101850,554147546,1056000000,1000000,16.308,61319.0,60.8,,,
7,fillrandom,0,1024,536870912,1000000,1056000000,588671433,554175641,1056000000,1000000,16.58,60312.0,59.8,,,
8,fillrandom,0,1024,268435456,1000000,1056000000,612860902,554235912,1056000000,1000000,15.947,62707.0,62.2,,,
9,fillrandom,0,16384,67108864,1000000,16417000000,27054478941,8996874388,16417000000,1000000,373.762,2675.0,41.8,,,
10,fillrandom,0,4096,536870912,1000000,4128000000,4390431402,2301548443,4128000000,1000000,58.202,17181.0,67.4,,,
13,fillrandom,0,16384,268435456,1000000,16417000000,29512061896,8997147451,16417000000,1000000,468.074,2136.0,33.4,,,
16,fillrandom,0,4096,67108864,1000000,4128000000,3625923549,2264390085,4128000000,1000000,75.279,13283.0,52.1,,,
0,overwrite,0,4096,67108864,1000000,4128000000,3624640394,2264460227,4128000000,1000000,,,,56.711,17633.0,69.1


In [15]:
import pandas as pd

df_waf = df.copy()

# compaction_style 매핑
style_map = {0: 'leveled', 1: 'universal'}
df_waf['compaction_style'] = df['compaction_style'].map(style_map)

# value_size 매핑
value_size_map = {1024: '1KB', 4096: '4KB', 16384: '16KB'}
df_waf['value_size'] = df['value_size'].map(value_size_map)

# max_bytes_for_level_multiplier 매핑
max_bytes_map = {67108864: '64MB', 268435456: '256MB', 536870912: '512MB'}
df_waf['max_bytes_for_level_base'] = df['max_bytes_for_level_base'].map(max_bytes_map)

# WAF 계산: wal.bytes가 0인 경우는 NaN 처리
df_waf['WAF'] = df.apply(
    lambda row: (row['rocksdb.flush.write.bytes'] + row['rocksdb.compact.write.bytes'] + row['rocksdb.bytes.written']) / row['rocksdb.bytes.written'],
    axis=1
)

# _latency_us 컬럼들을 하나로 합치기
latency_columns = [col for col in df_waf.columns if '_latency_us' in col]
df_waf['latency'] = df_waf[latency_columns].sum(axis=1)  # 합산 예시

# _throughput_ops 컬럼들을 하나로 합치기
throughput_ops_columns = [col for col in df_waf.columns if '_throughput_ops' in col]
df_waf['throughput_ops'] = df_waf[throughput_ops_columns].sum(axis=1)  # 합산 예시

# _throughput_bytes 컬럼들을 하나로 합치기
throughput_bytes_columns = [col for col in df_waf.columns if '_throughput_bytes' in col]
df_waf['throughput_bytes'] = df_waf[throughput_bytes_columns].sum(axis=1)  # 합산 예시

# 기존 컬럼들 삭제
df_waf.drop(columns=latency_columns + throughput_ops_columns + throughput_bytes_columns, inplace=True)


# 결과 저장 (선택)
df_waf.to_csv('modified_h1_base_output.csv', index=False)

In [16]:
# workload, compaction_style 기준 정렬
df_waf_sorted = df_waf.sort_values(by=["workload", "compaction_style"])
df_waf_sorted

Unnamed: 0,workload,compaction_style,value_size,max_bytes_for_level_base,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,WAF,latency,throughput_ops,throughput_bytes
2,fillrandom,leveled,16KB,512MB,1000000,16417000000,36052092660,8996302229,16417000000,1000000,3.744009,437.744,2284.0,35.7
5,fillrandom,leveled,4KB,256MB,1000000,4128000000,4003270522,2301632341,4128000000,1000000,2.52735,57.195,17484.0,68.6
6,fillrandom,leveled,1KB,64MB,1000000,1056000000,657101850,554147546,1056000000,1000000,2.147016,16.308,61319.0,60.8
7,fillrandom,leveled,1KB,512MB,1000000,1056000000,588671433,554175641,1056000000,1000000,2.082242,16.58,60312.0,59.8
8,fillrandom,leveled,1KB,256MB,1000000,1056000000,612860902,554235912,1056000000,1000000,2.105205,15.947,62707.0,62.2
9,fillrandom,leveled,16KB,64MB,1000000,16417000000,27054478941,8996874388,16417000000,1000000,3.195977,373.762,2675.0,41.8
10,fillrandom,leveled,4KB,512MB,1000000,4128000000,4390431402,2301548443,4128000000,1000000,2.621119,58.202,17181.0,67.4
13,fillrandom,leveled,16KB,256MB,1000000,16417000000,29512061896,8997147451,16417000000,1000000,3.345691,468.074,2136.0,33.4
16,fillrandom,leveled,4KB,64MB,1000000,4128000000,3625923549,2264390085,4128000000,1000000,2.426917,75.279,13283.0,52.1
0,overwrite,leveled,4KB,64MB,1000000,4128000000,3624640394,2264460227,4128000000,1000000,2.426623,56.711,17633.0,69.1


## Level_multiplier_log_data

In [None]:
import csv
import re
import os

log_dir = "./level_multiplier_log_data"
csv_output_path = "./h1_multiplier_output.csv"

filename_pattern = r"h1_(\w+)_cs(\d+)_size(\d+)_multiplier(\d+).log"

target_keys = [
    "rocksdb.number.keys.written",
    "rocksdb.bytes.written",
    "rocksdb.compact.write.bytes",
    "rocksdb.flush.write.bytes",
    "rocksdb.wal.bytes",
    "rocksdb.write.wal",
]

target_workloads = ["fillrandom", "overwrite"]

with open(csv_output_path, mode="w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    header = ["workload", "compaction_style", "value_size", "max_bytes_for_level_base"] + target_keys

    for w in target_workloads:
        header += [
            f"{w}_latency_us", 
            f"{w}_throughput_ops", 
            f"{w}_throughput_bytes"
        ]

    writer.writerow(header)

    for log_file_name in os.listdir(log_dir):
        if not log_file_name.endswith(".log"):
            continue

        match = re.match(filename_pattern, log_file_name)
        if not match:
            print(f"파일 이름 형식이 올바르지 않아 건너뜀: {log_file_name}")
            continue

        workload = match.group(1)
        compaction_style = int(match.group(2))
        value_size = int(match.group(3))
        max_bytes_for_level_base = int(match.group(4))

        stats_dict = {key: None for key in target_keys}
        latency_throughput = {w: {"lat": None, "ops": None, "bytes": None} for w in target_workloads}
        log_file_path = os.path.join(log_dir, log_file_name)

        with open(log_file_path, "r") as f:
            for line in f:
                line = line.strip()

                match_count = re.match(r"(rocksdb\.[\w\.]+)\s+COUNT\s*:\s*(\d+)", line)
                if match_count:
                    key = match_count.group(1)
                    value = int(match_count.group(2))
                    if key in stats_dict:
                        stats_dict[key] = value
                    continue

                match_avg = re.match(r"(rocksdb\.[\w\.]+)\s+AVG\s*:\s*([\d\.]+)", line)
                if match_avg:
                    key = match_avg.group(1)
                    value = float(match_avg.group(2))
                    if key in stats_dict:
                        stats_dict[key] = value

                # Latency / Throughput 정보 추출
                for w in target_workloads:
                    if w in line:
                        match_perf = re.search(
                            r"([\d\.]+)\s+micros/op\s+([\d\.]+)\s+ops/sec.*?([\d\.]+)\s+MB/s", line
                        )
                        if match_perf:
                            latency_throughput[w]["lat"] = float(match_perf.group(1))
                            latency_throughput[w]["ops"] = float(match_perf.group(2))
                            latency_throughput[w]["bytes"] = float(match_perf.group(3))

        stat_values = [stats_dict[key] if stats_dict[key] is not None else "" for key in target_keys]

        # latency/throughput 값 추가
        latency_values = []
        for w in target_workloads:
            lt = latency_throughput[w]
            latency_values += [
                lt["lat"] if lt["lat"] is not None else "",
                lt["ops"] if lt["ops"] is not None else "",
                lt["bytes"] if lt["bytes"] is not None else "",
            ]

        row = [workload, compaction_style, value_size, max_bytes_for_level_base] + stat_values + latency_values
        writer.writerow(row)

print("CSV 저장이 완료되었습니다.")

CSV 저장이 완료되었습니다.


In [22]:
import pandas as pd

# CSV 파일 읽기
df = pd.read_csv("./h1_multiplier_output.csv")

# workload, compaction_style 기준 정렬
df_sorted = df.sort_values(by=["workload", "compaction_style"])

# 정렬된 데이터프레임 출력
df_sorted

Unnamed: 0,workload,compaction_style,value_size,max_bytes_for_level_base,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,fillrandom_latency_us,fillrandom_throughput_ops,fillrandom_throughput_bytes,overwrite_latency_us,overwrite_throughput_ops,overwrite_throughput_bytes
1,fillrandom,0,1024,10,1000000,1056000000,532065061,554231834,1056000000,1000000,12.622,79224.0,78.6,,,
2,fillrandom,0,16384,10,1000000,16417000000,26808615703,8996537425,16417000000,1000000,410.739,2434.0,38.1,,,
4,fillrandom,0,16384,20,1000000,16417000000,28859522575,8996966203,16417000000,1000000,458.188,2182.0,34.1,,,
5,fillrandom,0,16384,4,1000000,16417000000,26312224465,8997182632,16417000000,1000000,397.332,2516.0,39.4,,,
7,fillrandom,0,4096,10,1000000,4128000000,3668790542,2264613878,4128000000,1000000,71.569,13972.0,54.8,,,
11,fillrandom,0,1024,20,1000000,1056000000,573779355,554231337,1056000000,1000000,14.197,70438.0,69.9,,,
13,fillrandom,0,4096,4,1000000,4128000000,3961650233,2263893245,4128000000,1000000,72.693,13756.0,53.9,,,
14,fillrandom,0,4096,20,1000000,4128000000,3812224142,2264247445,4128000000,1000000,79.614,12560.0,49.3,,,
17,fillrandom,0,1024,4,1000000,1056000000,518053266,554174033,1056000000,1000000,14.077,71038.0,70.5,,,
0,overwrite,0,4096,20,1000000,4128000000,3876164853,2264551781,4128000000,1000000,,,,77.384,12922.0,50.7


In [23]:
import pandas as pd

df_waf = df.copy()

# compaction_style 매핑
style_map = {0: 'leveled', 1: 'universal'}
df_waf['compaction_style'] = df['compaction_style'].map(style_map)

# value_size 매핑
value_size_map = {1024: '1KB', 4096: '4KB', 16384: '16KB'}
df_waf['value_size'] = df['value_size'].map(value_size_map)

# WAF 계산: wal.bytes가 0인 경우는 NaN 처리
df_waf['WAF'] = df.apply(
    lambda row: (row['rocksdb.flush.write.bytes'] + row['rocksdb.compact.write.bytes'] + row['rocksdb.bytes.written']) / row['rocksdb.bytes.written'],
    axis=1
)

# _latency_us 컬럼들을 하나로 합치기
latency_columns = [col for col in df_waf.columns if '_latency_us' in col]
df_waf['latency'] = df_waf[latency_columns].sum(axis=1)  # 합산 예시

# _throughput_ops 컬럼들을 하나로 합치기
throughput_ops_columns = [col for col in df_waf.columns if '_throughput_ops' in col]
df_waf['throughput_ops'] = df_waf[throughput_ops_columns].sum(axis=1)  # 합산 예시

# _throughput_bytes 컬럼들을 하나로 합치기
throughput_bytes_columns = [col for col in df_waf.columns if '_throughput_bytes' in col]
df_waf['throughput_bytes'] = df_waf[throughput_bytes_columns].sum(axis=1)  # 합산 예시

# 기존 컬럼들 삭제
df_waf.drop(columns=latency_columns + throughput_ops_columns + throughput_bytes_columns, inplace=True)

# 결과 저장 (선택)
df_waf.to_csv('modified_h1_multiplier_output.csv', index=False)

In [24]:
# workload, compaction_style 기준 정렬
df_waf_sorted = df_waf.sort_values(by=["workload", "compaction_style"])
df_waf_sorted

Unnamed: 0,workload,compaction_style,value_size,max_bytes_for_level_base,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,WAF,latency,throughput_ops,throughput_bytes
1,fillrandom,leveled,1KB,10,1000000,1056000000,532065061,554231834,1056000000,1000000,2.02869,12.622,79224.0,78.6
2,fillrandom,leveled,16KB,10,1000000,16417000000,26808615703,8996537425,16417000000,1000000,3.18098,410.739,2434.0,38.1
4,fillrandom,leveled,16KB,20,1000000,16417000000,28859522575,8996966203,16417000000,1000000,3.305932,458.188,2182.0,34.1
5,fillrandom,leveled,16KB,4,1000000,16417000000,26312224465,8997182632,16417000000,1000000,3.150783,397.332,2516.0,39.4
7,fillrandom,leveled,4KB,10,1000000,4128000000,3668790542,2264613878,4128000000,1000000,2.437356,71.569,13972.0,54.8
11,fillrandom,leveled,1KB,20,1000000,1056000000,573779355,554231337,1056000000,1000000,2.068192,14.197,70438.0,69.9
13,fillrandom,leveled,4KB,4,1000000,4128000000,3961650233,2263893245,4128000000,1000000,2.508126,72.693,13756.0,53.9
14,fillrandom,leveled,4KB,20,1000000,4128000000,3812224142,2264247445,4128000000,1000000,2.472013,79.614,12560.0,49.3
17,fillrandom,leveled,1KB,4,1000000,1056000000,518053266,554174033,1056000000,1000000,2.015367,14.077,71038.0,70.5
0,overwrite,leveled,4KB,20,1000000,4128000000,3876164853,2264551781,4128000000,1000000,2.487577,77.384,12922.0,50.7


## Leveled Log

In [34]:
import csv
import re
import os

log_dir = "./leveled_log_data"
csv_output_path = "./h1_leveled_output.csv"

# 정규표현식을 이용해 파일 이름에서 정보 추출
filename_pattern = r"h1_(\w+)_cs(\d+)_size(\d+).log"

target_keys = [
    "rocksdb.number.keys.written",
    "rocksdb.bytes.written",
    "rocksdb.compact.write.bytes",
    "rocksdb.flush.write.bytes",
    "rocksdb.wal.bytes",
    "rocksdb.write.wal",
]

target_workloads = ["fillrandom", "overwrite"]

with open(csv_output_path, mode="w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    header = ["workload", "compaction_style", "value_size", "max_bytes_for_level_base"] + target_keys

    for w in target_workloads:
        header += [
            f"{w}_latency_us", 
            f"{w}_throughput_ops", 
            f"{w}_throughput_bytes"
        ]

    writer.writerow(header)

    for log_file_name in os.listdir(log_dir):
        if not log_file_name.endswith(".log"):
            continue

        match = re.match(filename_pattern, log_file_name)
        if not match:
            print(f"파일 이름 형식이 올바르지 않아 건너뜀: {log_file_name}")
            continue

        workload = match.group(1)
        compaction_style = int(match.group(2))
        value_size = int(match.group(3))

        stats_dict = {key: None for key in target_keys}
        latency_throughput = {w: {"lat": None, "ops": None, "bytes": None} for w in target_workloads}
        log_file_path = os.path.join(log_dir, log_file_name)

        with open(log_file_path, "r") as f:
            for line in f:
                line = line.strip()

                match_count = re.match(r"(rocksdb\.[\w\.]+)\s+COUNT\s*:\s*(\d+)", line)
                if match_count:
                    key = match_count.group(1)
                    value = int(match_count.group(2))
                    if key in stats_dict:
                        stats_dict[key] = value
                    continue

                match_avg = re.match(r"(rocksdb\.[\w\.]+)\s+AVG\s*:\s*([\d\.]+)", line)
                if match_avg:
                    key = match_avg.group(1)
                    value = float(match_avg.group(2))
                    if key in stats_dict:
                        stats_dict[key] = value

                # Latency / Throughput 정보 추출
                for w in target_workloads:
                    if w in line:
                        match_perf = re.search(
                            r"([\d\.]+)\s+micros/op\s+([\d\.]+)\s+ops/sec.*?([\d\.]+)\s+MB/s", line
                        )
                        if match_perf:
                            latency_throughput[w]["lat"] = float(match_perf.group(1))
                            latency_throughput[w]["ops"] = float(match_perf.group(2))
                            latency_throughput[w]["bytes"] = float(match_perf.group(3))

        stat_values = [stats_dict[key] if stats_dict[key] is not None else "" for key in target_keys]

        # latency/throughput 값 추가
        latency_values = []
        for w in target_workloads:
            lt = latency_throughput[w]
            latency_values += [
                lt["lat"] if lt["lat"] is not None else "",
                lt["ops"] if lt["ops"] is not None else "",
                lt["bytes"] if lt["bytes"] is not None else "",
            ]

        row = [workload, compaction_style, value_size, max_bytes_for_level_base] + stat_values + latency_values
        writer.writerow(row)

print("CSV 저장이 완료되었습니다.")

CSV 저장이 완료되었습니다.


In [35]:
import pandas as pd

# CSV 파일 읽기
df = pd.read_csv("./h1_leveled_output.csv")

# workload, compaction_style 기준 정렬
df_sorted = df.sort_values(by=["workload", "compaction_style"])

# 정렬된 데이터프레임 출력
df_sorted

Unnamed: 0,workload,compaction_style,value_size,max_bytes_for_level_base,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,fillrandom_latency_us,fillrandom_throughput_ops,fillrandom_throughput_bytes,overwrite_latency_us,overwrite_throughput_ops,overwrite_throughput_bytes
1,fillrandom,0,16384,4,,,,,,,,,,,,
4,fillrandom,0,4096,4,1000000.0,4128000000.0,3392791000.0,2264426000.0,4128000000.0,1000000.0,76.668,13043.0,51.1,,,
5,fillrandom,0,1024,4,1000000.0,1056000000.0,526336500.0,554192600.0,1056000000.0,1000000.0,17.585,56838.0,56.4,,,
0,overwrite,0,1024,4,1000000.0,1056000000.0,542321600.0,554233100.0,1056000000.0,1000000.0,,,,20.251,49349.0,48.9
2,overwrite,0,4096,4,1000000.0,4128000000.0,3402204000.0,2264465000.0,4128000000.0,1000000.0,,,,63.033,15864.0,62.2
3,overwrite,0,16384,4,,,,,,,,,,,,


In [36]:
import pandas as pd

df_waf = df.copy()

# compaction_style 매핑
style_map = {0: 'leveled', 1: 'universal'}
df_waf['compaction_style'] = df['compaction_style'].map(style_map)

# value_size 매핑
value_size_map = {1024: '1KB', 4096: '4KB', 16384: '16KB'}
df_waf['value_size'] = df['value_size'].map(value_size_map)

# WAF 계산: wal.bytes가 0인 경우는 NaN 처리
df_waf['WAF'] = df.apply(
    lambda row: (row['rocksdb.flush.write.bytes'] + row['rocksdb.compact.write.bytes'] + row['rocksdb.bytes.written']) / row['rocksdb.bytes.written'],
    axis=1
)

# _latency_us 컬럼들을 하나로 합치기
latency_columns = [col for col in df_waf.columns if '_latency_us' in col]
df_waf['latency'] = df_waf[latency_columns].sum(axis=1)  # 합산 예시

# _throughput_ops 컬럼들을 하나로 합치기
throughput_ops_columns = [col for col in df_waf.columns if '_throughput_ops' in col]
df_waf['throughput_ops'] = df_waf[throughput_ops_columns].sum(axis=1)  # 합산 예시

# _throughput_bytes 컬럼들을 하나로 합치기
throughput_bytes_columns = [col for col in df_waf.columns if '_throughput_bytes' in col]
df_waf['throughput_bytes'] = df_waf[throughput_bytes_columns].sum(axis=1)  # 합산 예시

# 기존 컬럼들 삭제
df_waf.drop(columns=latency_columns + throughput_ops_columns + throughput_bytes_columns, inplace=True)

# 결과 저장 (선택)
df_waf.to_csv('modified_h1_leveled_output.csv', index=False)

In [37]:
# workload, compaction_style 기준 정렬
df_waf_sorted = df_waf.sort_values(by=["workload", "compaction_style"])
df_waf_sorted

Unnamed: 0,workload,compaction_style,value_size,max_bytes_for_level_base,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,WAF,latency,throughput_ops,throughput_bytes
1,fillrandom,leveled,16KB,4,,,,,,,,0.0,0.0,0.0
4,fillrandom,leveled,4KB,4,1000000.0,4128000000.0,3392791000.0,2264426000.0,4128000000.0,1000000.0,2.37045,76.668,13043.0,51.1
5,fillrandom,leveled,1KB,4,1000000.0,1056000000.0,526336500.0,554192600.0,1056000000.0,1000000.0,2.023228,17.585,56838.0,56.4
0,overwrite,leveled,1KB,4,1000000.0,1056000000.0,542321600.0,554233100.0,1056000000.0,1000000.0,2.038404,20.251,49349.0,48.9
2,overwrite,leveled,4KB,4,1000000.0,4128000000.0,3402204000.0,2264465000.0,4128000000.0,1000000.0,2.37274,63.033,15864.0,62.2
3,overwrite,leveled,16KB,4,,,,,,,,0.0,0.0,0.0


## Universal Log

In [38]:
import csv
import re
import os

log_dir = "./universal_log_data"
csv_output_path = "./h1_universal_output.csv"

# 정규표현식을 이용해 파일 이름에서 정보 추출
filename_pattern = r"h1_(\w+)_cs(\d+)_size(\d+).log"

target_keys = [
    "rocksdb.number.keys.written",
    "rocksdb.bytes.written",
    "rocksdb.compact.write.bytes",
    "rocksdb.flush.write.bytes",
    "rocksdb.wal.bytes",
    "rocksdb.write.wal",
]

target_workloads = ["fillrandom", "overwrite"]

with open(csv_output_path, mode="w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    header = ["workload", "compaction_style", "value_size", "max_bytes_for_level_base"] + target_keys

    for w in target_workloads:
        header += [
            f"{w}_latency_us", 
            f"{w}_throughput_ops", 
            f"{w}_throughput_bytes"
        ]

    writer.writerow(header)

    for log_file_name in os.listdir(log_dir):
        if not log_file_name.endswith(".log"):
            continue

        match = re.match(filename_pattern, log_file_name)
        if not match:
            print(f"파일 이름 형식이 올바르지 않아 건너뜀: {log_file_name}")
            continue

        workload = match.group(1)
        compaction_style = int(match.group(2))
        value_size = int(match.group(3))

        stats_dict = {key: None for key in target_keys}
        latency_throughput = {w: {"lat": None, "ops": None, "bytes": None} for w in target_workloads}
        log_file_path = os.path.join(log_dir, log_file_name)

        with open(log_file_path, "r") as f:
            for line in f:
                line = line.strip()

                match_count = re.match(r"(rocksdb\.[\w\.]+)\s+COUNT\s*:\s*(\d+)", line)
                if match_count:
                    key = match_count.group(1)
                    value = int(match_count.group(2))
                    if key in stats_dict:
                        stats_dict[key] = value
                    continue

                match_avg = re.match(r"(rocksdb\.[\w\.]+)\s+AVG\s*:\s*([\d\.]+)", line)
                if match_avg:
                    key = match_avg.group(1)
                    value = float(match_avg.group(2))
                    if key in stats_dict:
                        stats_dict[key] = value

                # Latency / Throughput 정보 추출
                for w in target_workloads:
                    if w in line:
                        match_perf = re.search(
                            r"([\d\.]+)\s+micros/op\s+([\d\.]+)\s+ops/sec.*?([\d\.]+)\s+MB/s", line
                        )
                        if match_perf:
                            latency_throughput[w]["lat"] = float(match_perf.group(1))
                            latency_throughput[w]["ops"] = float(match_perf.group(2))
                            latency_throughput[w]["bytes"] = float(match_perf.group(3))

        stat_values = [stats_dict[key] if stats_dict[key] is not None else "" for key in target_keys]

        # latency/throughput 값 추가
        latency_values = []
        for w in target_workloads:
            lt = latency_throughput[w]
            latency_values += [
                lt["lat"] if lt["lat"] is not None else "",
                lt["ops"] if lt["ops"] is not None else "",
                lt["bytes"] if lt["bytes"] is not None else "",
            ]

        row = [workload, compaction_style, value_size, max_bytes_for_level_base] + stat_values + latency_values
        writer.writerow(row)

print("CSV 저장이 완료되었습니다.")

CSV 저장이 완료되었습니다.


In [39]:
import pandas as pd

# CSV 파일 읽기
df = pd.read_csv("./h1_universal_output.csv")

# workload, compaction_style 기준 정렬
df_sorted = df.sort_values(by=["workload", "compaction_style"])

# 정렬된 데이터프레임 출력
df_sorted

Unnamed: 0,workload,compaction_style,value_size,max_bytes_for_level_base,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,fillrandom_latency_us,fillrandom_throughput_ops,fillrandom_throughput_bytes,overwrite_latency_us,overwrite_throughput_ops,overwrite_throughput_bytes
1,fillrandom,1,4096,4,1000000,4128000000,3540818396,2301488797,4128000000,1000000,74.644,13396.0,52.5,,,
3,fillrandom,1,16384,4,1000000,16417000000,23330374011,8996719499,16417000000,1000000,344.567,2902.0,45.4,,,
4,fillrandom,1,1024,4,1000000,1056000000,590356007,554189559,1056000000,1000000,13.143,76087.0,75.5,,,
0,overwrite,1,1024,4,1000000,1056000000,589472628,554212973,1056000000,1000000,,,,13.732,72819.0,72.2
2,overwrite,1,16384,4,1000000,16417000000,20922566009,8996726917,16417000000,1000000,,,,315.428,3170.0,49.6
5,overwrite,1,4096,4,1000000,4128000000,3254281401,2263874391,4128000000,1000000,,,,61.157,16351.0,64.1


In [40]:
import pandas as pd

df_waf = df.copy()

# compaction_style 매핑
style_map = {0: 'leveled', 1: 'universal'}
df_waf['compaction_style'] = df['compaction_style'].map(style_map)

# value_size 매핑
value_size_map = {1024: '1KB', 4096: '4KB', 16384: '16KB'}
df_waf['value_size'] = df['value_size'].map(value_size_map)

# WAF 계산: wal.bytes가 0인 경우는 NaN 처리
df_waf['WAF'] = df.apply(
    lambda row: (row['rocksdb.flush.write.bytes'] + row['rocksdb.compact.write.bytes'] + row['rocksdb.bytes.written']) / row['rocksdb.bytes.written'],
    axis=1
)

# _latency_us 컬럼들을 하나로 합치기
latency_columns = [col for col in df_waf.columns if '_latency_us' in col]
df_waf['latency'] = df_waf[latency_columns].sum(axis=1)  # 합산 예시

# _throughput_ops 컬럼들을 하나로 합치기
throughput_ops_columns = [col for col in df_waf.columns if '_throughput_ops' in col]
df_waf['throughput_ops'] = df_waf[throughput_ops_columns].sum(axis=1)  # 합산 예시

# _throughput_bytes 컬럼들을 하나로 합치기
throughput_bytes_columns = [col for col in df_waf.columns if '_throughput_bytes' in col]
df_waf['throughput_bytes'] = df_waf[throughput_bytes_columns].sum(axis=1)  # 합산 예시

# 기존 컬럼들 삭제
df_waf.drop(columns=latency_columns + throughput_ops_columns + throughput_bytes_columns, inplace=True)

# 결과 저장 (선택)
df_waf.to_csv('modified_h1_universal_output.csv', index=False)

In [41]:
# workload, compaction_style 기준 정렬
df_waf_sorted = df_waf.sort_values(by=["workload", "compaction_style"])
df_waf_sorted

Unnamed: 0,workload,compaction_style,value_size,max_bytes_for_level_base,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,WAF,latency,throughput_ops,throughput_bytes
1,fillrandom,universal,4KB,4,1000000,4128000000,3540818396,2301488797,4128000000,1000000,2.415288,74.644,13396.0,52.5
3,fillrandom,universal,16KB,4,1000000,16417000000,23330374011,8996719499,16417000000,1000000,2.969123,344.567,2902.0,45.4
4,fillrandom,universal,1KB,4,1000000,1056000000,590356007,554189559,1056000000,1000000,2.08385,13.143,76087.0,75.5
0,overwrite,universal,1KB,4,1000000,1056000000,589472628,554212973,1056000000,1000000,2.083036,13.732,72819.0,72.2
2,overwrite,universal,16KB,4,1000000,16417000000,20922566009,8996726917,16417000000,1000000,2.822458,315.428,3170.0,49.6
5,overwrite,universal,4KB,4,1000000,4128000000,3254281401,2263874391,4128000000,1000000,2.336763,61.157,16351.0,64.1
