In [12]:
import csv
import re
import os

log_dir = "./log_data"
csv_output_path = "./h2_output.csv"

# 정규표현식을 이용해 파일 이름에서 정보 추출
filename_pattern = r"(\w+)_cs(\d+)_(\w+).log"

# 추출할 stat key 목록
target_keys = [
    "rocksdb.number.keys.written",
    "rocksdb.bytes.written",
    "rocksdb.compact.write.bytes",
    "rocksdb.flush.write.bytes",
    "rocksdb.wal.bytes",
    "rocksdb.write.wal",
]

# latency/throughput을 저장할 대상 워크로드 목록
target_workloads = ["fillrandom", "overwrite", "readwhilewriting", "read"]

# CSV 파일 열고 헤더 작성
with open(csv_output_path, mode="w", newline='') as csvfile:
    writer = csv.writer(csvfile)

    # 기본 헤더
    header = ["workload", "compaction_style", "config"] + target_keys

    # workload별 latency, throughput 추가
    for w in target_workloads:
        header += [
            f"{w}_latency_us", 
            f"{w}_throughput_ops", 
            f"{w}_throughput_bytes"
        ]

    writer.writerow(header)

    # 디렉토리 내 모든 .log 파일 처리
    for log_file_name in os.listdir(log_dir):
        if not log_file_name.endswith(".log"):
            continue

        match = re.match(filename_pattern, log_file_name)
        if not match:
            print(f"파일 이름 형식이 올바르지 않아 건너뜀: {log_file_name}")
            continue

        workload = match.group(1)
        compaction_style = int(match.group(2))
        configs = match.group(3)

        stats_dict = {key: None for key in target_keys}
        latency_throughput = {w: {"lat": None, "ops": None, "bytes": None} for w in target_workloads}
        log_file_path = os.path.join(log_dir, log_file_name)

        with open(log_file_path, "r") as f:
            for line in f:
                line = line.strip()

                # Stat 정보 추출
                match_stat = re.match(r"(rocksdb\.[\w\.]+) COUNT\s*:\s*(\d+)", line)
                if match_stat:
                    key = match_stat.group(1)
                    value = int(match_stat.group(2))
                    if key in stats_dict:
                        stats_dict[key] = value

                # Latency / Throughput 정보 추출
                for w in target_workloads:
                    if line.startswith(w):
                        # 예시: fillrandom   :     171.493 micros/op 5831 ops/sec 17.149 seconds 100000 operations;   91.2 MB/s
                        match_perf = re.search(
                            r"([\d\.]+)\s+micros/op\s+([\d\.]+)\s+ops/sec.*?([\d\.]+)\s+MB/s", line
                        )
                        if match_perf:
                            latency_throughput[w]["lat"] = float(match_perf.group(1))
                            latency_throughput[w]["ops"] = float(match_perf.group(2))
                            latency_throughput[w]["bytes"] = float(match_perf.group(3))

        row = [workload, compaction_style, configs] + [stats_dict[key] for key in target_keys]

        # workload별 latency/throughput 정보 추가
        for w in target_workloads:
            lt = latency_throughput[w]
            row += [lt["lat"], lt["ops"], lt["bytes"]]

        writer.writerow(row)

print("CSV 저장이 완료되었습니다.")

CSV 저장이 완료되었습니다.


In [13]:
import pandas as pd

# CSV 파일 읽기
df = pd.read_csv("./h2_output.csv")

# workload, compaction_style 기준 정렬
df_sorted = df.sort_values(by=["workload", "compaction_style"])

# 정렬된 데이터프레임 출력
df_sorted

Unnamed: 0,workload,compaction_style,config,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,fillrandom_latency_us,...,fillrandom_throughput_bytes,overwrite_latency_us,overwrite_throughput_ops,overwrite_throughput_bytes,readwhilewriting_latency_us,readwhilewriting_throughput_ops,readwhilewriting_throughput_bytes,read_latency_us,read_throughput_ops,read_throughput_bytes
0,fillrandom,0,perf,100000,1641700000,141354496,823240788,0,0,171.493,...,91.2,,,,,,,,,
2,fillrandom,0,stable,100000,1641700000,1063040346,823101149,1641700000,100000,224.795,...,69.6,,,,,,,,,
4,fillrandom,1,stable,100000,1641700000,1002593258,822519262,1641700000,100000,243.944,...,64.1,,,,,,,,,
8,fillrandom,1,perf,100000,1641700000,303681536,858545886,0,0,269.884,...,58.0,,,,,,,,,
3,fillrandom,2,perf,100000,1641700000,0,822797396,0,0,131.218,...,119.2,,,,,,,,,
13,fillrandom,2,stable,100000,1641700000,0,822837515,1641700000,100000,119.717,...,130.6,,,,,,,,,
12,overwrite,0,stable,100000,1641700000,1085882756,822364925,1641700000,100000,,...,,257.715,3880.0,60.7,,,,,,
17,overwrite,0,perf,100000,1641700000,168217600,822098004,0,0,,...,,201.758,4956.0,77.5,,,,,,
5,overwrite,1,stable,100000,1641700000,1001390441,822601511,1641700000,100000,,...,,249.902,4001.0,62.6,,,,,,
10,overwrite,1,perf,100000,1641700000,320281088,858279646,0,0,,...,,264.573,3779.0,59.1,,,,,,


In [14]:
df['compaction_style'].value_counts()

compaction_style
0    6
2    6
1    6
Name: count, dtype: int64

In [20]:
import pandas as pd

df_waf = df.copy()

# compaction_style 매핑
style_map = {0: 'leveled', 1: 'universal', 2: 'fifo'}
df_waf['compaction_style'] = df['compaction_style'].map(style_map)

# WAF 계산: wal.bytes가 0인 경우는 NaN 처리
df_waf['WAF'] = df.apply(
    lambda row: (row['rocksdb.flush.write.bytes'] + row['rocksdb.compact.write.bytes'] + row['rocksdb.bytes.written']) / row['rocksdb.bytes.written'],
    axis=1
)

# _latency_us 컬럼들을 하나로 합치기
latency_columns = [col for col in df_waf.columns if '_latency_us' in col]
df_waf['latency'] = df_waf[latency_columns].sum(axis=1)  # 합산 예시

# _throughput_ops 컬럼들을 하나로 합치기
throughput_ops_columns = [col for col in df_waf.columns if '_throughput_ops' in col]
df_waf['throughput_ops'] = df_waf[throughput_ops_columns].sum(axis=1)  # 합산 예시

# _throughput_bytes 컬럼들을 하나로 합치기
throughput_bytes_columns = [col for col in df_waf.columns if '_throughput_bytes' in col]
df_waf['throughput_bytes'] = df_waf[throughput_bytes_columns].sum(axis=1)  # 합산 예시

# 기존 컬럼들 삭제
df_waf.drop(columns=latency_columns + throughput_ops_columns + throughput_bytes_columns, inplace=True)

# 결과 저장 (선택)
df_waf.to_csv('modified_h2_output.csv', index=False)

In [21]:
df_waf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   workload                     18 non-null     object 
 1   compaction_style             18 non-null     object 
 2   config                       18 non-null     object 
 3   rocksdb.number.keys.written  18 non-null     int64  
 4   rocksdb.bytes.written        18 non-null     int64  
 5   rocksdb.compact.write.bytes  18 non-null     int64  
 6   rocksdb.flush.write.bytes    18 non-null     int64  
 7   rocksdb.wal.bytes            18 non-null     int64  
 8   rocksdb.write.wal            18 non-null     int64  
 9   WAF                          18 non-null     float64
 10  latency                      18 non-null     float64
 11  throughput_ops               18 non-null     float64
 12  throughput_bytes             18 non-null     float64
dtypes: float64(4), int64(6

In [22]:
# workload, compaction_style 기준 정렬
df_sorted = df_waf.sort_values(by=["workload", "compaction_style"])

# 정렬된 데이터프레임 출력
df_sorted

Unnamed: 0,workload,compaction_style,config,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,WAF,latency,throughput_ops,throughput_bytes
3,fillrandom,fifo,perf,100000,1641700000,0,822797396,0,0,1.501186,131.218,7620.0,119.2
13,fillrandom,fifo,stable,100000,1641700000,0,822837515,1641700000,100000,1.501211,119.717,8352.0,130.6
0,fillrandom,leveled,perf,100000,1641700000,141354496,823240788,0,0,1.587559,171.493,5831.0,91.2
2,fillrandom,leveled,stable,100000,1641700000,1063040346,823101149,1641700000,100000,2.148895,224.795,4448.0,69.6
4,fillrandom,universal,stable,100000,1641700000,1002593258,822519262,1641700000,100000,2.111721,243.944,4099.0,64.1
8,fillrandom,universal,perf,100000,1641700000,303681536,858545886,0,0,1.707941,269.884,3705.0,58.0
1,overwrite,fifo,stable,100000,1641700000,0,856930432,1641700000,100000,1.521977,133.894,7468.0,116.8
6,overwrite,fifo,perf,100000,1641700000,0,822333524,0,0,1.500904,146.222,6838.0,107.0
12,overwrite,leveled,stable,100000,1641700000,1085882756,822364925,1641700000,100000,2.162361,257.715,3880.0,60.7
17,overwrite,leveled,perf,100000,1641700000,168217600,822098004,0,0,1.603226,201.758,4956.0,77.5
