In [7]:
import csv
import re
import os

log_dir = "./log_data"
csv_output_path = "./h1_output.csv"

# 정규표현식을 이용해 파일 이름에서 정보 추출
filename_pattern = r"(\w+)_cs(\d+)_(\w+).log"

# 추출할 stat key 목록
target_keys = [
    "rocksdb.number.keys.written",
    "rocksdb.bytes.written",
    "rocksdb.compact.write.bytes",
    "rocksdb.flush.write.bytes",
    "rocksdb.wal.bytes",
    "rocksdb.write.wal",
]

# latency/throughput을 저장할 대상 워크로드 목록
target_workloads = ["fillrandom", "overwrite", "readwhilewriting", "read"]

# CSV 파일 열고 헤더 작성
with open(csv_output_path, mode="w", newline='') as csvfile:
    writer = csv.writer(csvfile)

    # 기본 헤더
    header = ["workload", "compaction_style", "config"] + target_keys

    # workload별 latency, throughput 추가
    for w in target_workloads:
        header += [
            f"{w}_latency_us", 
            f"{w}_throughput_ops", 
            f"{w}_throughput_bytes"
        ]

    writer.writerow(header)

    # 디렉토리 내 모든 .log 파일 처리
    for log_file_name in os.listdir(log_dir):
        if not log_file_name.endswith(".log"):
            continue

        match = re.match(filename_pattern, log_file_name)
        if not match:
            print(f"파일 이름 형식이 올바르지 않아 건너뜀: {log_file_name}")
            continue

        workload = match.group(1)
        compaction_style = int(match.group(2))
        configs = match.group(3)

        stats_dict = {key: None for key in target_keys}
        latency_throughput = {w: {"lat": None, "ops": None, "bytes": None} for w in target_workloads}
        log_file_path = os.path.join(log_dir, log_file_name)

        with open(log_file_path, "r") as f:
            for line in f:
                line = line.strip()

                # Stat 정보 추출
                match_stat = re.match(r"(rocksdb\.[\w\.]+) COUNT\s*:\s*(\d+)", line)
                if match_stat:
                    key = match_stat.group(1)
                    value = int(match_stat.group(2))
                    if key in stats_dict:
                        stats_dict[key] = value

                # Latency / Throughput 정보 추출
                for w in target_workloads:
                    if line.startswith(w):
                        # 예시: fillrandom   :     171.493 micros/op 5831 ops/sec 17.149 seconds 100000 operations;   91.2 MB/s
                        match_perf = re.search(
                            r"([\d\.]+)\s+micros/op\s+([\d\.]+)\s+ops/sec.*?([\d\.]+)\s+MB/s", line
                        )
                        if match_perf:
                            latency_throughput[w]["lat"] = float(match_perf.group(1))
                            latency_throughput[w]["ops"] = float(match_perf.group(2))
                            latency_throughput[w]["bytes"] = float(match_perf.group(3))

        row = [workload, compaction_style, configs] + [stats_dict[key] for key in target_keys]

        # workload별 latency/throughput 정보 추가
        for w in target_workloads:
            lt = latency_throughput[w]
            row += [lt["lat"], lt["ops"], lt["bytes"]]

        writer.writerow(row)

print("CSV 저장이 완료되었습니다.")

CSV 저장이 완료되었습니다.


In [8]:
import pandas as pd

# CSV 파일 읽기
df = pd.read_csv("./h1_output.csv")

# workload, compaction_style 기준 정렬
df_sorted = df.sort_values(by=["workload", "compaction_style"])

# 정렬된 데이터프레임 출력
df_sorted

Unnamed: 0,workload,compaction_style,config,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,fillrandom_latency_us,...,fillrandom_throughput_bytes,overwrite_latency_us,overwrite_throughput_ops,overwrite_throughput_bytes,readwhilewriting_latency_us,readwhilewriting_throughput_ops,readwhilewriting_throughput_bytes,read_latency_us,read_throughput_ops,read_throughput_bytes
9,fillrandom,0,stable_4096,500000,2064000000,909146688,1104605088,2064000000,500000,32.67,...,119.9,,,,,,,,,
4,fillrandom,1,perf_4096,500000,2064000000,938107994,1104751825,0,0,27.459,...,142.8,,,,,,,,,
10,fillrandom,1,stable_4096,500000,2064000000,948039261,1105095275,2064000000,500000,30.982,...,126.5,,,,,,,,,
0,fillrandom,2,perf_4096,500000,2064000000,0,1104517339,0,0,10.955,...,358.0,,,,,,,,,
7,fillrandom,2,stable_4096,500000,2064000000,0,1104514199,2064000000,500000,18.277,...,214.6,,,,,,,,,
3,overwrite,0,stable_4096,500000,2064000000,928642689,1104800027,2064000000,500000,,...,,33.234,30046.0,117.8,,,,,,
5,overwrite,0,perf_4096,500000,2064000000,899581983,1104972461,0,0,,...,,25.775,38797.0,152.1,,,,,,
1,overwrite,1,stable_4096,500000,2064000000,874169405,1104783902,2064000000,500000,,...,,38.76,25697.0,100.8,,,,,,
6,overwrite,1,perf_4096,500000,2064000000,937020799,1104694067,0,0,,...,,29.564,33825.0,132.6,,,,,,
2,overwrite,2,perf_4096,500000,2064000000,0,1104125169,0,0,,...,,12.586,79450.0,311.6,,,,,,


In [9]:
df['compaction_style'].value_counts()

compaction_style
2    4
1    4
0    3
Name: count, dtype: int64

In [10]:
import pandas as pd

df_waf = df.copy()

# compaction_style 매핑
style_map = {0: 'leveled', 1: 'universal', 2: 'fifo'}
df_waf['compaction_style'] = df['compaction_style'].map(style_map)

# WAF 계산: wal.bytes가 0인 경우는 NaN 처리
df_waf['WAF'] = df.apply(
    lambda row: (row['rocksdb.flush.write.bytes'] + row['rocksdb.compact.write.bytes']) / row['rocksdb.bytes.written'],
    axis=1
)

# _latency_us 컬럼들을 하나로 합치기
latency_columns = [col for col in df_waf.columns if '_latency_us' in col]
df_waf['latency'] = df_waf[latency_columns].sum(axis=1)  # 합산 예시

# _throughput_ops 컬럼들을 하나로 합치기
throughput_ops_columns = [col for col in df_waf.columns if '_throughput_ops' in col]
df_waf['throughput_ops'] = df_waf[throughput_ops_columns].sum(axis=1)  # 합산 예시

# _throughput_bytes 컬럼들을 하나로 합치기
throughput_bytes_columns = [col for col in df_waf.columns if '_throughput_bytes' in col]
df_waf['throughput_bytes'] = df_waf[throughput_bytes_columns].sum(axis=1)  # 합산 예시

# 기존 컬럼들 삭제
df_waf.drop(columns=latency_columns + throughput_ops_columns + throughput_bytes_columns, inplace=True)

# 결과 저장 (선택)
df_waf.to_csv('modified_h1_output.csv', index=False)

In [11]:
df_waf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   workload                     11 non-null     object 
 1   compaction_style             11 non-null     object 
 2   config                       11 non-null     object 
 3   rocksdb.number.keys.written  11 non-null     int64  
 4   rocksdb.bytes.written        11 non-null     int64  
 5   rocksdb.compact.write.bytes  11 non-null     int64  
 6   rocksdb.flush.write.bytes    11 non-null     int64  
 7   rocksdb.wal.bytes            11 non-null     int64  
 8   rocksdb.write.wal            11 non-null     int64  
 9   WAF                          11 non-null     float64
 10  latency                      11 non-null     float64
 11  throughput_ops               11 non-null     float64
 12  throughput_bytes             11 non-null     float64
dtypes: float64(4), int64(6

In [12]:
# workload, compaction_style 기준 정렬
df_sorted = df_waf.sort_values(by=["workload", "compaction_style"])

# 정렬된 데이터프레임 출력
df_sorted

Unnamed: 0,workload,compaction_style,config,rocksdb.number.keys.written,rocksdb.bytes.written,rocksdb.compact.write.bytes,rocksdb.flush.write.bytes,rocksdb.wal.bytes,rocksdb.write.wal,WAF,latency,throughput_ops,throughput_bytes
0,fillrandom,fifo,perf_4096,500000,2064000000,0,1104517339,0,0,0.535134,10.955,91279.0,358.0
7,fillrandom,fifo,stable_4096,500000,2064000000,0,1104514199,2064000000,500000,0.535133,18.277,54712.0,214.6
9,fillrandom,leveled,stable_4096,500000,2064000000,909146688,1104605088,2064000000,500000,0.975655,32.67,30565.0,119.9
4,fillrandom,universal,perf_4096,500000,2064000000,938107994,1104751825,0,0,0.989758,27.459,36406.0,142.8
10,fillrandom,universal,stable_4096,500000,2064000000,948039261,1105095275,2064000000,500000,0.994736,30.982,32249.0,126.5
2,overwrite,fifo,perf_4096,500000,2064000000,0,1104125169,0,0,0.534944,12.586,79450.0,311.6
8,overwrite,fifo,stable_4096,500000,2064000000,0,1104760479,2064000000,500000,0.535252,19.996,50010.0,196.1
3,overwrite,leveled,stable_4096,500000,2064000000,928642689,1104800027,2064000000,500000,0.985195,33.234,30046.0,117.8
5,overwrite,leveled,perf_4096,500000,2064000000,899581983,1104972461,0,0,0.971199,25.775,38797.0,152.1
1,overwrite,universal,stable_4096,500000,2064000000,874169405,1104783902,2064000000,500000,0.958795,38.76,25697.0,100.8
