In [1]:
import os
import re
import pandas as pd

# 定义日志文件所在的目录
log_directory = "/home/ycwang/sglang/python/sglang"

# 用于存储数据的列表
data = []

# 定义正则表达式来匹配文件名和文件内容中的所需信息
filename_pattern = re.compile(r"sglang_exp_(\d+)_(random|length|reverse_length)_(prefill-reorder|decode-reorder|decode-different-len-reorder|U-distribution)\.log")
duration_pattern = re.compile(r"Benchmark duration \(s\):\s+(\d+\.\d+)")
request_throughput_pattern = re.compile(r"Request throughput \(req/s\):\s+(\d+\.\d+)")
output_throughput_pattern = re.compile(r"Output token throughput \(tok/s\):\s+(\d+\.\d+)")

# 遍历目录中的所有文件
for filename in os.listdir(log_directory):
    match = filename_pattern.match(filename)
    if match:
        # 提取length, policy, request pattern
        length = match.group(1)
        policy = match.group(2)
        request_pattern = match.group(3)

        # 读取文件内容
        with open(os.path.join(log_directory, filename), 'r') as file:
            content = file.read()
            
            # 使用正则表达式提取所需的值
            duration_match = duration_pattern.search(content)
            request_throughput_match = request_throughput_pattern.search(content)
            output_throughput_match = output_throughput_pattern.search(content)
            
            if duration_match and request_throughput_match and output_throughput_match:
                duration = float(duration_match.group(1))
                request_throughput = float(request_throughput_match.group(1))
                output_throughput = float(output_throughput_match.group(1))
                
                # 将提取到的数据添加到列表中
                data.append({
                    "length": length,
                    "policy": policy,
                    "request_pattern": request_pattern,
                    "duration": duration,
                    "request_throughput": request_throughput,
                    "output_throughput": output_throughput
                })

# 将数据转化为DataFrame
df = pd.DataFrame(data)

# 输出整理好的数据表
print(df)

# 如果需要，你也可以将结果保存为CSV文件
df.to_csv("benchmark_results.csv", index=False)

    length          policy               request_pattern  duration  \
0     2500          random                decode-reorder    482.98   
1     1000          length                U-distribution     40.86   
2     1500  reverse_length  decode-different-len-reorder    468.40   
3     2000          random  decode-different-len-reorder   1156.37   
4     1500          random  decode-different-len-reorder    507.07   
..     ...             ...                           ...       ...   
97    1500          length  decode-different-len-reorder    478.07   
98    4500          length                U-distribution    254.46   
99    1500          random                U-distribution     67.80   
100   3500          length                U-distribution    186.37   
101   5000  reverse_length               prefill-reorder    994.67   

     request_throughput  output_throughput  
0                  5.18              25.88  
1                 24.48            1568.42  
2                  3.20 

In [5]:
import os
import re
import pandas as pd

# 定义日志文件所在的目录
log_directory = "/home/ycwang/sglang/python/sglang"

# 用于存储数据的列表
data = []

# 定义正则表达式来匹配文件名和文件内容中的所需信息
filename_pattern = re.compile(r"sglang_exp_(\d+)_(random|length|reverse_length)_(prefill-reorder|decode-reorder|decode-different-len-reorder|U-distribution)\.log")
duration_pattern = re.compile(r"Benchmark duration \(s\):\s+(\d+\.\d+)")
request_throughput_pattern = re.compile(r"Request throughput \(req/s\):\s+(\d+\.\d+)")
output_throughput_pattern = re.compile(r"Output token throughput \(tok/s\):\s+(\d+\.\d+)")

# 遍历目录中的所有文件
for filename in os.listdir(log_directory):
    match = filename_pattern.match(filename)
    if match:
        # 提取length, policy, request pattern
        length = match.group(1)
        policy = match.group(2)
        request_pattern = match.group(3)

        # 读取文件内容
        with open(os.path.join(log_directory, filename), 'r') as file:
            content = file.read()
            
            # 使用正则表达式提取所需的值
            duration_match = duration_pattern.search(content)
            request_throughput_match = request_throughput_pattern.search(content)
            output_throughput_match = output_throughput_pattern.search(content)
            
            if duration_match and request_throughput_match and output_throughput_match:
                duration = float(duration_match.group(1))
                request_throughput = float(request_throughput_match.group(1))
                output_throughput = float(output_throughput_match.group(1))
                
                # 将提取到的数据添加到列表中
                data.append({
                    "length": length,
                    "policy": policy,
                    "request_pattern": request_pattern,
                    "duration": duration,
                    "request_throughput": request_throughput,
                    "output_throughput": output_throughput
                })

# 将数据转化为DataFrame
df = pd.DataFrame(data)

# 对数据进行排序，以便于对比
df.sort_values(by=["length", "request_pattern", "policy"], inplace=True)

# 计算每个分组的speedup
df['speedup'] = df.groupby(["length", "request_pattern"])['duration'].transform(lambda x: x / x.min())

# 按照 length 和 request_pattern 分组，并显示结果
grouped_df = df.groupby(["length", "request_pattern"])

# 输出分组后的数据
for (length, request_pattern), group in grouped_df:
    print(f"\nLength: {length}, Request Pattern: {request_pattern}")
    print(group[["policy", "duration", "speedup", "request_throughput", "output_throughput"]])

# 如果需要保存为CSV文件
df.to_csv("benchmark_results_grouped_with_speedup.csv", index=False)


Length: 1000, Request Pattern: U-distribution
             policy  duration   speedup  request_throughput  output_throughput
1            length     40.86  1.022011               24.48            1568.42
100          random     40.58  1.015008               24.64            1579.22
55   reverse_length     39.98  1.000000               25.01            1602.97

Length: 1000, Request Pattern: decode-different-len-reorder
            policy  duration   speedup  request_throughput  output_throughput
22          length    149.71  1.000000                6.68            3343.09
72          random    151.20  1.009953                6.61            3310.16
87  reverse_length    154.03  1.028856                6.49            3249.37

Length: 1000, Request Pattern: decode-reorder
            policy  duration   speedup  request_throughput  output_throughput
38          length     58.08  1.000000               17.22              86.09
30          random     58.17  1.001550               17.19   