## 结果可视化分析

In [None]:
import math
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# --- Experiment configuration -------------------------------------------
# Directory holding the per-run log files of one experiment.
log_dir = '/Users/yekai/github/saved_out/logs/finalexp/PAN_VGGFace2_r3p9p10_eval0_idx50_lambdaS_1e-4_omiga0599_total120_20250401'

# File-name prefix of the logs to parse (alternative prefix kept for reference).
# head = "exp_data_"
head = "output_"

# The last path component of the log directory names the experiment.
dir_name = os.path.split(log_dir)[1]

# Names of the metrics to pull out of each log.  The final entry is the
# experiment duration, which is derived from the progress bar rather than
# read as a "name value" pair, so it must stay last.
metrics = [
    # image-quality and identity-matching statistics
    'BRISQUE_mean',
    'BRISQUE_std',
    'CLIPIQA_mean',
    'CLIPIQA_std',
    'CLIP_Face_IQA_mean',
    'CLIP_Face_IQA_std',
    'CLIP_IQAC_mean',
    'CLIP_IQAC_std',
    'IMS_CLIP_ViT-B/32_mean',
    'IMS_CLIP_ViT-B/32_std',
    'IMS_VGG-Face_cosine_mean',
    'IMS_VGG-Face_cosine_std',
    'LIQE_Quality_mean',
    'LIQE_Quality_std',
    'LIQE_Scene_Human_mean',
    'LIQE_Scene_Human_std',
    'SDS_mean',
    'SDS_std',
    # training-related values
    'dreambooth train loss',
    'loss',
    'lr',
    # perturbation-related values
    'max_noise_r',
    'noise_L0',
    'ciede2000_score',
    'pix_change_mean',
    'change_area_mean',
    # duration (handled separately, keep last)
    'experiment_time_minutes',
]

# File names look like "<head><parameters>-<timestamp>.log"; group 1 is the
# parameter string, group 2 the numeric timestamp.
param_pattern = re.compile(head + r'(.*?)-(\d+)\.log')

# Parsed results, keyed by parameter string; each value is a list with one
# result dict per log file.
data = {}

# Matches a finished progress-bar line: both counters must be equal
# (back-reference \1) and the elapsed time directly follows the opening "[".
# Group 1 is the step count, group 2 the elapsed time as m:ss or h:mm:ss.
# Earlier, looser variant kept for reference:
# time_pattern = re.compile(r'meta poison with model ensemble:.*?(\d+)/\1.*?(\d+:\d+:\d+)')
time_pattern = r"meta poison with model ensemble: 100%\|[^\|]+\| (\d+)/\1 \[(\d{1,2}:\d{2}(?::\d{2})?)"

def _elapsed_minutes(log_text, pattern):
    """Return the elapsed time of the last completed progress bar, in minutes.

    Args:
        log_text: Full text of one log file.
        pattern: Regex (string or compiled) whose second group captures the
            elapsed time as ``m:ss`` or ``h:mm:ss``.

    Returns:
        The elapsed time in minutes as a number, or ``None`` when the
        pattern does not occur in ``log_text``.
    """
    bars = re.findall(pattern, log_text)
    if not bars:
        return None
    # Each match is a (step_count, elapsed) tuple.  Take the LAST one: the
    # previous hard-coded index 1 picked the second match and raised
    # IndexError for logs containing a single completed bar.
    fields = [int(part) for part in bars[-1][1].split(':')]
    if len(fields) == 3:
        hours, mins, secs = fields
    else:
        hours = 0
        mins, secs = fields
    return hours * 60 + mins + secs / 60


def _extract_metrics(log_text, metric_names):
    """Collect ``name <whitespace> number`` values for the given metric names.

    Args:
        log_text: Full text of one log file.
        metric_names: Names to look for; the first occurrence of each wins.

    Returns:
        Dict mapping each found name (with ``/`` replaced by ``_``) to its
        value as a float.  Names that are absent, or whose captured text is
        not a valid number, are left out.
    """
    values = {}
    for name in metric_names:
        # Escape the name so it is always matched literally.
        # NOTE(review): a short name such as 'loss' also matches at the end
        # of a longer one ('dreambooth train loss') -- confirm against the
        # actual log layout.
        found = re.search(fr'{re.escape(name)}\s+([-\d.]+)', log_text)
        if found is None:
            continue
        try:
            values[name.replace('/', '_')] = float(found.group(1))
        except ValueError:
            # Captured text such as "-" or "." is not a number.  Skip only
            # this metric instead of abandoning all remaining ones.
            continue
    return values


# Parse every "<head><parameters>-<timestamp>.log" file in the log directory
# and group the extracted results by parameter string.
for filename in os.listdir(log_dir):
    if not filename.endswith('.log'):
        continue
    name_match = param_pattern.match(filename)
    if name_match is None:
        continue
    param_str = name_match.group(1)  # parameter part of the file name
    timestamp = name_match.group(2)  # timestamp part (not used below)

    # Read the whole log file.
    with open(os.path.join(log_dir, filename), 'r') as f:
        content = f.read()

    # Experiment duration, taken from the last completed progress bar.
    total_time_minutes = _elapsed_minutes(content, time_pattern)
    if total_time_minutes is None:
        print('No time found in log file:', filename)
        total_time_minutes = 0

    # The last entry of `metrics` is the duration, which is handled above.
    result = _extract_metrics(content, metrics[:-1])
    result['experiment_time_minutes'] = total_time_minutes

    # Round max_noise_r up to the next integer.
    if 'max_noise_r' in result:
        result['max_noise_r'] = math.ceil(result['max_noise_r'])

    # Append to the list for this parameter combination, creating it on
    # first use.
    data.setdefault(param_str, []).append(result)

# Averaging and visualisation follow the same steps as before.

In [None]:
def sort_df_by_numeric_values(df):
    """Return the rows of ``df`` ordered by the numbers embedded in its index.

    Every index label is scanned for decimal numbers (``\\d+\\.?\\d*``); the
    resulting list of floats is the sort key, so labels are compared number
    by number, left to right, in ascending order.  Rows with equal keys keep
    their original relative order.

    Args:
        df: DataFrame whose index labels contain the numbers to sort by.
            It is not modified.

    Returns:
        A new DataFrame with the same columns and the rows reordered.
    """
    # Build one key per row without adding a helper column to `df`, so the
    # caller's frame keeps exactly its original columns.
    sort_keys = [
        [float(token) for token in re.findall(r'\d+\.?\d*', str(label))]
        for label in df.index
    ]
    # Lists compare lexicographically; sorted() is stable.
    row_order = sorted(range(len(sort_keys)), key=sort_keys.__getitem__)
    return df.iloc[row_order]


def trimmed_mean(series):
    """Return the mean of ``series`` after discarding one minimum and one maximum.

    Missing values are removed first, so a NaN (which sorts last) is never
    mistaken for the highest value.  When two or fewer values remain,
    nothing is trimmed and their plain mean is returned.

    Args:
        series: One metric column of a per-parameter results DataFrame.

    Returns:
        The trimmed mean, or NaN when the column has no valid values.
    """
    ordered = series.dropna().sort_values()
    if len(ordered) > 2:
        # Positional slice: drop the first (lowest) and last (highest) value.
        return ordered.iloc[1:-1].mean()
    return ordered.mean()


# Per parameter combination: trimmed mean of every metric across its runs.
avg_data = {}
for param_str, experiments in data.items():
    # One row per run, one column per metric.
    df = pd.DataFrame(experiments)
    avg_data[param_str] = df.apply(trimmed_mean).to_dict()

# Turn the averages into a DataFrame (one row per parameter combination)
# and order the rows by the numbers contained in the parameter strings.
avg_df = pd.DataFrame.from_dict(avg_data, orient='index')
avg_df_sorted = sort_df_by_numeric_values(avg_df)
# Number of distinct parameter combinations.
test_kind_num = len(data)

# Columns to show in the summary table, in display order.  Names use "_"
# where the metric name in the log has "/" (e.g. ViT-B/32 -> ViT-B_32).
show_metrics = ['IMS_VGG-Face_cosine_mean',
                'ciede2000_score', #
                'SDS_mean', 
                'CLIP_IQAC_mean',
                # 'max_noise_r',#
                'LIQE_Quality_mean',
                # 'pix_change_mean',#
                'IMS_CLIP_ViT-B_32_mean',
                'BRISQUE_mean',

                ]
# Output directory for this experiment's charts and tables, named after
# the log directory.
save_path = os.path.join("/Users/yekai/github/saved_out/result_save",dir_name)
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_path, exist_ok=True)

# for metric in show_metrics:
#     pic_path = os.path.join(save_path, metric + '.png')
#     if os.path.exists(pic_path):
#         print(f"{metric} already exists, showing it")
#         # matplotlib  show saved png file
#         img = mpimg.imread(pic_path)
#         imgplot = plt.imshow(img)
#         plt.axis('off')
#         plt.show()
#     else:
#         plt.figure(figsize=(15, test_kind_num*1),dpi=100)
#         avg_df_sorted[metric].plot(kind='barh')  # 更改为横向柱状图
#         plt.title(f'{metric}')
#         plt.ylabel('Parameter Settings')
#         plt.xlabel(f'Average {metric}')
#         plt.tight_layout()
#         plt.savefig(pic_path)
#         plt.show()
        

In [None]:
# Widen the pandas display limits so the whole table is shown:
# no row limit, no column limit, and a wide line to avoid column wrapping.
display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Keep only the selected metric columns, in the configured order.
show_df = avg_df_sorted[show_metrics]
show_df

In [None]:
# Write the displayed table to data.csv inside the experiment's output directory.
csv_path = os.path.join(save_path, 'data.csv')
show_df.to_csv(csv_path)