In [1]:
import pandas as pd
import glob
import os
from collections import defaultdict

In [2]:
# Directory holding one '<dataset>_results.csv' file per dataset.
BASE_DIR = '/data4/oldrain123/oldrain123/results/ablation_results/classifiers'
results_pattern = os.path.join(BASE_DIR, '*_results.csv')
# Materialise the matches as a list (order is whatever the filesystem returns).
files = list(glob.iglob(results_pattern))

In [3]:
# Map dataset name (file name without the '_results.csv' suffix) to its results
# frame. A 'Rank' column is dropped when the file has one; files without it are
# loaded unchanged.
raw = {
    os.path.basename(csv_path).replace('_results.csv', ''):
        pd.read_csv(csv_path).drop(columns=['Rank'], errors='ignore')
    for csv_path in files
}

In [4]:
# Names excluded from the comparison. Note: `remove_methods` is not referenced
# again in the visible cells; methods are effectively filtered by `fixed_order`.
remove_methods = {'AdaBoost', 'RUSBoost', 'SMOTEBoost', 'OUBoost'}
remove_classifiers = {'Default', 'SVM'}
remove_metrics = {'mAP'}

# Column order for every table; only the methods that occur in the data are kept.
fixed_order = ['Original', 'ROS', 'SMOTE', 'bSMOTE', 'ADASYN', 'MWMOTE', 'CTGAN', 'Ours']
all_methods = set().union(*(frame['Method'] for frame in raw.values()))
methods = [name for name in fixed_order if name in all_methods]

# Every metric / classifier seen in any dataset, minus the excluded ones.
all_metrics = set().union(*(frame['Metric'] for frame in raw.values())) - remove_metrics
all_classifiers = set().union(*(frame['Classifier'] for frame in raw.values())) - remove_classifiers


def _new_classifier_tables():
    """Return the inner mapping classifier -> DataFrame (empty frame by default)."""
    return defaultdict(pd.DataFrame)


# dfs[metric][classifier] -> table, filled in by the next cell.
dfs = defaultdict(_new_classifier_tables)

In [5]:
# Build one (dataset x method) table per metric/classifier pair.
#
# `all_metrics` and `all_classifiers` are sets, whose iteration order changes
# between kernel restarts; iterating in sorted order keeps the key order of
# `dfs` (and everything derived from it: summary tables, plot order) stable.
# key=str keeps the sort well-defined even if a label is not a string.
dataset_names = sorted(raw.keys())
for metric in sorted(all_metrics, key=str):
    for clf in sorted(all_classifiers, key=str):
        # columns=methods applies the fixed method order; cells start as NaN
        table = pd.DataFrame(index=dataset_names, columns=methods, dtype=float)
        for ds, df in raw.items():
            # Rows of this dataset for the current metric/classifier, restricted
            # to the methods that have a column in the table.
            sel = df[
                (df['Metric'] == metric)
                & (df['Classifier'] == clf)
                & df['Method'].isin(methods)
            ]
            # If a method appears more than once, the last row wins (as before).
            for method, value in zip(sel['Method'], sel['Value']):
                table.at[ds, method] = value
        dfs[metric][clf] = table

In [6]:
# Glob pattern matching every CSV in the beta-ablation results folder.
beta_res_path = '/data4/oldrain123/oldrain123/results/ablation_results/beta8/*.csv'
# Materialise the matches as a list (order is whatever the filesystem returns).
files_all = list(glob.iglob(beta_res_path))

In [7]:
# Build, for every beta value, a copy of the baseline tables in `dfs` whose
# 'Ours' column is overwritten with the ablation result for that beta.
#
# Expected file name: <dataset>_results_gaussian_beta_<beta>.csv
beta_marker = '_results_gaussian_beta_'
metrics_tables_by_beta = {}

# Sorted so that the processing order (dict order, and the `df_table` variable
# left over after the loop) does not depend on the arbitrary glob order.
for fp in sorted(files_all):
    # Extract dataset name and beta from the file name
    base = os.path.basename(fp)
    dataset_name, found, beta_part = base.partition(beta_marker)
    if not found:
        # The glob is '*.csv', so unrelated CSVs may be present; skip them
        # explicitly instead of failing on the tuple unpack.
        print(f"Skipping {base}: file name does not contain '{beta_marker}'")
        continue
    beta_str = beta_part.replace('.csv', '')
    try:
        beta_val = float(beta_str)
    except ValueError:
        print(f"Skipping {base}: cannot parse beta value from '{beta_str}'")
        continue

    # First time this beta is seen: copy every baseline table from dfs
    # (DataFrame.copy() is a deep copy, so the baseline stays untouched).
    if beta_val not in metrics_tables_by_beta:
        metrics_tables_by_beta[beta_val] = {
            metric: {clf: df.copy() for clf, df in clf_dict.items()}
            for metric, clf_dict in dfs.items()
        }

    df_ablation = pd.read_csv(fp)
    tbls = metrics_tables_by_beta[beta_val]

    # Only the 'Ours' rows of the ablation file are used; each one overwrites
    # the 'Ours' cell of the matching metric/classifier table.
    ours_rows = df_ablation[df_ablation['Method'] == 'Ours']
    for metric, classifier, value in zip(
        ours_rows['Metric'], ours_rows['Classifier'], ours_rows['Value']
    ):
        # Update only tables and datasets that already exist in the baseline
        if metric in tbls and classifier in tbls[metric]:
            df_table = tbls[metric][classifier]
            if dataset_name in df_table.index:
                df_table.at[dataset_name, 'Ours'] = value

In [8]:
# Spot check: `df_table` is the loop variable left over from the merge cell above,
# i.e. the last metric/classifier table that was looked up while updating 'Ours'.
df_table

Unnamed: 0,Original,ROS,SMOTE,bSMOTE,ADASYN,MWMOTE,CTGAN,Ours
abalone19,0.0,0.034,0.03025,0.0,0.02676,0.0148,0.00682,0.0
abalone9-18,0.26607,0.31012,0.33356,0.36135,0.33572,0.34786,0.34758,0.34217
arrhythmia,0.0,0.10634,0.18433,0.13001,0.23433,0.16067,0.0,0.03667
australian,0.84906,0.85077,0.84992,0.84895,0.85048,0.84992,0.8534,0.85119
breast-cancer,0.96328,0.96303,0.96329,0.95954,0.95963,0.96347,0.96138,0.96329
cleveland-0_vs_4,0.36335,0.44934,0.56466,0.56799,0.56633,0.44467,0.46401,0.559
coil_2000,0.09671,0.14017,0.12345,0.12311,0.12282,0.12543,0.09439,0.0926
diabetes,0.64123,0.65616,0.66863,0.66177,0.67087,0.67202,0.64928,0.6665
ecoli-0-3-4_vs_5,0.81734,0.83567,0.84766,0.805,0.83,0.826,0.80335,0.839
ecoli3,0.56482,0.57885,0.62623,0.59206,0.6372,0.60163,0.61555,0.59051


In [None]:
def _format_rank_table(df):
    """Turn a numeric (dataset x method) table into a 'value (rank)' string table.

    Ranks are computed per row, descending (largest value gets rank 1), with ties
    sharing the smallest rank. Two summary rows are appended:
    'Count 1st(2nd)' (how often each method ranks 1st, and 2nd in parentheses) and
    'Average Rank' (mean rank per method, rounded to 2 decimals, as a string).
    Cells with no value are shown as "-" and are ignored by the summaries.
    """
    # 1) Row-wise descending rank (ties -> minimum rank); NaN cells stay NaN
    df_rank = df.rank(axis=1, method='min', ascending=False)

    # 2) 'value (rank)' formatting. The nullable Int64 dtype is used because a
    #    plain int cast raises as soon as a single cell is NaN.
    rank_str = df_rank.astype('Int64').astype(str)
    df_fmt = df.round(3).astype(str) + " (" + rank_str + ")"
    df_fmt = df_fmt.where(df.notna(), "-")

    # 3) Per method: number of rank-1 and rank-2 results
    count1 = (df_rank == 1).sum(axis=0)
    count2 = (df_rank == 2).sum(axis=0)
    count_row = count1.astype(str) + " (" + count2.astype(str) + ")"

    # 4) Per method: average rank
    avg_rank = df_rank.mean(axis=0).round(2).astype(str)

    # 5) Append the summary rows
    df_fmt.loc['Count 1st(2nd)'] = count_row
    df_fmt.loc['Average Rank'] = avg_rank
    return df_fmt


for beta_val, metric_dict in metrics_tables_by_beta.items():
    for metric_name, clf_dict in metric_dict.items():
        for clf_name, df in clf_dict.items():
            # A table that already has the summary row was formatted by an
            # earlier run of this cell; skip it so re-running is harmless.
            if 'Average Rank' in df.index:
                continue
            # Replace the numeric table with its formatted version (assigning to
            # an existing key does not change the dict size, so this is safe
            # while iterating).
            metrics_tables_by_beta[beta_val][metric_name][clf_name] = _format_rank_table(df)


In [None]:
# Inspect the full nested mapping: beta -> metric -> classifier -> table.
metrics_tables_by_beta

In [None]:
import re
import pandas as pd

# Summarise how 'Ours' performs for each beta: one table each for its average
# rank, its number of rank-1 results and its number of rank-2 results, indexed
# by (Classifier, Metric) with one column per beta.

# 1) Prepare index lists (metric/classifier keys are taken from the first beta)
betas = sorted(metrics_tables_by_beta.keys())
first_beta_tables = next(iter(metrics_tables_by_beta.values()))
metrics = list(first_beta_tables.keys())
classifiers = list(first_beta_tables[metrics[0]].keys())

# 2) Create DataFrames for average rank, count1, count2.
#    The count tables are initialised with 0: an empty frame cannot hold its
#    all-NaN initial state as integers, so dtype=int without data does not give
#    an integer table. Every cell is overwritten in step 3.
index = pd.MultiIndex.from_product([classifiers, metrics], names=['Classifier', 'Metric'])
avg_rank_df = pd.DataFrame(index=index, columns=betas, dtype=float)
count1_df = pd.DataFrame(0, index=index, columns=betas, dtype=int)
count2_df = pd.DataFrame(0, index=index, columns=betas, dtype=int)

# 3) Populate the tables from the summary rows of the formatted tables
count_pattern = re.compile(r'(\d+)\s*\((\d+)\)')
for beta in betas:
    for metric in metrics:
        for clf in classifiers:
            df_fmt = metrics_tables_by_beta[beta][metric][clf]

            # Average rank is stored as a string in the 'Average Rank' row
            avg_val = float(df_fmt.at['Average Rank', 'Ours'])
            avg_rank_df.at[(clf, metric), beta] = avg_val

            # Counts are stored as "<first> (<second>)", e.g. "9 (3)"
            cnt_str = df_fmt.at['Count 1st(2nd)', 'Ours']
            match = count_pattern.match(cnt_str)
            if match:
                c1, c2 = int(match.group(1)), int(match.group(2))
            else:
                # Unparseable summary cell: fall back to zero counts
                c1, c2 = 0, 0
            count1_df.at[(clf, metric), beta] = c1
            count2_df.at[(clf, metric), beta] = c2

# Display results
print("Average Rank by Beta:")
display(avg_rank_df)

print("Count of Rank 1 by Beta:")
display(count1_df)

print("Count of Rank 2 by Beta:")
display(count2_df)


In [None]:
import matplotlib.pyplot as plt

# metrics_tables_by_beta is expected to hold the formatted string tables here,
# so the numeric summaries for 'Ours' are parsed back out of the two summary rows.
betas = sorted(metrics_tables_by_beta)
first_tables = next(iter(metrics_tables_by_beta.values()))
metrics = list(first_tables)
classifiers = list(first_tables[metrics[0]])

# One record per (beta, metric, classifier)
count_pattern = r'(\d+)\s*\((\d+)\)'
records = []

for beta in betas:
    tables_for_beta = metrics_tables_by_beta[beta]
    for metric in metrics:
        for clf in classifiers:
            summary = tables_for_beta[metric][clf]
            # Counts are stored as "<first> (<second>)", e.g. "9 (3)";
            # an unparseable cell falls back to zero counts.
            parsed = re.match(count_pattern, summary.at['Count 1st(2nd)', 'Ours'])
            if parsed:
                first_count = int(parsed.group(1))
                second_count = int(parsed.group(2))
            else:
                first_count = second_count = 0
            # Average rank is stored as a string in the 'Average Rank' row
            mean_rank = float(summary.at['Average Rank', 'Ours'])
            records.append(dict(
                beta=beta,
                metric=metric,
                classifier=clf,
                avg_rank=mean_rank,
                count1=first_count,
                count2=second_count,
            ))

df_summary = pd.DataFrame(records)

# Visualization: one figure per metric with two panels, both plotted against beta
# on a symlog x-axis (so beta = 0 can be shown next to the positive values).
eps = 1e-4  # linthresh: the axis is linear on [-eps, +eps]
# Major ticks: 0.0 plus every positive beta (identical for all panels and metrics)
ticks = [0.0] + sorted(b for b in betas if b > 0)


def _style_beta_axis(ax, ylabel, title):
    """Apply the shared x-axis scale, labels, grid and ticks to one panel."""
    ax.set_xscale('symlog', linthresh=eps)
    ax.set_xlabel('Beta')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, which='both', ls='--', lw=0.5)
    ax.set_xticks(ticks)
    # Plain numbers instead of powers of ten on the tick labels
    ax.get_xaxis().set_major_formatter(plt.ScalarFormatter())


for metric in metrics:
    df_m = df_summary[df_summary['metric'] == metric]
    fig, axes = plt.subplots(1, 2, figsize=(14, 5), constrained_layout=True)

    # ─────────── Subplot A: Average Rank vs Beta ───────────
    ax = axes[0]
    for clf in classifiers:
        df_clf = df_m[df_m['classifier'] == clf]
        ax.plot(
            df_clf['beta'], df_clf['avg_rank'],
            linestyle='-', marker='o', markersize=6, label=clf
        )
    _style_beta_axis(ax, 'Average Rank', f'{metric} — Average Rank vs Beta')
    # The lines carry classifier labels, so show them (previously no legend was drawn)
    ax.legend(fontsize='small')

    # ─────────── Subplot B: Count of Rank 1 vs Beta ───────────
    # Only count1 is plotted; count2 is available in df_summary but not shown.
    ax = axes[1]
    for clf in classifiers:
        df_clf = df_m[df_m['classifier'] == clf]
        ax.plot(
            df_clf['beta'], df_clf['count1'],
            linestyle='-', marker='o', markersize=6, label=f'{clf} Rank1'
        )
    _style_beta_axis(ax, 'Count of Ranks', f'{metric} — Count of Rank1 vs Beta')
    ax.legend(ncol=2, fontsize='small')

    fig.suptitle(f'Performance Summary for {metric}', fontsize=16)
    plt.show()
    # Release the figure so looping over many metrics does not accumulate open figures
    plt.close(fig)