# Main Results

The following section corresponds to *Figure 2* of the main paper.

#### Codegemma results

Use the following snippet to generate the AUROC figure comparing Anubis with DetectGPT on CodeGemma.  
Ensure that the required data computed by `Anubis` and `DetectGPT` for the target model CodeGemma are available and stored in the `corpus-100-eval-gemma` directory and the `baselines/detect-gpt/codegemma2` directory, respectively.

In [None]:
import subprocess
import os
import re
import numpy as np
from tqdm import tqdm
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl


# evaluation.py and the corpus directories below are given as relative paths,
# so they resolve against this working directory.
os.chdir(".")
print("New Working Directory:", os.getcwd())

# (eps1, eps2) grid: eps1 in {10, 20, 30} crossed with eps2 = 100, 90, ..., 50.
eps_pairs = [(lower, upper) for lower in (10, 20, 30) for upper in range(100, 40, -10)]

# Run for 5 different seeds
seeds = [42, 123, 456, 789, 999]
results_anubis = {}  # (eps1, eps2, seed) -> roc_auc parsed from evaluation.py

# Arguments that are identical for every evaluation.py invocation.
corpus_args = [
    "--origstu", "corpus-100-eval-gemma/stability1",
    "--stucorrupt", "corpus-100-eval-gemma/codegemma2",
    "--origllm", "corpus-100-eval-gemma/codegemma1",
]
threshold_args = ["--threshin", "60", "--threshout", "0.08", "--evalmodel", "2"]

for seed in seeds:
    print(f"\nRunning with seed {seed}")
    for e1, e2 in tqdm(eps_pairs):
        cmd = [
            "python3", "evaluation.py",
            *corpus_args,
            "--eps1", str(e1),
            "--eps2", str(e2),
            *threshold_args,
            "--seed", str(seed),
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        # The score is scraped from the child's stdout ("roc_auc: <value>").
        match = re.search(r'roc_auc:\s*([^\s%|]+)', result.stdout)
        if match is None:
            # Nothing to record for this setting: dump both streams and move on.
            print(f"Error for eps1={e1}, eps2={e2}, seed={seed}: No match found")
            print(result.stdout)
            print(result.stderr)
            continue
        roc_auc_val = float(match.group(1))
        results_anubis[(e1, e2, seed)] = roc_auc_val

'''
# Print results
for (e1, e2, seed), val in results_anubis.items():
    print(f"eps1={e1}, eps2={e2}, seed={seed}, roc_auc={val}")
'''

# accumulate_run.py is invoked by relative path, so the sweep runs from inside
# the DetectGPT baseline directory.
old_dir = os.getcwd()
os.chdir("./baselines/detect-gpt")
print("New Working Directory:", os.getcwd())

# Collect all .npy files in the current directory
npy_files = glob.glob("*.npy")

results_baseline = {}  # (eps1, eps2, seed) -> roc_auc parsed from accumulate_run.py

try:
    for seed in seeds:
        print(f"\nRunning with seed {seed}")
        for e1, e2 in tqdm(eps_pairs):
            cmd = [
                "python3", "accumulate_run.py",
                "--model", "codegemma2",
                "--data", "stability1",
                "--eps1", str(e1),
                "--eps2", str(e2),
                "--expeps", str(e2),
                "--seed", str(seed)
            ]
            result = subprocess.run(cmd, capture_output=True, text=True)
            # The score is scraped from the child's stdout ("roc_auc: <value>").
            match = re.search(r'roc_auc:\s*([^\s%|]+)', result.stdout)
            if match:
                roc_auc_val = float(match.group(1))
                results_baseline[(e1, e2, seed)] = roc_auc_val
            else:
                print(f"Error for eps1={e1}, eps2={e2}, seed={seed}: No match found")
                print(result.stdout)
                print(result.stderr)
finally:
    # Always return to the starting directory, even when the sweep is
    # interrupted or raises; otherwise the kernel stays in baselines/detect-gpt
    # and every later relative path (including re-running this cell) breaks.
    os.chdir(old_dir)

'''
# Print results
for (e1, e2, seed), val in results_baseline.items():
    print(f"eps1={e1}, eps2={e2}, seed={seed}, roc_auc={val}")
'''


# Plot results
# Global matplotlib settings for the figure: usetex off, serif font family,
# DejaVu Serif mathtext font set.
mpl.rcParams.update({
    'text.usetex': False,
    "font.family": "serif",
    "mathtext.fontset": "dejavuserif",
})

# Group results by bw
def group_results(results):
    """Bucket values by the first two components of their key.

    ``results`` maps 3-tuples ``(bw, e2, third)`` to values. The third
    component is dropped, and all values sharing the same ``(bw, e2)`` are
    collected into one list, in the iteration order of ``results``.

    Returns a plain dict ``{(bw, e2): [value, ...]}``.
    """
    grouped = {}
    for (bw, e2, _third), value in results.items():
        grouped.setdefault((bw, e2), []).append(value)
    return grouped

# Bucket the per-seed AUROC values by (bw, eps2) for both methods.
bw_groups_anubis = group_results(results_anubis)
bw_groups_baseline = group_results(results_baseline)

fig_width = 9
fig_height = 4
fig, ax = plt.subplots(1, 1, figsize=(fig_width, fig_height))
labels = ["CODEGEMMA"]
sns.set_palette("muted")
anubis_color = sns.color_palette()[0]
baseline_color = 'black'
shade_color = sns.color_palette("pastel")[0]
shade_color_base = sns.color_palette("pastel")[3]

# The x positions are the eps2 values present in the Anubis results.
all_eps2_vals = sorted(set(e2 for _, e2, _ in results_anubis.keys()))

for bw in [10, 30]:
    # Plot Anubis Results: mean over seeds, shaded by +/- one standard deviation.
    avg_auroc_anubis = []
    std_dev_anubis = []
    for e2 in all_eps2_vals:
        if (bw, e2) in bw_groups_anubis:
            roc_values = bw_groups_anubis[(bw, e2)]
            avg_auroc_anubis.append((e2, np.mean(roc_values)))
            std_dev_anubis.append(np.std(roc_values))
    if avg_auroc_anubis:
        eps2_vals, avg_vals = zip(*avg_auroc_anubis)
        std_vals = std_dev_anubis
        ax.plot(eps2_vals, avg_vals, linestyle='solid', label=f"$\\mathsf{{Anubis}}$", color=anubis_color)
        ax.fill_between(eps2_vals, np.array(avg_vals) - np.array(std_vals), np.array(avg_vals) + np.array(std_vals), color=shade_color, alpha=0.3)
        # Label the curve at its midpoint. This lives inside the guard so the
        # arrow is only drawn for a curve that exists; outside it, eps2_vals /
        # avg_vals would be undefined or left over from a previous curve.
        label_offset = (-10, 20) if bw == 10 else (-15, -40)
        ax.annotate(f"$\\mathsf{{LB}}\\%$={bw}", (eps2_vals[len(eps2_vals)//2], avg_vals[len(avg_vals)//2]), textcoords="offset points", xytext=label_offset, ha='center', fontsize=9, arrowprops=dict(arrowstyle="->"))

    # Plot DetectGPT Baseline Results: same aggregation, dashed black line.
    avg_auroc_baseline = []
    std_dev_baseline = []
    for e2 in all_eps2_vals:
        if (bw, e2) in bw_groups_baseline:
            roc_values = bw_groups_baseline[(bw, e2)]
            avg_auroc_baseline.append((e2, np.mean(roc_values)))
            std_dev_baseline.append(np.std(roc_values))
    if avg_auroc_baseline:
        eps2_vals, avg_vals = zip(*avg_auroc_baseline)
        std_vals = std_dev_baseline
        ax.plot(eps2_vals, avg_vals, linestyle='dashed', label=f"$\\mathsf{{DetectGPT}}$", color=baseline_color)
        ax.fill_between(eps2_vals, np.array(avg_vals) - np.array(std_vals), np.array(avg_vals) + np.array(std_vals), color=shade_color_base, alpha=0.3)
        # Label near the right end of the curve (last point for bw=10,
        # second-to-last for bw=30); guarded for the same reason as above.
        if bw == 10:
            anchor_idx, label_offset = len(eps2_vals) - 1, (-35, 10)
        else:
            anchor_idx, label_offset = len(eps2_vals) - 2, (20, -15)
        ax.annotate(f"$\\mathsf{{LB}}\\%$={bw}", (eps2_vals[anchor_idx], avg_vals[anchor_idx]), textcoords="offset points", xytext=label_offset, ha='center', fontsize=9, arrowprops=dict(arrowstyle="->"))

# Backslashes are doubled throughout: "\l" and "\%" are invalid escape
# sequences in a normal string literal and trigger a warning on recent Pythons.
ax.set_xlabel('$\\mathsf{UB} \\% \\ \\longrightarrow$')
ax.set_xlim([50, 100])
ax.set_ylim([0.48, 1.02])
ax.set_ylabel('AUROC $\\longrightarrow$')
ax.set_title('CODEGEMMA')
ax.grid(False)

# Create legend
# Explicit proxy handles: the legend distinguishes the methods by line style
# only, so both entries are drawn in black regardless of the curve colors.
line_style_legend = ax.legend(handles=[
    plt.Line2D([0], [0], linestyle='solid', color='black', label="$\\mathsf{Anubis}$"),
    plt.Line2D([0], [0], linestyle='dashed', color='black', label="$\\mathsf{DetectGPT}$")
], loc="upper left", bbox_to_anchor=(0.02, 0.98))

plt.tight_layout()
# plt.savefig("../plots-for-paper/comparison_codegemma.pdf")
plt.show()

#### Deepseek-coder Results

Use the following snippet to generate the AUROC figure comparing Anubis with DetectGPT on DeepSeek-Coder.  
Ensure that the required data computed by `Anubis` and `DetectGPT` for the target model DeepSeek-Coder are available and stored in the `corpus-100-eval-deepseek` directory and the `baselines/detect-gpt/deepseek2` directory, respectively.

In [None]:
import glob
import os
import re
import subprocess

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tqdm import tqdm

# evaluation.py and the corpus directories below are given as relative paths,
# so they resolve against this working directory.
os.chdir(".")
print("New Working Directory:", os.getcwd())

# (eps1, eps2) grid: eps1 in {10, 20, 30} crossed with eps2 = 100, 90, ..., 50.
eps_pairs = [(lower, upper) for lower in (10, 20, 30) for upper in range(100, 40, -10)]

# Define multiple seeds for reproducibility
seeds = [42, 123, 456, 789, 999]
results_anubis_ds = {}  # (eps1, eps2, seed) -> roc_auc parsed from evaluation.py

# Arguments that are identical for every evaluation.py invocation.
corpus_args = [
    "--origstu", "corpus-100-eval-deepseek/stability1",
    "--stucorrupt", "corpus-100-eval-deepseek/deepseek2",
    "--origllm", "corpus-100-eval-deepseek/deepseek1",
]
threshold_args = ["--threshin", "60", "--threshout", "0.08", "--evalmodel", "1"]

for seed in seeds:
    print(f"\nRunning with seed {seed}")
    # Seeds NumPy's global RNG in this (parent) process; the child process
    # receives the seed separately through --seed.
    np.random.seed(seed)

    for e1, e2 in tqdm(eps_pairs):
        cmd = [
            "python3", "evaluation.py",
            *corpus_args,
            "--eps1", str(e1),
            "--eps2", str(e2),
            *threshold_args,
            "--seed", str(seed),
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        # The score is scraped from the child's stdout ("roc_auc: <value>").
        match = re.search(r'roc_auc:\s*([^\s%|]+)', result.stdout)

        if match is None:
            # Nothing to record for this setting: dump both streams and move on.
            print(f"Error for eps1={e1}, eps2={e2}, seed={seed}: No match found")
            print(result.stdout)
            print(result.stderr)
            continue
        roc_auc_val = float(match.group(1))
        results_anubis_ds[(e1, e2, seed)] = roc_auc_val

'''
# Print results
for (e1, e2, seed), val in results_anubis_ds.items():
    print(f"eps1={e1}, eps2={e2}, seed={seed}, roc_auc={val}")
'''

# accumulate_run.py is invoked by relative path, so the sweep runs from inside
# the DetectGPT baseline directory.
old_dir = os.getcwd()
os.chdir("./baselines/detect-gpt")
print("New Working Directory:", os.getcwd())

# Collect all .npy files in the current directory
npy_files = glob.glob("*.npy")

# (eps1, eps2, seed) -> roc_auc parsed from accumulate_run.py
results_baseline_ds = {}

try:
    for seed in seeds:
        print(f"\nRunning with seed {seed}")
        # Seeds NumPy's global RNG in this (parent) process; the child process
        # receives the seed separately through --seed.
        np.random.seed(seed)

        for e1, e2 in tqdm(eps_pairs):
            cmd = [
                "python3", "accumulate_run.py",
                "--model", "deepseek2",
                "--data", "stability1",
                "--eps1", str(e1),
                "--eps2", str(e2),
                "--expeps", str(e2),
                "--seed", str(seed)
            ]
            result = subprocess.run(cmd, capture_output=True, text=True)
            # The score is scraped from the child's stdout ("roc_auc: <value>").
            match = re.search(r'roc_auc:\s*([^\s%|]+)', result.stdout)

            if match:
                roc_auc_val = float(match.group(1))
                results_baseline_ds[(e1, e2, seed)] = roc_auc_val
            else:
                print(f"Error for eps1={e1}, eps2={e2}, seed={seed}: No match found")
                print(result.stdout)
                print(result.stderr)
finally:
    # Always return to the starting directory, even when the sweep is
    # interrupted or raises; otherwise the kernel stays in baselines/detect-gpt
    # and every later relative path (including re-running this cell) breaks.
    os.chdir(old_dir)

'''
# Print results
for (e1, e2, seed), val in results_baseline_ds.items():
    print(f"eps1={e1}, eps2={e2}, seed={seed}, roc_auc={val}")
'''



# Plot results
# Global matplotlib settings for the figure: usetex off, serif font family,
# DejaVu Serif mathtext font set.
mpl.rcParams.update({
    'text.usetex': False,
    "font.family": "serif",
    "mathtext.fontset": "dejavuserif",
})

# Group results by bw
def group_results(results):
    """Bucket values by the first two components of their key.

    ``results`` maps 3-tuples ``(bw, e2, third)`` to values. The third
    component is dropped, and all values sharing the same ``(bw, e2)`` are
    collected into one list, in the iteration order of ``results``.

    Returns a plain dict ``{(bw, e2): [value, ...]}``.
    """
    grouped = {}
    for (bw, e2, _third), value in results.items():
        grouped.setdefault((bw, e2), []).append(value)
    return grouped

# Bucket the per-seed AUROC values by (bw, eps2) for both methods.
bw_groups_anubis = group_results(results_anubis_ds)
bw_groups_baseline = group_results(results_baseline_ds)

fig_width = 9
fig_height = 4
fig, ax = plt.subplots(1, 1, figsize=(fig_width, fig_height))
labels = ["DEEPSEEK-CODER"]
sns.set_palette("muted")
anubis_color = sns.color_palette()[0]
baseline_color = 'black'
shade_color = sns.color_palette("pastel")[0]
shade_color_base = sns.color_palette("pastel")[3]

# The x positions are the eps2 values present in the Anubis results.
all_eps2_vals = sorted(set(e2 for _, e2, _ in results_anubis_ds.keys()))

for bw in [10, 30]:
    # Plot Anubis Results: mean over seeds, shaded by +/- one standard deviation.
    avg_auroc_anubis = []
    std_dev_anubis = []
    for e2 in all_eps2_vals:
        if (bw, e2) in bw_groups_anubis:
            roc_values = bw_groups_anubis[(bw, e2)]
            avg_auroc_anubis.append((e2, np.mean(roc_values)))
            std_dev_anubis.append(np.std(roc_values))
    if avg_auroc_anubis:
        eps2_vals, avg_vals = zip(*avg_auroc_anubis)
        std_vals = std_dev_anubis
        ax.plot(eps2_vals, avg_vals, linestyle='solid', label=f"$\\mathsf{{Anubis}}$", color=anubis_color)
        ax.fill_between(eps2_vals, np.array(avg_vals) - np.array(std_vals), np.array(avg_vals) + np.array(std_vals), color=shade_color, alpha=0.3)
        # Label the curve at its midpoint. This lives inside the guard so the
        # arrow is only drawn for a curve that exists; outside it, eps2_vals /
        # avg_vals would be undefined or left over from a previous curve.
        label_offset = (-10, 20) if bw == 10 else (-15, -40)
        ax.annotate(f"$\\mathsf{{LB}}\\%$={bw}", (eps2_vals[len(eps2_vals)//2], avg_vals[len(avg_vals)//2]), textcoords="offset points", xytext=label_offset, ha='center', fontsize=9, arrowprops=dict(arrowstyle="->"))

    # Plot DetectGPT Baseline Results: same aggregation, dashed black line.
    avg_auroc_baseline = []
    std_dev_baseline = []
    for e2 in all_eps2_vals:
        if (bw, e2) in bw_groups_baseline:
            roc_values = bw_groups_baseline[(bw, e2)]
            avg_auroc_baseline.append((e2, np.mean(roc_values)))
            std_dev_baseline.append(np.std(roc_values))
    if avg_auroc_baseline:
        eps2_vals, avg_vals = zip(*avg_auroc_baseline)
        std_vals = std_dev_baseline
        ax.plot(eps2_vals, avg_vals, linestyle='dashed', label=f"$\\mathsf{{DetectGPT}}$", color=baseline_color)
        ax.fill_between(eps2_vals, np.array(avg_vals) - np.array(std_vals), np.array(avg_vals) + np.array(std_vals), color=shade_color_base, alpha=0.3)
        # Label near the right end of the curve (last point for bw=10,
        # second-to-last for bw=30); guarded for the same reason as above.
        if bw == 10:
            anchor_idx, label_offset = len(eps2_vals) - 1, (-35, 10)
        else:
            anchor_idx, label_offset = len(eps2_vals) - 2, (20, -15)
        ax.annotate(f"$\\mathsf{{LB}}\\%$={bw}", (eps2_vals[anchor_idx], avg_vals[anchor_idx]), textcoords="offset points", xytext=label_offset, ha='center', fontsize=9, arrowprops=dict(arrowstyle="->"))

# Backslashes are doubled throughout: "\l" and "\%" are invalid escape
# sequences in a normal string literal and trigger a warning on recent Pythons.
ax.set_xlabel('$\\mathsf{UB} \\% \\ \\longrightarrow$')
ax.set_xlim([50, 100])
ax.set_ylim([0.48, 1.02])
ax.set_ylabel('AUROC $\\longrightarrow$')
ax.set_title('DEEPSEEK-CODER')
ax.grid(False)

# Create legend
# Explicit proxy handles: the legend distinguishes the methods by line style
# only, so both entries are drawn in black regardless of the curve colors.
line_style_legend = ax.legend(handles=[
    plt.Line2D([0], [0], linestyle='solid', color='black', label="$\\mathsf{Anubis}$"),
    plt.Line2D([0], [0], linestyle='dashed', color='black', label="$\\mathsf{DetectGPT}$")
], loc="upper left", bbox_to_anchor=(0.02, 0.98))

plt.tight_layout()
# NOTE(review): the original commented path was comparison_codegemma.pdf, which
# would overwrite the CodeGemma figure; confirm the intended file name.
# plt.savefig("../plots-for-paper/comparison_deepseek.pdf")
plt.show()

## Sample size Experiment

The following section corresponds to *Figure 3* of the main paper.

Use the following snippets to generate the AUROC heatmap to compare the dependence among UB%, sample size and AUROC.

Ensure that the probability values computed by the target model CodeGemma (resp. Deepseek-coder) are available and stored in the `corpus-100-eval-gemma` (resp. `corpus-100-eval-deepseek`) directory.

In [None]:
'''
Codegemma
'''
import subprocess
import os
import re
import numpy as np
from tqdm import tqdm

# evaluation.py and the corpus directories below are given as relative paths,
# so they resolve against this working directory.
os.chdir(".")
print("New Working Directory:", os.getcwd())

eps_pairs = [(0, 100), (0,90), (0,80), (0,70)]

# Run for 7 different number of samples
nsamps = [500, 750, 1000, 1250, 1500, 1750, 2000]
nsamp_anubis_ge = {}  # (eps1, eps2, nsamp) -> roc_auc parsed from evaluation.py

for nsamp in tqdm(nsamps):
    # print(f"\nRunning with nsamp {nsamp}")
    for e1, e2 in eps_pairs:
        cmd = [
            "python3", "evaluation.py",
            "--origstu", "corpus-100-eval-gemma/stability1",
            "--stucorrupt", "corpus-100-eval-gemma/codegemma2",
            "--origllm", "corpus-100-eval-gemma/codegemma1",
            "--eps1", str(e1),
            "--eps2", str(e2),
            "--threshin", "60",
            "--threshout", "0.08",
            "--evalmodel", "2",
            "--samps", str(nsamp),
            # --sampthresh is always one fifth of the sample count.
            "--sampthresh", str(nsamp//5)
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        # The score is scraped from the child's stdout ("roc_auc: <value>").
        match = re.search(r'roc_auc:\s*([^\s%|]+)', result.stdout)
        if match:
            roc_auc_val = float(match.group(1))
            nsamp_anubis_ge[(e1, e2, nsamp)] = roc_auc_val
        else:
            print(f"Error for eps1={e1}, eps2={e2}, nsamp={nsamp}: No match found")
            print(result.stdout)
            print(result.stderr)

# Optional (disabled): print and persist the results. Kept as comments rather
# than a trailing string literal, which Jupyter would echo as the cell output.
# The snippet refers to nsamp_anubis_ge, the dict this cell actually fills.
#
# for (e1, e2, nsamp), val in nsamp_anubis_ge.items():
#     print(f"eps1={e1}, eps2={e2}, nsamp={nsamp}, roc_auc={val}")
#
# np.save("../results-for-paper/nsamp_anubis.npy", nsamp_anubis_ge)
# print("Results saved to ../results-for-paper/nsamp_anubis.npy")



In [None]:
'''
DeepSeek-Coder
'''

import subprocess
import re
import os
import numpy as np
from tqdm import tqdm

# evaluation.py and the corpus directories below are given as relative paths,
# so they resolve against this working directory.
os.chdir(".")
print("New Working Directory:", os.getcwd())

eps_pairs = [(0, 100), (0,90), (0,80), (0,70)]

# Run for 7 different number of samples
nsamps = [500, 750, 1000, 1250, 1500, 1750, 2000]
nsamp_anubis_ds = {}  # (eps1, eps2, nsamp) -> roc_auc parsed from evaluation.py

for nsamp in tqdm(nsamps):
    # print(f"\nRunning with nsamp {nsamp}")
    for e1, e2 in eps_pairs:
        cmd = [
            "python3", "evaluation.py",
            "--origstu", "corpus-100-eval-deepseek/stability1",
            "--stucorrupt", "corpus-100-eval-deepseek/deepseek2",
            "--origllm", "corpus-100-eval-deepseek/deepseek1",
            "--eps1", str(e1),
            "--eps2", str(e2),
            "--threshin", "60",
            "--threshout", "0.08",
            "--evalmodel", "1",
            "--samps", str(nsamp),
            # --sampthresh is always one fifth of the sample count.
            "--sampthresh", str(nsamp//5)
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        # The score is scraped from the child's stdout ("roc_auc: <value>").
        match = re.search(r'roc_auc:\s*([^\s%|]+)', result.stdout)

        if match:
            roc_auc_val = float(match.group(1))
            nsamp_anubis_ds[(e1, e2, nsamp)] = roc_auc_val
        else:
            print(f"Error for eps1={e1}, eps2={e2}, nsamp={nsamp}: No match found")
            print(result.stdout)
            print(result.stderr)

# Optional (disabled): print and persist the results. Kept as comments rather
# than a trailing string literal, which Jupyter would echo as the cell output.
#
# for (e1, e2, nsamp), val in nsamp_anubis_ds.items():
#     print(f"eps1={e1}, eps2={e2}, nsamp={nsamp}, roc_auc={val}")
#
# np.save("../results-for-paper/nsamp_anubis_ds.npy", nsamp_anubis_ds)
# print("Results saved to ../results-for-paper/nsamp_anubis_ds.npy")

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib as mpl


# Global matplotlib settings for the figure: usetex off, serif font family,
# DejaVu Serif mathtext font set.
mpl.rcParams.update({
    'text.usetex': False,
    "font.family": "serif",
    "mathtext.fontset": "dejavuserif",
})
fontsize = 22


# Prepare DataFrame
# One row per DeepSeek-Coder result; the CodeGemma score for the same
# (eps1, eps2, nsamp) key is attached, or NaN when that key is missing.
data = [
    {
        'eps2': e2,
        'nsamp': nsamp,
        'roc_auc_ds': roc_auc_ds,
        'roc_auc_ge': nsamp_anubis_ge.get((e1, e2, nsamp), np.nan),
    }
    for (e1, e2, nsamp), roc_auc_ds in nsamp_anubis_ds.items()
]

df = pd.DataFrame(data)

# Pivot to nsamp x eps2, then transpose so eps2 runs along the rows.
pivot_ds = df.pivot_table(index='nsamp', columns='eps2', values='roc_auc_ds').T
pivot_ge = df.pivot_table(index='nsamp', columns='eps2', values='roc_auc_ge').T

# Combine for a common color scale
ds_scores = df['roc_auc_ds']
ge_scores = df['roc_auc_ge']
vmin = min(ds_scores.min(), ge_scores.min())
vmax = max(ds_scores.max(), ge_scores.max())

# Plot heatmaps side by side with shared colorbar
fig, axs = plt.subplots(1, 2, figsize=(18, 8), gridspec_kw={'width_ratios': [1, 1], 'wspace': 0.2})
cbar_ax = fig.add_axes([0.92, 0.2, 0.02, 0.6])  # Position for shared colorbar

# Left panel: CodeGemma, which also draws the shared colorbar.
# Right panel: DeepSeek-Coder, without a colorbar of its own.
panels = (
    (axs[0], pivot_ge, 'CODEGEMMA', {'cbar_ax': cbar_ax, 'cbar': True}),
    (axs[1], pivot_ds, 'DEEPSEEK-CODER', {'cbar': False}),
)
for panel_ax, table, title, cbar_kwargs in panels:
    sns.heatmap(table, annot=True, cmap='Blues', fmt='.2f', vmin=vmin, vmax=vmax, ax=panel_ax,
                annot_kws={"fontsize": fontsize}, **cbar_kwargs)
    panel_ax.set_title(title, fontsize=fontsize)
    panel_ax.set_ylabel('$\\mathsf{UB} \\% \\ \\longrightarrow$', fontsize=fontsize)
    panel_ax.set_xlabel('# Samples $\\longrightarrow$', fontsize=fontsize)
    panel_ax.tick_params(axis='both', which='major', labelsize=int(fontsize * 0.65))


# Adjust layout
plt.tight_layout(rect=[0, 0, 0.9, 1])  # Leave space for the colorbar
plt.show()

## Bucket Threshold Experiment

The following section corresponds to *Figure 4* of the main paper.

Use the following snippets to generate the AUROC for *different bucket thresholds*.

Ensure that the probability values computed by the target model CodeGemma (resp. Deepseek-coder) are available and stored in the `corpus-100-eval-gemma` (resp. `corpus-100-eval-deepseek`) directory.

In [None]:
import subprocess
import os
import re
import numpy as np
from tqdm import tqdm

# evaluation.py and the corpus directories below are given as relative paths,
# so they resolve against this working directory.
os.chdir(".")
print("New Working Directory:", os.getcwd())

eps_pairs = [(0, 100), (0, 90), (0, 80), (0, 70), (10, 100), (10, 90), (10, 80), (10, 70), (20, 100), (20, 90), (20, 80), (20, 70)]
bthresh_values = [0.01, 0.05, 0.1, 0.15, 0.2]  # Different values of bthresh


def _run_bthresh_sweep(out, origstu, stucorrupt, origllm, evalmodel):
    """Run evaluation.py for every (eps1, eps2) pair and bucket threshold.

    For each combination of ``eps_pairs`` and ``bthresh_values`` the script is
    invoked as a subprocess and the value after ``roc_auc:`` in its stdout is
    stored in ``out[(eps1, eps2, bthresh)]`` as a float. ``out`` is filled in
    place so partial results survive an interrupted run. When no ``roc_auc:``
    value is found, the child's stdout and stderr are printed instead.

    ``origstu``, ``stucorrupt``, ``origllm`` and ``evalmodel`` are passed
    verbatim to the command-line flags of the same name.
    """
    for e1, e2 in tqdm(eps_pairs):
        for bthresh in bthresh_values:
            cmd = [
                "python3", "evaluation.py",
                "--origstu", origstu,
                "--stucorrupt", stucorrupt,
                "--origllm", origllm,
                "--eps1", str(e1),
                "--eps2", str(e2),
                "--threshin", "60",
                "--threshout", "0.08",
                "--thresh", str(bthresh),
                "--evalmodel", evalmodel,
            ]
            result = subprocess.run(cmd, capture_output=True, text=True)
            match = re.search(r'roc_auc:\s*([^\s%|]+)', result.stdout)
            if match:
                out[(e1, e2, bthresh)] = float(match.group(1))
            else:
                print(f"Error for eps1={e1}, eps2={e2}, bthresh={bthresh}: No match found")
                print(result.stdout)
                print(result.stderr)


# CodeGemma sweep.
bthresh_anubis = {}
_run_bthresh_sweep(
    bthresh_anubis,
    origstu="corpus-100-eval-gemma/stability1",
    stucorrupt="corpus-100-eval-gemma/codegemma2",
    origllm="corpus-100-eval-gemma/codegemma1",
    evalmodel="2",
)

# Optional (disabled): print and persist the CodeGemma results.
#
# for (e1, e2, bthresh), val in bthresh_anubis.items():
#     print(f"eps1={e1}, eps2={e2}, bthresh={bthresh}, roc_auc={val}")
# np.save("../results-for-paper/bthresh_anubis.npy", bthresh_anubis)
# print("Results saved to ../results-for-paper/bthresh_anubis.npy")

# DeepSeek-Coder sweep.
bthresh_anubis_ds = {}
_run_bthresh_sweep(
    bthresh_anubis_ds,
    origstu="corpus-100-eval-deepseek/stability1",
    stucorrupt="corpus-100-eval-deepseek/deepseek2",
    origllm="corpus-100-eval-deepseek/deepseek1",
    evalmodel="1",
)

# Optional (disabled): print and persist the DeepSeek-Coder results. Kept as
# comments rather than a trailing string literal, which Jupyter would echo as
# the cell output.
#
# for (e1, e2, bthresh), val in bthresh_anubis_ds.items():
#     print(f"eps1={e1}, eps2={e2}, bthresh={bthresh}, roc_auc={val}")
#
# np.save("../results-for-paper/bthresh_anubis_ds.npy", bthresh_anubis_ds)
# print("Results saved to ../results-for-paper/bthresh_anubis_ds.npy")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# This cell's own imports do not include matplotlib itself, yet rcParams is set
# through `mpl` below; import it here so the cell does not depend on an earlier
# cell having been run.
import matplotlib as mpl

mpl.rcParams['text.usetex'] = False
plt.rcParams["font.family"] = "serif"
plt.rcParams["mathtext.fontset"] = "dejavuserif"
fontsize = 22

'''
# Load data
bthresh_anubis = np.load("../results-for-paper/bthresh_anubis.npy", allow_pickle=True).item()
bthresh_anubis_ds = np.load("../results-for-paper/bthresh_anubis_ds.npy", allow_pickle=True).item()'''

# Track instance numbers for unique (e1, e2) pairs
# Each distinct (e1, e2) pair gets the next integer in first-seen order; that
# integer is the point's x position, shared by both models.
instance_map = {}
data = []

# Process Anubis data for CodeGemma first, then DeepSeek-Coder.
for source_name, source_results in (('CODEGEMMA', bthresh_anubis), ('DEEPSEEK-CODER', bthresh_anubis_ds)):
    for (e1, e2, bthresh), roc_auc in source_results.items():
        if (e1, e2) not in instance_map:
            instance_map[(e1, e2)] = len(instance_map)
        data.append({'inst': instance_map[(e1, e2)], 'bthresh': bthresh, 'roc_auc': roc_auc, 'source': source_name})

df = pd.DataFrame(data)

# Plot scatter plot
plt.figure(figsize=(max(18, len(instance_map) * 1.2), 7))

# Jittering to spread points for visibility
# NOTE: the jitter is drawn from NumPy's global RNG, so the horizontal offsets
# depend on whatever state that RNG is in when the cell runs.
jitter_strength = 0.11
df['inst_jittered'] = df['inst'] + np.random.uniform(-jitter_strength, jitter_strength, len(df))

# Plot points 
point_size = 350
for source in df['source'].unique():
    subset = df[df['source'] == source]
    plt.scatter(subset['inst_jittered'], subset['roc_auc'], s=point_size, label=f"{source}", alpha=0.6)

# Use (e1, e2) as x-axis labels
plt.xticks(list(instance_map.values()), [f"({e1}, {e2})" for (e1, e2) in instance_map.keys()], rotation=0, fontsize=12)

# "\\%" (not "\%"): "\%" is an invalid escape sequence in a normal string
# literal and triggers a warning on recent Pythons; the rendered label is unchanged.
plt.xlabel("$\\mathsf{LB}\\%, \\mathsf{UB}\\%$", fontsize=fontsize)
plt.ylabel("AUROC", fontsize=fontsize)
plt.legend(loc="best", fontsize=fontsize)
plt.tick_params(axis='both', which='major', labelsize=int(fontsize * 0.65))
plt.grid(False)
plt.tight_layout()

# Show Plot
plt.show()
