In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import json


with open("../data/final_dataset_output.json", "r") as f:
    raw_data = json.load(f)
    
with open("../data/final_dataset.json", "r") as f:
    orig_data = json.load(f)
    
print(orig_data[0].keys())

toxicity_per_uid = {d["uid"]: d["toxic"] for d in raw_data}
prompts_per_uid = {d["uid"]: d["prompt"] for d in raw_data}
orig_prompts_per_uid = {d["uid"]: d["prompt"] for d in orig_data}

def relabel(data, target_fp):
    """
    Relabel the data based on the target false positive rate
    Methodology: order the benign data in the test set by confidence score, and select an appropriate theshold
    """
    benign_data = data[data["label"] == 0]
    benign_data = benign_data.sort_values("confidence")

    # Determine the index at which to split the data
    split_index = int(len(benign_data) * (1-target_fp))
    
    # Get the threshold confidence score
    threshold = benign_data.iloc[split_index]["confidence"]
    
    # print("Number of benign samples: ", len(benign_data))
    # print("Threshold confidence score: ", threshold)
    
    # Relabel the data based on the threshold
    data["prediction"] = (data["confidence"] > threshold).astype(int)
            
    return data 
    

def extract_model_subsets(data_path, base=False, target_fp=-1):
    """
    Extract two subsets from ML model evaluation data
    
    Parameters:
    -----------
    data_path : str
        Path to the TSV file containing evaluation data
    
    Returns:
    --------
    tuple of pandas DataFrames
    """
    
    #cutoff  training_size   threshold       timestamp       label   pred    confidence      uid
    # Read the TSV file
    df = pd.read_csv(data_path, sep='\t', header=None, 
                     names=['cutoff_time', 'training_size', 'threshold', 'evaluation_time', 'label', 'prediction', 'confidence', 'uid'])
    
    df['toxic'] = df['uid'].map(toxicity_per_uid)
    df['input'] = df['uid'].map(prompts_per_uid)
    df['orig_input'] = df['uid'].map(orig_prompts_per_uid)
    
    if target_fp > 0:
        df = relabel(df, target_fp)
    
    if base:
        # Find the earliest cutoff time
        earliest_cutoff = df['cutoff_time'].min()
        earliest_cutoff_subset = df[df['cutoff_time'] == earliest_cutoff]
         # Add toxicity labels
        
        return earliest_cutoff_subset

    else:
        # Find the cutoff time closest to the median of all cutoff times
        cutoff_times = df['cutoff_time'].unique()
    
        def find_closest_subset(row):
            # Get cutoff times earlier than the evaluation time
            earlier_cutoffs = [c for c in cutoff_times if c < row['evaluation_time']]
            
            # If no earlier cutoffs exist, return None
            if not earlier_cutoffs:
                return None
            
            # Find the closest cutoff time
            closest_cutoff = min(earlier_cutoffs, key=lambda x: abs(x - row['evaluation_time']))
            return closest_cutoff
        
        # Apply the function to each row
        df['closest_cutoff'] = df.apply(find_closest_subset, axis=1)
        
        # Remove rows where no earlier cutoff exists
        closest_cutoff_subset = df[df['closest_cutoff'].notna()].copy()
        #test = closest_cutoff_subset[closest_cutoff_subset['cutoff_time'] != closest_cutoff_subset['closest_cutoff']]
        #print(test['uid'])
        #closest_cutoff_subset = closest_cutoff_subset[closest_cutoff_subset['cutoff_time'] == closest_cutoff_subset['closest_cutoff']]
        #print(len(closest_cutoff_subset))
        
        return closest_cutoff_subset

dict_keys(['uid', 'pre_wildchat', 'date', 'prompt', 'source', 'timestamp', 'label'])


In [2]:
import pandas as pd
import numpy as np

def aggregate_model_performance(dataset, time_window=2624016, ignore_non_toxic_jailbreaks=False):
    """
    Aggregate model performance metrics over consecutive time windows.
    
    Parameters:
    -----------
    dataset : pandas.DataFrame
        DataFrame with columns: 'evaluation_time', 'label', 'prediction'
    time_window : float
        Size of time window for aggregation in seconds
    
    Returns:
    --------
    pandas.DataFrame
        Aggregated performance metrics with columns:
        - window_start: Start time of each window
        - false_positive_rate: Proportion of negative labels predicted as positive
        - false_negative_rate: Proportion of positive labels predicted as negative
    """
    # Sort dataset by evaluation time
    df = dataset.sort_values('evaluation_time')
    
    # Calculate window boundaries
    start_time = df['evaluation_time'].min()
    end_time = df['evaluation_time'].max()
    
    # Prepare results storage
    results = []
    
    # Iterate through consecutive windows
    current_window_start = start_time
    while current_window_start < end_time:
        window_end = current_window_start + time_window
        
        # Filter data within current window
        window_data = df[(df['evaluation_time'] >= current_window_start) & 
                         (df['evaluation_time'] < window_end)]
        
        # Skip empty windows
        if len(window_data) == 0:
            current_window_start = window_end
            continue
        
        # Filter data if needed
        if ignore_non_toxic_jailbreaks:
            window_data = window_data[(window_data['label'] == 0) | (window_data['toxic'] == 1)]
        
        # Calculate false positive and false negative rates
        total_negatives = (window_data['label'] == 0).sum()
        total_positives = (window_data['label'] == 1).sum()
        
        false_positives = ((window_data['label'] == 0) & (window_data['prediction'] == 1)).sum()
        false_negatives = ((window_data['label'] == 1) & (window_data['prediction'] == 0)).sum()
        
        false_positive_rate = false_positives / total_negatives if total_negatives > 0 else 0
        false_negative_rate = false_negatives / total_positives if total_positives > 0 else 0
        
        # Store results
        results.append({
            'window_start': current_window_start,
            'false_positive_rate': false_positive_rate,
            'false_negative_rate': false_negative_rate
        })
        
        # Move to next window
        current_window_start = window_end
    
    return pd.DataFrame(results)

In [3]:
import pandas as pd
import numpy as np

def average_model_performance(dataset, ignore_non_toxic_jailbreaks=False, time_start=None, time_end=None):
    """
    Average model performance
    
    Parameters:
    -----------
    dataset : pandas.DataFrame
        DataFrame with columns: 'evaluation_time', 'label', 'prediction'
    
    Returns:
    --------
    Tuple of floats
    """
    # Sort dataset by evaluation time
    df = dataset.sort_values('evaluation_time')
    if time_start is not None:
        df = df[df['evaluation_time'] >= time_start]
    if time_end is not None:
        df = df[df['evaluation_time'] <= time_end]
    
    # Filter data if needed
    if ignore_non_toxic_jailbreaks:
        df = df[(df['label'] == 0) | (df['toxic'] == 1)]
        
    # Calculate false positive and false negative rates
    total_negatives = (df['label'] == 0).sum()
    total_positives = (df['label'] == 1).sum()

    false_positives = ((df['label'] == 0) & (df['prediction'] == 1)).sum()
    false_negatives = ((df['label'] == 1) & (df['prediction'] == 0)).sum()

    false_positive_rate = false_positives / total_negatives if total_negatives > 0 else 0
    false_negative_rate = false_negatives / total_positives if total_positives > 0 else 0

    #  Return results
    return false_positive_rate, false_negative_rate

In [13]:
# Get base model performance and variance
import os
from collections import defaultdict

files = [f for f in os.listdir("../results/") if 'base_' in f]
print(files)
results = defaultdict(list)
details = defaultdict(list)
preds = defaultdict(list)

target_fp = 0.003
for f in files:
    variant = f.split(".tsv")[0].split("_")[1]
    version = f.split(".tsv")[0].split("_")[-1]
    try:
        data = extract_model_subsets("../results/" + f, base=True, target_fp=target_fp)
    except:
        print(f"Error processing {f}")
        continue
    false_positive_rate, false_negative_rate = average_model_performance(data)
    false_negatives = data[(data['prediction'] == 0) & (data['label'] == 1)]['uid']
    if false_negative_rate > 0.5:
        continue
    details[variant].append(false_negatives)
    results[variant].append(false_negative_rate)
    preds[variant].append({d['uid']: d['prediction'] for d in data.to_dict(orient='records')})
    
# Compute average and standard deviation of each list in results
for key, value in results.items():
    results[key] = [np.mean(value), np.std(value)]
    print(f"FNR for {key} dataset @{round(100*target_fp, 2)}% FPR: {round(100*results[key][0], 2)}% +- {round(100*1.96 * results[key][1], 2)}%")

details_base = details
results_base = results 
preds_base = preds

['base_new_1.tsv', 'base_new_2.tsv', 'base_new_8.tsv', 'base_orig_3.tsv', 'base_alt_2.tsv', 'results_base_1_28_1.tsv', 'base_alt_4.tsv', 'results_base_1_7_1.tsv', 'base_new_3.tsv', 'base_orig_6.tsv', 'base_orig_4.tsv', 'base_alt_6.tsv', 'base_alt_10.tsv', 'base_orig_9.tsv', 'base_alt_5.tsv', 'base_new_7.tsv', 'base_orig_7.tsv', 'base_orig_8.tsv', 'base_orig_2.tsv', 'base_alt_3.tsv', 'base_new_5.tsv', 'base_orig_10.tsv', 'base_orig_5.tsv', 'base_alt_1.tsv', 'base_new_10.tsv', 'base_alt_8.tsv', 'base_new_4.tsv', 'base_alt_7.tsv', 'base_orig_1.tsv', 'base_new_6.tsv', 'base_new_9.tsv', 'base_alt_9.tsv']
1.839232390921097e-05
0.0004858513711951
0.0008780533098615
0.0013298545964062
0.0007588490261696
2.3866587071097456e-06
0.013925121165812
1.0
5.773594239144586e-05
0.0020565113518387
0.3233955204486847
0.0002174432302126
6.612436845898628e-05
0.9834699034690856
0.0003593369910959
0.0088454568758606
0.9997534155845642
0.0598009191453456
0.0128068644553422
0.0001265268074348
0.00014557593385

In [21]:
# Get base model performance and variance
import os
from collections import defaultdict

files = [f for f in os.listdir("../results/") if 'results_base_' in f]
print(files)
results = defaultdict(list)
details = defaultdict(list)
preds = defaultdict(list)

target_fp = 0.0031
for f in files:
    variant = f.split(".tsv")[0]
    try:
        data = extract_model_subsets("../results/" + f, base=True, target_fp=target_fp)
    except:
        print(f"Error processing {f}")
        continue
    false_positive_rate, false_negative_rate = average_model_performance(data)
    print(false_positive_rate, false_negative_rate)
    false_negatives = data[(data['prediction'] == 0) & (data['label'] == 1)]['uid']
    # if false_negative_rate > 0.5:
    #     continue
    details[variant].append(false_negatives)
    results[variant].append(false_negative_rate)
    preds[variant].append({d['uid']: d['prediction'] for d in data.to_dict(orient='records')})
    
# Compute average and standard deviation of each list in results
for key, value in results.items():
    results[key] = [np.mean(value), np.std(value)]
    print(f"FNR for {key} dataset @{round(100*target_fp, 2)}% FPR: {round(100*results[key][0], 2)}% +- {round(100*1.96 * results[key][1], 2)}%")

details_base = details
results_base = results 
preds_base = preds

['results_base_1_28_1.tsv', 'results_base_1_7_1.tsv']
2.1044684217486065e-06
0.003068682698209005 0.03096330275229358
0.9999998807907104
0.0030039361922519162 0.04553829955251382
FNR for results_base_1_28_1 dataset @0.31% FPR: 3.1% +- 0.0%
FNR for results_base_1_7_1 dataset @0.31% FPR: 4.55% +- 0.0%


In [4]:
# Get base model performance and variance
import os
from collections import defaultdict

# wildchat_jb = {}
# non_wildchat_jb = {}
# for d in raw_data:
#     if not d['label']:
#         continue
    
#     if 'wildchat' in d['source']:
#         wildchat_jb[d['uid']] = d['prompt']
#     else:
#         non_wildchat_jb[d['uid']] = d['prompt']

def process(prefix="continuous_real", start_time=None, end_time=None, target_fp=0.01, base=False):
    
    files = [f for f in os.listdir("../results/") if prefix in f]
    results = defaultdict(list)
    details = defaultdict(list)
    preds = defaultdict(list)

    for f in files:
        
        variant = "new" if "new" in f else ("orig" if "orig" in f else "alt")
        version = f.split(".tsv")[0].split("_")[-1]
        data = extract_model_subsets("../results/" + f, base=base, target_fp=target_fp)
        false_positive_rate, false_negative_rate = average_model_performance(data, time_start=start_time, time_end=end_time)
        # if false_negative_rate > 0.5:
        #     print(f"Skipping {f} due to high FNR: {false_negative_rate:.4f}")
        #     continue
        false_negatives = data[(data['prediction'] == 0) & (data['label'] == 1)]['uid']
        jailbreaks = data[(data['label'] == 1)]['uid']
        # print(f"Version {version}: FNR = {false_negative_rate:.4f}, FPR = {false_positive_rate:.4f}")
        

        
        details[variant].append(false_negatives)
        results[variant].append(false_negative_rate)
        preds[variant].append({d['uid']: d['prediction'] for d in data.to_dict(orient='records')})
        
    all_orig_false_negatives = {uid for sublist in details['orig'] for uid in sublist}
    uid_counts = {}

    # Compute average and standard deviation of each list in results
    for key, value in results.items():
        # wildchat_fns = [[uid for uid in sublist if uid in wildchat_jb] for sublist in details[key]]
        # non_wildchat_fns = [[uid for uid in sublist if uid in non_wildchat_jb] for sublist in details[key]]
        fn_count = sum([len(sublist) for sublist in details[key]]) / len(details[key])
        # wildchat_fn_count = sum([len(sublist) for sublist in wildchat_fns]) / len(wildchat_fns)
        # non_wildchat_fn_count = sum([len(sublist) for sublist in non_wildchat_fns]) / len(non_wildchat_fns)
        results[key] = [np.mean(value), np.std(value)]
        print(f"{key} dataset (@{round(100*target_fp, 4)}% FPR). {fn_count} FNs. FNR: {round(100*results[key][0], 4)}% +- {round(100 * results[key][1], 4)}%")
        
        # Count how many times each UID was a false negative
        uid_counts[key] = defaultdict(int)
        for sublist in details[key]:
            for uid in sublist:
                if uid in all_orig_false_negatives:
                    continue
                uid_counts[key][uid] += 1
                
    return preds

# print("Pseudo labels, 1% of data, toxic ratio threhsold at 50%")
# process('continuous_pseudo_0.01_new_0.5_0.5_False')

# print("Pseudo labels, 1% of data, toxic ratio threhsold at 60%")
# process('continuous_pseudo_0.01_new_0.6_0.6_False')

# print("Pseudo labels, 1% of data, toxic ratio threhsold at 50%, keep classification labels for negatives")
# process('continuous_pseudo_0.01_new_0.5_0.0_False')

# print("Pseudo labels, 1% of data, toxic ratio threhsold at 50%, discard uncertain below 50%")
# process('continuous_pseudo_0.01_new_0.5_0.0_True')

# print("Pseudo labels, 100% of data, toxic ratio threhsold at 50%")
# process('continuous_pseudo_1_new_0.5_0.5_False')

# print("Use 100% classification labels")
# process('continuous_real_0_new')

# print("Use 99% classification labels")
# process('continuous_real_0.01_new')

# print("Use 95% classification labels")
# process('continuous_real_0.05_new')

# print("Use 90% classification labels")
# process('continuous_real_0.1_new')

# print("Use 0% classification labels")
# process('continuous_real_1_new')

# print("Monthly retrain, real labels")
# process('results_continuous_real_1_1_28')

# print("Monthly retrain, self training")
# process('results_continuous_self_1_1_28')

# print("Weekly retrain, one week, real labels")
# process('results_continuous_real_1_1_7')

# print("Weekly retrain, one week, self training")
# process('results_continuous_self_1_1_7')

# print("Confidence retrain, 10%")
# process('results_continuous_self_0.1_4_7')

# print("Confidence retrain, 50%")
# process('results_continuous_self_0.5_4_7')

# print("Confidence retrain, 25%")
# process('results_continuous_self_0.5_4_7')

# print("Confidence retrain, 1%")
# process('results_continuous_self_0.01_4_7')

# print("Gap, 3 months")
# process('results_continuous_self_gap_4_7_1_12')


In [13]:
## Measure performance on last month of data
last_timestamp = max([d['timestamp'] for d in raw_data])
last_month = last_timestamp - 60*60*24*7
print(last_month)

print("Weekly, self-train, last month")
process('continuous_real_0_new', start_time=last_month, target_fp=0.01)
process('continuous_real_0_new', start_time=last_month, target_fp=0.0013)


print("Monthly, self-train, last month")
process('results_continuous_self_1_1_28', start_time=last_month, target_fp=0.01)
process('results_continuous_self_1_1_28', start_time=last_month, target_fp=0.0011)


print("Baseline, last month")
process('base_new', start_time=last_month, target_fp=0.01, base=True)
process('base_new', start_time=last_month, target_fp=0.0012, base=True)

## Measure performance on last month of data
last_timestamp = max([d['timestamp'] for d in raw_data])
last_month = last_timestamp - 2624016

print("Weekly, self-train, last month")
process('results_continuous_self_1_1_7_1', start_time=last_month, target_fp=0.01)


print("Monthly, self-train, last month")
process('results_continuous_self_1_1_7_4', start_time=last_month, target_fp=0.01)


print("Baseline, last month")
process('results_base_1_7_1', start_time=last_month, target_fp=0.01, base=True)

1702864782.4977107
Weekly, self-train, last month


new dataset (@1.0% FPR). 5.25 FNs. FNR: 1.0638% +- 0.0%
new dataset (@0.13% FPR). 11.0 FNs. FNR: 1.0638% +- 0.0%
Monthly, self-train, last month
alt dataset (@1.0% FPR). 7.0 FNs. FNR: 1.0638% +- 0.0%
alt dataset (@0.11% FPR). 10.0 FNs. FNR: 1.0638% +- 0.0%
Baseline, last month
new dataset (@1.0% FPR). 70.2 FNs. FNR: 1.9149% +- 1.0423%
new dataset (@0.12% FPR). 147.2 FNs. FNR: 3.8298% +- 4.0982%
Weekly, self-train, last month
alt dataset (@1.0% FPR). 93.0 FNs. FNR: 4.2667% +- 0.0%
Monthly, self-train, last month
alt dataset (@1.0% FPR). 123.0 FNs. FNR: 4.0% +- 0.0%
Baseline, last month
alt dataset (@1.0% FPR). 57.0 FNs. FNR: 2.4% +- 0.0%


defaultdict(list,
            {'alt': [{'70efdf2ec9b086079795c442636b55fb': 0,
               '65b9eea6e1cc6bb9f0cd2a47751a186f': 0,
               'f0935e4cd5920aa6c7c996a5ee53a70f': 0,
               '3644a684f98ea8fe223c713b77189a77': 0,
               '274ad4786c3abca69fa097b85867d9a4': 0,
               '9cfdf10e8fc047a44b08ed031e1f0ed1': 0,
               '6da9003b743b65f4c0ccd295cc484e57': 0,
               'c042f4db68f23406c6cecf84a7ebb0fe': 0,
               '4f6ffe13a5d75b2d6a3923922b3922e5': 0,
               'ca9c267dad0305d1a6308d2a0cf1c39c': 0,
               '013a006f03dbc5392effeb8f18fda755': 0,
               '7fa732b517cbed14a48843d74526c11a': 0,
               '90794e3b050f815354e3e29e977a88ab': 0,
               'e6cb2a3c14431b55aa50c06529eaa21b': 0,
               'c22abfa379f38b5b0411bc11fa9bf92f': 0,
               'fed33392d3a48aa149a87a38b875ba4a': 1,
               '1019c8091693ef5c5f55970346633f92': 0,
               '2290a7385ed77cc5592dc2153229f082': 0,
   

In [12]:
## Measure performance on first month of data
data = extract_model_subsets("../results/continuous_real_0_new_1.tsv", base=True)
min_time = min(data['evaluation_time'])
first_month = min_time + 24*60*60*28
print(first_month)

print("Weekly, self-train, first week")
process('continuous_real_0_new', end_time=first_month, target_fp=0.01)
process('continuous_real_0_new', end_time=first_month, target_fp=0.0013)


print("Monthly, self-train, first week")
process('results_continuous_self_1_1_28', end_time=first_month, target_fp=0.01)
process('results_continuous_self_1_1_28', end_time=first_month, target_fp=0.0011)


print("Baseline, first week")
process('base_new', end_time=first_month, target_fp=0.01, base=True)
process('base_new', end_time=first_month, target_fp=0.0012, base=True)


print("Weekly, self-train, first week")
process('results_continuous_self_1_1_7_1', end_time=first_month, target_fp=0.01)


print("Monthly, self-train, first week")
process('results_continuous_self_1_1_7_4', end_time=first_month, target_fp=0.01)


print("Baseline, first week")
process('results_base_1_7_1', end_time=first_month, target_fp=0.01, base=True)

1682467199.0
Weekly, self-train, first week
new dataset (@1.0% FPR). 5.25 FNs. FNR: 0.3747% +- 0.1775%
new dataset (@0.13% FPR). 11.0 FNs. FNR: 0.6959% +- 0.2333%
Monthly, self-train, first week
alt dataset (@1.0% FPR). 7.0 FNs. FNR: 0.6424% +- 0.0%
alt dataset (@0.11% FPR). 10.0 FNs. FNR: 0.6424% +- 0.0%
Baseline, first week
new dataset (@1.0% FPR). 70.2 FNs. FNR: 0.8994% +- 0.3925%
new dataset (@0.12% FPR). 147.2 FNs. FNR: 1.3704% +- 0.4304%
Weekly, self-train, first week
alt dataset (@1.0% FPR). 93.0 FNs. FNR: 1.1568% +- 0.0%
Monthly, self-train, first week
alt dataset (@1.0% FPR). 123.0 FNs. FNR: 0.3856% +- 0.0%
Baseline, first week
alt dataset (@1.0% FPR). 57.0 FNs. FNR: 0.3856% +- 0.0%


defaultdict(list,
            {'alt': [{'70efdf2ec9b086079795c442636b55fb': 0,
               '65b9eea6e1cc6bb9f0cd2a47751a186f': 0,
               'f0935e4cd5920aa6c7c996a5ee53a70f': 0,
               '3644a684f98ea8fe223c713b77189a77': 0,
               '274ad4786c3abca69fa097b85867d9a4': 0,
               '9cfdf10e8fc047a44b08ed031e1f0ed1': 0,
               '6da9003b743b65f4c0ccd295cc484e57': 0,
               'c042f4db68f23406c6cecf84a7ebb0fe': 0,
               '4f6ffe13a5d75b2d6a3923922b3922e5': 0,
               'ca9c267dad0305d1a6308d2a0cf1c39c': 0,
               '013a006f03dbc5392effeb8f18fda755': 0,
               '7fa732b517cbed14a48843d74526c11a': 0,
               '90794e3b050f815354e3e29e977a88ab': 0,
               'e6cb2a3c14431b55aa50c06529eaa21b': 0,
               'c22abfa379f38b5b0411bc11fa9bf92f': 0,
               'fed33392d3a48aa149a87a38b875ba4a': 1,
               '1019c8091693ef5c5f55970346633f92': 0,
               '2290a7385ed77cc5592dc2153229f082': 0,
   

In [9]:
# Get list of FP UIDS

predictions = process(prefix="results_continuous_self_1_1_28", target_fp=0.01, base=False)
false_negatives = [d['uid'] for d in raw_data if d['uid'] in predictions['alt'][0] and not predictions['alt'][0][d['uid']] and d['label'] == 1]
false_positives = [d['uid'] for d in raw_data if d['uid'] in predictions['alt'][0] and predictions['alt'][0][d['uid']] and d['label'] == 0]
errors = {
    'false_negatives': false_negatives,
    'false_positives': false_positives
}

import json
print(json.dumps(errors, indent=2))




alt dataset (@1.0% FPR). 7.0 FNs. FNR: 0.2007% +- 0.0%
{
  "false_negatives": [
    "e243aa93e6b6e031797f86d0858f5e40",
    "9d4f684ba088d28ad1c2ae7d0aee496a",
    "20d1b1738e2f75c4629e32f1011ab25d",
    "9f4241002d82d931ad8c5cec67f17e9a",
    "6fa49450b98696c6c042357d73674657",
    "9cb5248c748f0956843723f498e9d6cd",
    "585774f49060e85e2e0a7579202f075f"
  ],
  "false_positives": [
    "fe9fc289c3ff0af142b6d3bead98a923",
    "b73ce398c39f506af761d2277d853a92",
    "fe73f687e5bc5280214e0486b273a5f9",
    "c3992e9a68c5ae12bd18488bc579b30d",
    "86b122d4358357d834a87ce618a55de0",
    "847cc55b7032108eee6dd897f3bca8a5",
    "170c944978496731ba71f34c25826a34",
    "da0d1111d2dc5d489242e60ebcbaf988",
    "d93ed5b6db83be78efb0d05ae420158e",
    "31b3b31a1c2f8a370206f111127c0dbd",
    "bdb106a0560c4e46ccc488ef010af787",
    "43baa6762fa81bb43b39c62553b2970d",
    "2f29b6e3abc6ebdefb55456ea6ca5dc8",
    "7c82fab8c8f89124e2ce92984e04fb40",
    "602d1305678a8d5fdb372271e980da6a",
    "6e62a992

In [1]:
def calculate_sums():
    # All datasets from the table
    datasets = [
        "Apache", "BGL", "Hadoop", "HDFS", "HealthApp", "Proxifier", "Thunderbird",
        "HPC", "Linux", "Mac", "OpenSSH", "OpenStack", "Spark", "Zookeeper"
    ]
    
    # Create dictionaries to store the PGS and TS values for each system
    sys_pgs = {}
    sys_ts = {}
    lilac_pgs = {}
    lilac_ts = {}
    drain3_pgs = {}
    drain3_ts = {}
    brain_pgs = {}
    brain_ts = {}
    
    # Values from the table for each dataset and system
    # Apache
    sys_pgs["Apache"] = 1.00
    sys_ts["Apache"] = 1.00
    lilac_pgs["Apache"] = 0.99
    lilac_ts["Apache"] = 0.99
    drain3_pgs["Apache"] = 0.99
    drain3_ts["Apache"] = 0.89
    brain_pgs["Apache"] = 0.99
    brain_ts["Apache"] = 0.88
    
    # BGL
    sys_pgs["BGL"] = 0.99
    sys_ts["BGL"] = 0.98
    lilac_pgs["BGL"] = 0.97
    lilac_ts["BGL"] = 0.96
    drain3_pgs["BGL"] = 0.95
    drain3_ts["BGL"] = 0.96
    brain_pgs["BGL"] = 0.88
    brain_ts["BGL"] = 0.79
    
    # Hadoop
    sys_pgs["Hadoop"] = 0.97
    sys_ts["Hadoop"] = 0.91
    lilac_pgs["Hadoop"] = 0.86
    lilac_ts["Hadoop"] = 0.81
    drain3_pgs["Hadoop"] = 0.88
    drain3_ts["Hadoop"] = 0.71
    brain_pgs["Hadoop"] = 0.35
    brain_ts["Hadoop"] = 0.29
    
    # HDFS
    sys_pgs["HDFS"] = 1.00
    sys_ts["HDFS"] = 0.96
    lilac_pgs["HDFS"] = 0.96
    lilac_ts["HDFS"] = 0.84
    drain3_pgs["HDFS"] = 0.97
    drain3_ts["HDFS"] = 0.93
    brain_pgs["HDFS"] = 0.97
    brain_ts["HDFS"] = 0.84
    
    # HealthApp
    sys_pgs["HealthApp"] = 1.00
    sys_ts["HealthApp"] = 0.94
    lilac_pgs["HealthApp"] = 1.00
    lilac_ts["HealthApp"] = 0.94
    drain3_pgs["HealthApp"] = 0.97
    drain3_ts["HealthApp"] = 0.92
    brain_pgs["HealthApp"] = 0.54
    brain_ts["HealthApp"] = 0.44
    
    # Proxifier
    sys_pgs["Proxifier"] = 1.00
    sys_ts["Proxifier"] = 0.50
    lilac_pgs["Proxifier"] = 0.97
    lilac_ts["Proxifier"] = 0.38
    drain3_pgs["Proxifier"] = 0.52
    drain3_ts["Proxifier"] = 0.31
    brain_pgs["Proxifier"] = 0.99
    brain_ts["Proxifier"] = 0.35
    
    # Thunderbird
    sys_pgs["Thunderbird"] = 0.98
    sys_ts["Thunderbird"] = 0.82
    lilac_pgs["Thunderbird"] = 0.82
    lilac_ts["Thunderbird"] = 0.68
    drain3_pgs["Thunderbird"] = 0.87
    drain3_ts["Thunderbird"] = 0.68
    brain_pgs["Thunderbird"] = 0.79
    brain_ts["Thunderbird"] = 0.69
    
    # HPC
    sys_pgs["HPC"] = 1.00
    sys_ts["HPC"] = 0.99
    lilac_pgs["HPC"] = 1.00
    lilac_ts["HPC"] = 1.00
    drain3_pgs["HPC"] = 0.97
    drain3_ts["HPC"] = 0.96
    brain_pgs["HPC"] = 0.78
    brain_ts["HPC"] = 0.75
    
    # Linux
    sys_pgs["Linux"] = 0.85
    sys_ts["Linux"] = 0.93
    lilac_pgs["Linux"] = 0.83
    lilac_ts["Linux"] = 0.86
    drain3_pgs["Linux"] = 0.87
    drain3_ts["Linux"] = 0.80
    brain_pgs["Linux"] = 0.81
    brain_ts["Linux"] = 0.71
    
    # Mac
    sys_pgs["Mac"] = 0.96
    sys_ts["Mac"] = 0.91
    lilac_pgs["Mac"] = 0.78
    lilac_ts["Mac"] = 0.75
    drain3_pgs["Mac"] = 0.83
    drain3_ts["Mac"] = 0.82
    brain_pgs["Mac"] = 0.95
    brain_ts["Mac"] = 0.89
    
    # OpenSSH
    sys_pgs["OpenSSH"] = 0.97
    sys_ts["OpenSSH"] = 0.97
    lilac_pgs["OpenSSH"] = 0.72
    lilac_ts["OpenSSH"] = 0.68
    drain3_pgs["OpenSSH"] = 0.96
    drain3_ts["OpenSSH"] = 0.92
    brain_pgs["OpenSSH"] = 0.99
    brain_ts["OpenSSH"] = 0.92
    
    # OpenStack
    sys_pgs["OpenStack"] = 0.93
    sys_ts["OpenStack"] = 0.86
    lilac_pgs["OpenStack"] = 0.93
    lilac_ts["OpenStack"] = 0.88
    drain3_pgs["OpenStack"] = 0.66
    drain3_ts["OpenStack"] = 0.67
    brain_pgs["OpenStack"] = 1.00
    brain_ts["OpenStack"] = 0.97
    
    # Spark
    sys_pgs["Spark"] = 1.00
    sys_ts["Spark"] = 0.97
    lilac_pgs["Spark"] = 0.91
    lilac_ts["Spark"] = 0.93
    drain3_pgs["Spark"] = 0.97
    drain3_ts["Spark"] = 0.79
    brain_pgs["Spark"] = 0.97
    brain_ts["Spark"] = 0.91
    
    # Zookeeper
    sys_pgs["Zookeeper"] = 0.99
    sys_ts["Zookeeper"] = 0.96
    lilac_pgs["Zookeeper"] = 0.86
    lilac_ts["Zookeeper"] = 0.83
    drain3_pgs["Zookeeper"] = 0.99
    drain3_ts["Zookeeper"] = 0.98
    brain_pgs["Zookeeper"] = 0.99
    brain_ts["Zookeeper"] = 0.95
    
    # Calculate the sum of PGS and TS for each system
    sys_pgs_sum = sum(sys_pgs.values()) / 14
    sys_ts_sum = sum(sys_ts.values()) / 14
    lilac_pgs_sum = sum(lilac_pgs.values()) / 14
    lilac_ts_sum = sum(lilac_ts.values()) / 14
    drain3_pgs_sum = sum(drain3_pgs.values()) / 14
    drain3_ts_sum = sum(drain3_ts.values()) / 14
    brain_pgs_sum = sum(brain_pgs.values()) / 14
    brain_ts_sum = sum(brain_ts.values()) / 14
    
    # Calculate the sum of PGS and TS for each system
    sys_sum = sys_pgs_sum + sys_ts_sum
    lilac_sum = lilac_pgs_sum + lilac_ts_sum
    drain3_sum = drain3_pgs_sum + drain3_ts_sum
    brain_sum = brain_pgs_sum + brain_ts_sum
    
    # Print the results
    print(f"Sum of PGS for sys: {sys_pgs_sum:.2f}")
    print(f"Sum of TS for sys: {sys_ts_sum:.2f}")
    print(f"Total sum (PGS + TS) for sys: {sys_sum:.2f}")
    print()
    
    print(f"Sum of PGS for Lilac: {lilac_pgs_sum:.2f}")
    print(f"Sum of TS for Lilac: {lilac_ts_sum:.2f}")
    print(f"Total sum (PGS + TS) for Lilac: {lilac_sum:.2f}")
    print()
    
    print(f"Sum of PGS for Drain3: {drain3_pgs_sum:.2f}")
    print(f"Sum of TS for Drain3: {drain3_ts_sum:.2f}")
    print(f"Total sum (PGS + TS) for Drain3: {drain3_sum:.2f}")
    print()
    
    print(f"Sum of PGS for Brain: {brain_pgs_sum:.2f}")
    print(f"Sum of TS for Brain: {brain_ts_sum:.2f}")
    print(f"Total sum (PGS + TS) for Brain: {brain_sum:.2f}")
    
    # Return the sums for each system as a dictionary
    return {
        "sys": {"PGS": sys_pgs_sum, "TS": sys_ts_sum, "Total": sys_sum},
        "Lilac": {"PGS": lilac_pgs_sum, "TS": lilac_ts_sum, "Total": lilac_sum},
        "Drain3": {"PGS": drain3_pgs_sum, "TS": drain3_ts_sum, "Total": drain3_sum},
        "Brain": {"PGS": brain_pgs_sum, "TS": brain_ts_sum, "Total": brain_sum}
    }

if __name__ == "__main__":
    results = calculate_sums()
    
    # You could also add code here to generate a visualization of the results
    # For example, using matplotlib to create a bar chart comparing the systems

Sum of PGS for sys: 0.97
Sum of TS for sys: 0.91
Total sum (PGS + TS) for sys: 1.88

Sum of PGS for Lilac: 0.90
Sum of TS for Lilac: 0.82
Total sum (PGS + TS) for Lilac: 1.72

Sum of PGS for Drain3: 0.89
Sum of TS for Drain3: 0.81
Total sum (PGS + TS) for Drain3: 1.70

Sum of PGS for Brain: 0.86
Sum of TS for Brain: 0.74
Total sum (PGS + TS) for Brain: 1.60
