In [1]:
import pandas as pd
import numpy as np
import json
import os
import re

In [2]:
# For every dataset, map each class-label name to its integer id. The id of a
# label is simply its position in the ordered tuple of label names below.
mappings_from_label_to_ids = {
    dataset_name: {label: idx for idx, label in enumerate(label_names)}
    for dataset_name, label_names in (
        ("imdb", ("negative", "positive")),
        ("ag_news", ("world", "sports", "business", "science")),
        (
            "dbpedia",
            (
                "agent",
                "work",
                "place",
                "species",
                "unitofwork",
                "event",
                "sportsseason",
                "device",
                "topicalconcept",
            ),
        ),
        ("sst2", ("negative", "positive")),
    )
}

In [3]:
# Load the raw evaluation results for both model families. Downstream cells
# iterate over each of these and read the "dataset", "attack_type" and
# "results" keys of every entry.
# The `with` block closes the file on exit, so no explicit close() is needed.
# JSON is read as UTF-8 explicitly so loading does not depend on the platform's
# default locale encoding.
with open("all_results_from_perturbations_gpt_models.json", encoding="utf-8") as f:
    gpt_results = json.load(f)

with open("all_results_from_perturbations_llama_models.json", encoding="utf-8") as f:
    llama_results = json.load(f)

In [4]:
def get_prediction(item):
    r"""Extract the predicted integer class id from a raw model response.

    If the response contains ``\boxed``, the text between the last
    ``\boxed{`` and the following ``}`` is parsed as an integer. Otherwise
    the first run of digits found anywhere in the response is used.

    Args:
        item: Raw model output; expected to be a string.

    Returns:
        The prediction as an ``int`` (in both branches), or ``None`` when no
        integer can be extracted (no digits, a non-numeric boxed payload, or a
        non-string ``item``).
    """
    try:
        if "\\boxed" in item:
            return int(item.split("\\boxed{")[-1].split("}")[0])
        # Convert to int here as well so both branches return the same type.
        return int(re.search(r"\d+", item).group(0))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no digits found (re.search returned None).
        # TypeError: item is not a string (e.g. None).
        # ValueError: the boxed payload is not a valid integer.
        return None

In [5]:
def process_results(predictions_labels, dataset):
    """Compute the accuracy of raw model responses against gold labels.

    Each raw response is parsed with ``get_prediction``. Responses that cannot
    be parsed count as wrong (they stay in the denominator) and are reported
    as "None predictions". Parsed predictions that are not a valid class id
    for ``dataset`` are reported as "Out of distribution predictions".

    Args:
        predictions_labels: Mapping with a ``"predictions"`` sequence of raw
            model outputs and a ``"labels"`` sequence of gold labels
            (anything ``int()`` accepts), aligned by position.
        dataset: Key into ``mappings_from_label_to_ids`` selecting the valid
            class ids.

    Returns:
        Fraction of predictions equal to their label, as a float in [0, 1].
        Returns NaN when there are no predictions, instead of raising
        ``ZeroDivisionError``; NaN entries are skipped by a pandas mean
        rather than silently counted as 0% accuracy.

    Side effects:
        Prints the two diagnostic counters followed by a separator line.
    """
    labels = predictions_labels["labels"]
    # A set gives constant-time membership tests for the validity check.
    good_labels = set(mappings_from_label_to_ids[dataset].values())
    predictions = [get_prediction(item) for item in predictions_labels["predictions"]]

    overlaps = 0
    none_predictions = 0
    out_of_distribution_predictions = 0
    for i, prediction in enumerate(predictions):
        if prediction is None:
            none_predictions += 1
            continue
        try:
            # labels[i] is read inside the try so that a missing or malformed
            # label is reported below rather than aborting the whole run.
            if int(prediction) == int(labels[i]):
                overlaps += 1
            if int(prediction) not in good_labels:
                out_of_distribution_predictions += 1
        except Exception as e:
            # Best effort: report the problem and keep scoring the rest.
            print(e)
            print("Something went wrong with the predictions")

    print("None predictions: ", none_predictions)
    print("Out of distribution predictions: ", out_of_distribution_predictions)
    print("--------------------")

    if not predictions:
        # Accuracy is undefined without any predictions.
        return float("nan")
    return overlaps / len(predictions)

In [6]:
# Accuracy of the GPT models on the "adv" split of every result entry
# (presumably the adversarially perturbed inputs -- confirm against the code
# that produced the JSON), one value per (dataset, attack_type) pair.
datasets = []
attack_types = []
accuracy = []
for result in gpt_results:
    attack_types.append(result["attack_type"])
    datasets.append(result["dataset"])
    accuracy.append(process_results(result["results"]["adv"], result["dataset"]))

results_df_gpt = pd.DataFrame(
    {
        "dataset": datasets,
        "attack_type": attack_types,
        # Store accuracy as a percentage.
        "accuracy": np.round(accuracy, 4) * 100,
    }
)

# Passing the string "mean" instead of the np.mean callable avoids the pandas
# FutureWarning about callables in aggfunc while producing the same result.
print(
    results_df_gpt.pivot_table(
        columns=["dataset", "attack_type"], values="accuracy", aggfunc="mean"
    ).to_latex(
        # have only one decimal point
        float_format="%.1f",
    )
)

None predictions:  1
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  1
--------------------
None predictions:  1
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
-------

  results_df_gpt.pivot_table(


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
dataset & \multicolumn{5}{r}{ag_news} & \multicolumn{5}{r}{dbpedia} & \multicolumn{5}{r}{imdb} & sst2 \\
attack_type & bae & deepwordbug & pwws & textbugger & textfooler & bae & deepwordbug & pwws & textbugger & textfooler & bae & deepwordbug & pwws & textbugger & textfooler & glue \\
\midrule
accuracy & 57.0 & 73.0 & 73.0 & 76.0 & 79.0 & 66.0 & 72.0 & 61.0 & 69.0 & 44.0 & 91.0 & 100.0 & 99.0 & 99.0 & 98.0 & 67.0 \\
\bottomrule
\end{tabular}



In [7]:
# Accuracy of the Llama models on the "adv" split of every result entry
# (presumably the adversarially perturbed inputs -- confirm against the code
# that produced the JSON), one value per (dataset, attack_type) pair.
datasets = []
attack_types = []
accuracy = []
for result in llama_results:
    attack_types.append(result["attack_type"])
    datasets.append(result["dataset"])
    accuracy.append(process_results(result["results"]["adv"], result["dataset"]))

results_df_llama = pd.DataFrame(
    {
        "dataset": datasets,
        "attack_type": attack_types,
        # Store accuracy as a percentage. Round to 4 decimals (as the GPT
        # cells do) rather than 2, so values are not truncated to whole
        # percents before being averaged and printed with one decimal.
        "accuracy": np.round(accuracy, 4) * 100,
    }
)

# Passing the string "mean" instead of the np.mean callable avoids the pandas
# FutureWarning about callables in aggfunc while producing the same result.
print(
    results_df_llama.pivot_table(
        columns=["dataset", "attack_type"], values="accuracy", aggfunc="mean"
    ).to_latex(
        # have only one decimal point
        float_format="%.1f",
    )
)

None predictions:  0
Out of distribution predictions:  1
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  1
--------------------
None predictions:  0
Out of distribution predictions:  2
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  1
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  1
--------------------
None predictions:  0
Out of distribution predictions:  1
--------------------
None predictions:  0
Out of distribution predictions:  2
--------------------
None predictions:  0
Out of distribution predictions:  1
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
-------

  results_df_llama.pivot_table(


### Checking the performance of the models on test sets

In [10]:
# Accuracy of the GPT models on the "test" split of every result entry.
# There is one entry per (dataset, attack_type), so the table below averages
# the per-entry accuracies over attack types for each dataset.
# NOTE: this rebinds `results_df_gpt`, replacing the "adv" table built earlier.
datasets = []
attack_types = []
accuracy = []
for result in gpt_results:
    attack_types.append(result["attack_type"])
    datasets.append(result["dataset"])
    accuracy.append(process_results(result["results"]["test"], result["dataset"]))

results_df_gpt = pd.DataFrame(
    {
        "dataset": datasets,
        "attack_type": attack_types,
        # Store accuracy as a percentage.
        "accuracy": np.round(accuracy, 4) * 100,
    }
)

# Passing the string "mean" instead of the np.mean callable avoids the pandas
# FutureWarning about callables in aggfunc while producing the same result.
print(
    results_df_gpt.pivot_table(
        columns=["dataset"], values="accuracy", aggfunc="mean"
    ).to_latex(
        # have only one decimal point
        float_format="%.1f",
    )
)

None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
-------

  results_df_gpt.pivot_table(


In [11]:
# Accuracy of the Llama models on the "test" split of every result entry.
# There is one entry per (dataset, attack_type), so the table below averages
# the per-entry accuracies over attack types for each dataset.
# NOTE: this rebinds `results_df_llama`, replacing the "adv" table built earlier.
datasets = []
attack_types = []
accuracy = []
for result in llama_results:
    attack_types.append(result["attack_type"])
    datasets.append(result["dataset"])
    accuracy.append(process_results(result["results"]["test"], result["dataset"]))

results_df_llama = pd.DataFrame(
    {
        "dataset": datasets,
        "attack_type": attack_types,
        # Store accuracy as a percentage. Round to 4 decimals (as the GPT
        # cells do) rather than 2, so per-entry values are not truncated to
        # whole percents before being averaged across attack types.
        "accuracy": np.round(accuracy, 4) * 100,
    }
)

# Passing the string "mean" instead of the np.mean callable avoids the pandas
# FutureWarning about callables in aggfunc while producing the same result.
print(
    results_df_llama.pivot_table(
        columns=["dataset"], values="accuracy", aggfunc="mean"
    ).to_latex(
        # have only one decimal point
        float_format="%.1f",
    )
)

None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  1
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  1
--------------------
None predictions:  1
Out of distribution predictions:  1
--------------------
None predictions:  0
Out of distribution predictions:  3
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  1
--------------------
None predictions:  0
Out of distribution predictions:  5
--------------------
None predictions:  0
Out of distribution predictions:  2
--------------------
None predictions:  1
Out of distribution predictions:  0
--------------------
None predictions:  0
Out of distribution predictions:  0
--------------------
None predictions:  1
Out of distribution predictions:  0
-------

  results_df_llama.pivot_table(
