In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import clear_output, display
 
from analysis import get_model_list
from post_process_save import get_cls_proper_model_list, get_fully_proper_model_list

sns.set_theme(style="whitegrid")
pd.set_option('display.max_colwidth', 500)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hand-picked models reported in the analysis, keyed by model-size group.
# These names are matched against the `model` column of the result frames.
pruned_model_list = {
    "small": ["mistralai--Mistral-7B-v0.1", "Open-Orca--Mistral-7B-OpenOrca", "HuggingFaceH4--zephyr-7b-alpha"],
    "large": ["meta-llama--Llama-2-13b-chat-hf", "meta-llama--Llama-2-70b-chat-hf", "garage-bAInd--Platypus2-70B-instruct"],
    "openai": ["text-davinci-003", "gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"],
}

# Flat list of every selected model, in group order (small, large, openai).
pruned_models = [model for models in pruned_model_list.values() for model in models]

In [86]:
# Load the curated statements dataset (first CSV column becomes the index)
# and show how many statements it contains.
data = pd.read_csv("../../../curated_datasets/version_3/data_v3.csv", index_col=0)
len(data)

885

In [87]:
# Relative frequency of each ground-truth label over all statements.
data["ground_truth"].value_counts(normalize=True) 
# Unknown ~ 13.9%; known (NO + YES + Yes in Fiction) ~ 86.1%

NO                0.650847
YES               0.168362
Unknown           0.138983
Yes in Fiction    0.041808
Name: ground_truth, dtype: float64

# What categories create confusion in the model? 
Here, confusion is defined as:
- incorrect answers for statements that have known ground truth (per RT, per PT, per Category)
- inconsistent responses for the same statement with different prompts. (+ sankey diagram for qualitative analyses)
    - P0 vs P1-3 
    - voting across prompts (2, 3, 4 votes)
    - vote analysis across categories
- Can the model identify the difference between P3 and P4?
    - quantify flipping (yes->no, C and D don't flip)
    - effect of categories


## Incorrect answers for statements that have known ground truth (per RT, per PT, per Category)


In [3]:
def revise_ground_truth(df, prompt_type):
    """Restrict ``df`` to statements whose ground truth can be scored for a prompt.

    Rows labelled "Unknown" are always dropped. For prompts "P1" and "P2" the
    label "Yes in Fiction" is rewritten to "NO"; for every other prompt those
    rows are dropped as well.

    Args:
        df: DataFrame with a ``ground_truth`` column.
        prompt_type: Prompt identifier such as "P0" ... "P4".

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    known = df[df["ground_truth"] != "Unknown"]

    if prompt_type in ["P1", "P2"]:
        # Open question kept from the original author: P1/P2 responses sometimes
        # say "Yes in Fiction" themselves; this is currently counted as the LLM's fault.
        # assign() builds a new frame, so nothing is written into a slice of `df`.
        return known.assign(
            ground_truth=known["ground_truth"].replace({"Yes in Fiction": "NO"})
        )

    return known[known["ground_truth"] != "Yes in Fiction"]

def revise_response(df):
    """Normalise option letters and keep only yes/no responses.

    "A" is mapped to "YES" and "B" to "NO"; rows whose response is anything
    other than yes/no (case-insensitive) are removed, so that "Neither", "C",
    "D" and "Bad Output" are not counted as ground-truth conflicts.

    Args:
        df: DataFrame with a ``new_response`` column.

    Returns:
        A new, filtered DataFrame; the caller's frame is not modified.
    """
    normalized = df["new_response"].replace({"A": "YES", "B": "NO"})
    acceptable = ["yes", "no"]
    keep = normalized.str.lower().isin(acceptable)
    return df.assign(new_response=normalized)[keep]


def flip_p4_response(df):
    """Lower-case ``new_response`` and swap "yes" with "no".

    Args:
        df: DataFrame with a string ``new_response`` column.

    Returns:
        A new DataFrame with the flipped responses; the input is not modified.
    """
    flipped = df["new_response"].str.lower().replace({"yes": "no", "no": "yes"})
    return df.assign(new_response=flipped)

def get_response_column(cols):
    """Return the name of the column that holds the model response.

    Candidates are checked in priority order: ``new_response``, then
    ``response_trimmed`` (non-MCQ responses), then ``text_response``
    (OpenAI models).

    Args:
        cols: Collection of column names (anything supporting ``in``).

    Returns:
        The first candidate found in ``cols``.

    Raises:
        KeyError: If none of the candidates is present. Previously a message
            was printed and ``None`` returned, which made the caller fail later
            with an uninformative ``KeyError: None``.
    """
    for candidate in ("new_response", "response_trimmed", "text_response"):
        if candidate in cols:
            return candidate

    raise KeyError(f"Error! Can't find response columns! Available columns: {list(cols)}")

In [89]:
## Ground Truth conflict

def get_conflict_df():
    """Count ground-truth conflicts per category for every model, response type and prompt.

    For each model-size group the per-prompt classification CSVs of every model
    returned by ``get_cls_proper_model_list`` are read, restricted to statements
    with a scorable ground truth (``revise_ground_truth``) and summarised twice:

    * ``bo_or_count_df``: per-category counts of "Bad Output" (``bo_count``),
      other responses c / d / neither (``or_count``), yes/no style responses
      (``yes_no_count``) and all responses (``new_response``), taken before the
      non yes/no answers are dropped. Rows are labelled by the count name.
    * ``global_df``: per-category number of yes/no answers that disagree with
      the ground truth (row label ``conflict``), plus file-level totals.
      P4 answers are flipped (yes <-> no) before the comparison.

    Returns:
        tuple: ``(global_df, bo_or_count_df)``. Both are empty DataFrames when
        no response file was processed.
    """
    conflict_frames = []
    count_frames = []
    prompt_types = [f"P{i}" for i in range(5)]
    other_labels = ["c", "d", "neither"]

    for model_size in ["small", "large", "openai"]:
        if model_size == "openai":
            directory = f"model_responses/{model_size}"
            response_types = ["2_options", "3_options", "4_options"]
            include_probs = False
        else:
            directory = f"model_responses/{model_size}_model_runs/processed_model_responses_cls"
            response_types = ["2_options", "3_options", "4_options", "4_options/option_probs"]
            include_probs = True

        model_list = get_model_list(directory)
        proper_model_list = get_cls_proper_model_list(directory, model_list, include_probs)
        print(model_size, len(proper_model_list))

        for model in proper_model_list:
            for response_type in response_types:
                for prompt_type in prompt_types:
                    filename = f"{directory}/{model}/{response_type}/classification_response_{prompt_type}.csv"
                    df = pd.read_csv(filename, index_col=0)
                    df["new_response"] = df[get_response_column(df.columns)]

                    df = revise_ground_truth(df, prompt_type)
                    original_size = len(df)

                    # Classify every response once; the masks feed both the
                    # file-level totals and the per-category counts.
                    lowered = df["new_response"].str.lower()
                    is_other = lowered.isin(other_labels)
                    is_bad_output = df["new_response"] == "Bad Output"

                    # Share of C, D, Neither and Bad Output gives a sense of the
                    # model's reliability: a good model has few of these AND few
                    # ground-truth conflicts.
                    other_response = int(is_other.sum())
                    bad_output = int(is_bad_output.sum())

                    # Per-category Bad Output / Other Response / yes-no counts.
                    df = df.assign(
                        bo_count=is_bad_output,
                        or_count=is_other,
                        yes_no_count=lowered.isin(["a", "b", "yes", "no"]),
                    )
                    bo_or_df = df.groupby("tags").agg(
                        {"bo_count": "sum", "or_count": "sum", "yes_no_count": "sum", "new_response": "count"}
                    ).T
                    bo_or_df["prompt_type"] = prompt_type
                    bo_or_df["response_type"] = response_type
                    bo_or_df["model"] = model
                    bo_or_df["model_size"] = model_size
                    count_frames.append(bo_or_df)

                    df = revise_response(df)  # removes Bad Output and Other Responses
                    if prompt_type == "P4":
                        df = flip_p4_response(df)

                    df = df.assign(conflict=lambda r: r["ground_truth"].str.lower() != r["new_response"].str.lower())
                    category_conflicts = df.groupby("tags").agg({"conflict": "sum"}).T

                    category_conflicts["total_response"] = original_size
                    category_conflicts["total_conflict"] = sum(df["conflict"])
                    category_conflicts["total_yes_no"] = len(df)
                    category_conflicts["other_response"] = other_response
                    category_conflicts["bad_output"] = bad_output
                    category_conflicts["prompt_type"] = prompt_type
                    category_conflicts["response_type"] = response_type
                    category_conflicts["model"] = model
                    category_conflicts["model_size"] = model_size

                    # Every scorable row must be exactly one of: other, bad output, yes/no.
                    assert other_response + bad_output + len(df) == original_size
                    conflict_frames.append(category_conflicts)

    # Concatenate once instead of growing a DataFrame inside the loops.
    global_df = pd.concat(conflict_frames) if conflict_frames else pd.DataFrame()
    bo_or_count_df = pd.concat(count_frames) if count_frames else pd.DataFrame()

    return global_df, bo_or_count_df

In [90]:
# Build both summary tables from the raw model responses.
global_df, bo_or_count_df = get_conflict_df()
# Every row of global_df carries the label "conflict"; replace it with a plain 0..n-1 index.
# NOTE(review): with drop=True the old index is discarded, so `names` has nothing to rename — confirm it can be dropped.
global_df = global_df.reset_index(drop=True, names=["index"]) # this is for when we only have conflict counts

small 26
large 7
openai 4


In [91]:
# Cache the conflict table so the analysis below can start from the CSV.
global_df.to_csv("analysis_files/ground_truth_conflict.csv", index=False)

In [92]:
# One row per (model, response type, prompt type) file that was processed.
len(global_df)

720

#### Analysis

In [105]:
# Reload the cached conflict table (written by the to_csv cell above).
global_df = pd.read_csv("analysis_files/ground_truth_conflict.csv")

In [106]:
# Keep only the hand-picked models. .copy() makes pruned_df an independent frame so
# the assignment below does not write into a slice of global_df (SettingWithCopyWarning).
pruned_df = global_df[global_df["model"].isin(pruned_models)].copy()
# Drop the "<organisation>--" prefix from the model names.
pruned_df["model"] = pruned_df["model"].apply(lambda x: x.split("--")[-1])

In [None]:
# Sum the raw counts per model first, then derive every percentage from the
# summed counts (so files are weighted by their number of responses).
conflict_by_model = (
    pruned_df
    .groupby(["model_size", "model"])
    .agg({
        "total_conflict": "sum",
        "total_yes_no": "sum",
        "other_response": "sum",
        "bad_output": "sum",
        "total_response": "sum",
    })
    .assign(
        # conflicts are measured among yes/no answers only
        conflict_percent=lambda t: t["total_conflict"] * 100 / t["total_yes_no"],
        accuracy=lambda t: 100 - t["conflict_percent"],
        yes_no_response_percent=lambda t: t["total_yes_no"] * 100 / t["total_response"],
        # for those that are supposed to be yes/no
        other_response_percent=lambda t: t["other_response"] * 100 / t["total_response"],
        # TODO (original note): ALSO need to check across all statements,
        # without unknown/yes in fiction filtering
        bad_output_percent=lambda t: t["bad_output"] * 100 / t["total_response"],
    )
)

df = conflict_by_model.loc[
    :,
    ["conflict_percent", "accuracy", "yes_no_response_percent", "other_response_percent", "bad_output_percent"],
].sort_values("conflict_percent")

df

In [None]:
# LaTeX source of the per-model table; floats are rendered with one decimal.
print(df[["accuracy", "other_response_percent", "bad_output_percent"]].to_latex(index=True,float_format="{:.1f}".format))

In [108]:
# Same aggregation as the per-model table, additionally split by response type.
conflict_by_rt = (
    pruned_df
    .groupby(["model_size", "model", "response_type"])
    .agg({
        "total_conflict": "sum",
        "total_yes_no": "sum",
        "other_response": "sum",
        "bad_output": "sum",
        "total_response": "sum",
    })
    .assign(
        conflict_percent=lambda t: t["total_conflict"] * 100 / t["total_yes_no"],
        accuracy=lambda t: 100 - t["conflict_percent"],
        yes_no_response_percent=lambda t: t["total_yes_no"] * 100 / t["total_response"],
        other_response_percent=lambda t: t["other_response"] * 100 / t["total_response"],
        bad_output_percent=lambda t: t["bad_output"] * 100 / t["total_response"],
    )
)

# One row per model, one accuracy column per response type;
# "diff" is the spread between the best and the worst response type.
df = (
    conflict_by_rt
    .reset_index()
    .pivot_table(index="model", columns="response_type", values="accuracy")
    .assign(diff=lambda t: t.max(axis=1) - t.min(axis=1))
)

In [None]:
# LaTeX source of the accuracy-by-response-type table (one decimal).
print(df.to_latex(index=True,float_format="{:.1f}".format))

In [41]:
# Same aggregation as the per-model table, additionally split by prompt type.
conflict_by_pt = (
    pruned_df
    .groupby(["model_size", "model", "prompt_type"])
    .agg({
        "total_conflict": "sum",
        "total_yes_no": "sum",
        "other_response": "sum",
        "bad_output": "sum",
        "total_response": "sum",
    })
    .assign(
        conflict_percent=lambda t: t["total_conflict"] * 100 / t["total_yes_no"],
        accuracy=lambda t: 100 - t["conflict_percent"],
        yes_no_response_percent=lambda t: t["total_yes_no"] * 100 / t["total_response"],
        other_response_percent=lambda t: t["other_response"] * 100 / t["total_response"],
        bad_output_percent=lambda t: t["bad_output"] * 100 / t["total_response"],
    )
)

# One row per model, one accuracy column per prompt type;
# "diff" is the spread between the best and the worst prompt.
df = (
    conflict_by_pt
    .reset_index()
    .pivot_table(index="model", columns="prompt_type", values="accuracy")
    .assign(diff=lambda t: t.max(axis=1) - t.min(axis=1))
)

In [None]:
# LaTeX source of the accuracy-by-prompt-type table (one decimal).
print(df.to_latex(index=True,float_format="{:.1f}".format))

In [None]:
## Find best combinations
# Only consider (model, response type, prompt) files where at least 70% of the
# responses are yes/no. .copy() so that adding "accuracy" below does not write
# into a slice of pruned_df (SettingWithCopyWarning).
df = pruned_df[pruned_df["total_yes_no"]/pruned_df["total_response"] >= 0.7].copy()
df["accuracy"] = 100 - df["total_conflict"]*100/df["total_yes_no"]
df = df[["model", "response_type", "prompt_type", "accuracy"]].sort_values("accuracy", ascending=False)
with pd.option_context('display.max_rows', None):
    _df = df.copy(True)
    # Display-only encoding: each response type gets its own "column" inside the
    # string so patterns are easy to spot when scrolling the full table.
    mp = {"2_options":"2......................", 
          "3_options":"........3...............", 
          "4_options":"...............4........", 
          "4_options/option_probs":
                     ".......................4a"}
    _df["response_type"] = _df["response_type"].apply(lambda x: mp[x])
    display(_df)

In [None]:
# LaTeX source for the 10 most accurate combinations (df is sorted by accuracy, descending).
print(df.head(10).to_latex(index=False,float_format="{:.1f}".format))

In [None]:
# LaTeX source for the 10 least accurate combinations.
print(df.tail(10).to_latex(index=False,float_format="{:.1f}".format))

In [None]:
# LaTeX source for the complete ranking.
print(df.to_latex(index=False,float_format="{:.1f}".format))

In [134]:
# Keep only the hand-picked models and strip the "<organisation>--" prefix.
# .copy() so the assignment below does not write into a slice (SettingWithCopyWarning).
# NOTE: this cell overwrites bo_or_count_df and is not re-runnable: once the names are
# shortened they no longer match the prefixed entries of pruned_models, so a second
# run filters those models out. Re-run get_conflict_df() first.
bo_or_count_df = bo_or_count_df[bo_or_count_df["model"].isin(pruned_models)].copy()
bo_or_count_df["model"] = bo_or_count_df["model"].apply(lambda x: x.split("--")[-1])

In [159]:
# conflict by category
# Numerator: conflicts per model and category, summed over response types and prompts.
conflict_by_ct = pruned_df.groupby(["model"]).agg({"Conspiracy":"sum", "Controversy":"sum" ,"Fact":"sum" ,"Fiction":"sum", "Misconception":"sum", "Stereotype":"sum"})
# Denominator: yes/no response counts per model and category (rows labelled "yes_no_count").
yes_no_count_df = bo_or_count_df.loc["yes_no_count"].groupby(["model"]).agg({"Conspiracy":"sum", "Controversy":"sum" ,"Fact":"sum" ,"Fiction":"sum", "Misconception":"sum", "Stereotype":"sum"})
# Element-wise division; pandas aligns both frames on the model index and the category columns.
conflict_by_ct = conflict_by_ct*100/yes_no_count_df

# Fix the column order used in the tables.
conflict_by_ct_df = conflict_by_ct[["Fact", "Conspiracy", "Controversy", "Misconception", "Stereotype", "Fiction"]]

In [160]:
# max - 2nd max: per model, how far the category with the most conflict is ahead
# of the runner-up category.
# DataFrame.max and Series.nlargest both skip NaN, so the two terms stay
# consistent (the builtin max over a row is not NaN-aware).
d = conflict_by_ct_df.max(axis=1) - conflict_by_ct_df.apply(lambda r: r.nlargest(2).iloc[1], axis=1)
d, d.mean()

(model
 Llama-2-13b-chat-hf        2.732201
 Llama-2-70b-chat-hf        4.185463
 Mistral-7B-OpenOrca       11.152448
 Mistral-7B-v0.1           26.180876
 Platypus2-70B-instruct     6.795469
 gpt-3.5-turbo              0.484620
 gpt-4                     10.230259
 gpt-4-1106-preview         7.544915
 text-davinci-003           8.418343
 zephyr-7b-alpha            7.513403
 dtype: float64,
 8.523799723467741)

In [None]:
# LaTeX source of the conflict-percentage-by-category table (one decimal).
print(conflict_by_ct_df.to_latex(index=True,float_format="{:.1f}".format))

In [182]:
## for each model and category pair, what is the least conflict in any given prompt or rt?
## i.e what is the optimal number for each model-category pair?

# Conflict percentage per (model, response type, prompt type) and category:
# conflict counts * 100 / yes-no response counts, aligned on the shared three-level index.
optimal_df = pruned_df.copy(True).set_index(["model", "response_type", "prompt_type"])[["Fact", "Conspiracy", "Controversy", "Misconception", "Stereotype", "Fiction"]]
yes_no_count_df = bo_or_count_df.loc["yes_no_count"].set_index(["model", "response_type", "prompt_type"])[["Fact", "Conspiracy", "Controversy", "Misconception", "Stereotype", "Fiction"]]
optimal_df = optimal_df*100/yes_no_count_df
optimal_df = optimal_df.reset_index()

# shows which rt, pt had the minimum value in the df below
# (each cell is the (response_type, prompt_type) index label of the per-model minimum)
idx_min_df = optimal_df.set_index(["response_type", "prompt_type"]).groupby("model").idxmin()[[
    "Fact", 
    "Conspiracy", 
    "Controversy", 
    "Misconception", 
    "Stereotype",
    "Fiction", 
    ]]

# display(idx_min_df)

# Lowest conflict percentage per model and category.
min_df = optimal_df.groupby("model").min()[[
    "Fact", 
    "Conspiracy", 
    "Controversy", 
    "Misconception", 
    "Stereotype",
    "Fiction", 
    ]]

# NOTE: a bare expression in the middle of a cell is not displayed.
min_df

# Same shape as idx_min_df; every cell is overwritten below with a
# "(bad-output %, other-response %)" string for the winning combination.
bo_and_other_response_df = idx_min_df.copy(deep=True)

for col in idx_min_df.columns:
    for ix, (rt, pt) in idx_min_df[col].items():
        # The category names used here contain no "_", so cat equals col.
        cat = col.split("_")[0]
        # Rows of bo_or_count_df that belong to this model / response type / prompt.
        cond = (bo_or_count_df["model"]==ix) & (bo_or_count_df["response_type"]==rt) & (bo_or_count_df["prompt_type"]==pt)
        bo_count = bo_or_count_df[cond].loc["bo_count", cat]
        or_count = bo_or_count_df[cond].loc["or_count", cat]
        # The "new_response" row holds the count of all responses in the category.
        tot_count = bo_or_count_df[cond].loc["new_response", cat]
        bo_and_other_response_df.loc[ix, col] = f"({bo_count*100/tot_count:.1f}, {or_count*100/tot_count:.1f})"

# Trailing ";" suppresses the cell output.
bo_and_other_response_df;

In [192]:
# Cell text = "<min conflict %> (<bad-output %>, <other-response %>)": the formatted
# minimum is string-concatenated element-wise with the matching tuple string.
df = min_df.applymap(lambda x: f"{x:.1f} ") + bo_and_other_response_df

In [None]:
# LaTeX source of the combined table; "0.0" values are shortened to "0" by plain string replacement.
print(df.to_latex(index=True).replace("(0.0", "(0").replace(" 0.0", " 0"))

In [None]:
# [DONE] conflict by model, by pt across rt (which prompt is good per model)
# conflict by pt across model and rt (which prompt is good?)

# conflict by rt (across model, pt) (which rt is best?)
# [DONE] conflict per rt per model (across pt)

# conflict by category (across everything else) (which category is most vulnerable)
# [DONE] conflict by model by category (across pt and rt) (which category is vulnerable by model)

## Inconsistent responses for the same statement with different prompts.
Find number of statements that change responses with prompt (per category and overall).

(+ sankey diagram and full-text for qualitative analyses)

- P0 vs P1-3 
- voting across prompts (2, 3, 4 votes)
- vote analysis across categories

Additionally:
- P3 vs P4: did it flip?


## P0 vs P1-3 

In [6]:
## Prompt consistency

def load_df(filename):
    """Read a classification-response CSV and expose the response as ``new_response``.

    The first CSV column is used as the index; ``new_response`` is a copy of
    whichever response column ``get_response_column`` selects.
    """
    frame = pd.read_csv(filename, index_col=0)
    response_col = get_response_column(frame.columns)
    frame["new_response"] = frame[response_col]
    return frame

def get_inter_prompt_conflict(ignore_yes_in_fiction, only_yes_in_fiction):
    """Count, per category, how often P1/P2/P3 responses differ from the P0 response.

    For every model and response type the P0 file is compared row by row
    (rows are matched on the CSV index) with the P1, P2 and P3 files. Responses
    are compared case-insensitively.

    Args:
        ignore_yes_in_fiction: Drop statements whose ground truth is
            "Yes in Fiction" before comparing.
        only_yes_in_fiction: Keep only statements whose ground truth is
            "Yes in Fiction". Setting both flags leaves no rows.

    Returns:
        DataFrame indexed by ``tags`` with one row per
        (model, response type, prompt type, category) holding ``p0_conflict``,
        ``p0_bad_output``, ``bad_output``, ``both_bad_output`` and ``total``
        counts plus the identifying columns. Empty when nothing was processed.
    """
    conflict_frames = []

    def _select_rows(frame):
        # Apply the ground-truth filters; .copy() so later column assignments
        # never write into a slice of the loaded frame.
        if ignore_yes_in_fiction:
            frame = frame[frame["ground_truth"] != "Yes in Fiction"]
        if only_yes_in_fiction:
            frame = frame[frame["ground_truth"] == "Yes in Fiction"]
        return frame.copy()

    for model_size in ["small", "large", "openai"]:
        if model_size == "openai":
            directory = f"model_responses/{model_size}"
            response_types = ["2_options", "3_options", "4_options"]
            include_probs = False
        else:
            directory = f"model_responses/{model_size}_model_runs/processed_model_responses_cls"
            response_types = ["2_options", "3_options", "4_options", "4_options/option_probs"]
            include_probs = True

        model_list = get_model_list(directory)
        proper_model_list = get_cls_proper_model_list(directory, model_list, include_probs)
        print(model_size, len(proper_model_list))

        for model in proper_model_list:
            for response_type in response_types:
                # Named response_dir rather than `dir` to avoid shadowing the builtin.
                response_dir = f"{directory}/{model}/{response_type}"

                p0_df = _select_rows(load_df(f"{response_dir}/classification_response_P0.csv"))

                # Bad Output flag of the P0 answer, to get a sense of the model's accuracy.
                p0_df = p0_df.assign(p0_bad_output=lambda r: r["new_response"] == "Bad Output")

                for prompt_type in ["P1", "P2", "P3"]:
                    df = _select_rows(load_df(f"{response_dir}/classification_response_{prompt_type}.csv"))

                    # Index-aligned copy of the P0 answers next to this prompt's answers.
                    df["p0_response"] = p0_df["new_response"]
                    df["p0_bad_output"] = p0_df["p0_bad_output"]

                    df = df.assign(p0_conflict=lambda r: r["p0_response"].str.lower() != r["new_response"].str.lower())
                    df = df.assign(bad_output=lambda r: r["new_response"] == "Bad Output")
                    df = df.assign(both_bad_output=lambda r: (r["new_response"] == "Bad Output") & (r["p0_response"] == "Bad Output"))

                    assert len(df) == len(p0_df)

                    # Placeholder column; only its per-category row count is used.
                    df["total"] = ""
                    conflicts = df.groupby("tags").agg({"p0_conflict":"sum", "p0_bad_output":"sum", "bad_output":"sum", "total":"count", "both_bad_output":"sum"})
                    conflicts["model_size"] = model_size
                    conflicts["model"] = model
                    conflicts["response_type"] = response_type
                    conflicts["prompt_type"] = prompt_type

                    conflict_frames.append(conflicts)

    # Concatenate once instead of growing a DataFrame inside the loops.
    return pd.concat(conflict_frames) if conflict_frames else pd.DataFrame()

# reset_index() turns the "tags" index into a regular column.
global_pt_conflict_df = get_inter_prompt_conflict(True, False).reset_index() # ignore Yes in Fiction since it is SUPPOSED to conflict, for total conflict calculation (by model and by rt)
# Same comparison restricted to the "Yes in Fiction" statements only.
global_pt_conflict_df_yes_in_fiction = get_inter_prompt_conflict(False, True).reset_index()

small 26
large 7
openai 4
small 26
large 7
openai 4


In [26]:
# Cache the P0-vs-P1..P3 conflict table (the "Yes in Fiction"-only variant is not saved here).
global_pt_conflict_df.to_csv("analysis_files/P0_conflict.csv", index=False)

In [None]:
# Reload the cached P0 conflict table and preview its layout.
global_pt_conflict_df = pd.read_csv("analysis_files/P0_conflict.csv")
global_pt_conflict_df.head(2)

In [47]:
# Keep only the hand-picked models. .copy() makes pruned_df an independent frame so the
# assignment below does not write into a slice of global_pt_conflict_df (SettingWithCopyWarning).
# NOTE: this rebinds pruned_df, which earlier held the ground-truth conflict rows.
pruned_df = global_pt_conflict_df[global_pt_conflict_df["model"].isin(pruned_models)].copy()
# Drop the "<organisation>--" prefix from the model names.
pruned_df["model"] = pruned_df["model"].apply(lambda x: x.split("--")[-1])

In [None]:
# Long -> wide: one row per (model, response type, category) and one column per
# (measure, prompt type) pair. 'total' and 'p0_bad_output' are part of the row key
# here, so they are carried through unchanged rather than pivoted.
pt_conflict = pruned_df.pivot(
    index = ['model_size', 'model', 'response_type', 'tags', 'total', 'p0_bad_output'],
    columns = ["prompt_type"],
    values = ["p0_conflict", 'bad_output', 'both_bad_output'],
)

pt_conflict

In [None]:
# Flatten the (measure, prompt type) column MultiIndex into plain names.
# NOTE(review): the names are assigned positionally and assume the pivot produced the
# measures in the order p0_conflict, bad_output, both_bad_output, each with prompts
# P1, P2, P3 — confirm against the pivot output above.
pt_conflict = pt_conflict.reset_index()
pt_conflict.columns = ['model_size', 'model', 'response_type', 'tags', 'total', 'p0_bad_output', 
                       'p0_conflict_w_p1','p0_conflict_w_p2','p0_conflict_w_p3', 
                       'p1_bad_output', 'p2_bad_output', 'p3_bad_output',
                       'p1_both_bad_output', 'p2_both_bad_output', 'p3_both_bad_output',
                       ]
pt_conflict.head(2)

#### Brainstorm all questions

set 1
- [DONE] [x] which model is most inconsistent? (by model across rt, category, pt)
- [x] which category is most consistent and inconsistent? (by category across model, rt, pt)
- [] which prompt is closest to P0? (across model, category, rt)
- [] ?? which rt produces most consistent / inconsistent responses?

set 2
- [DONE] [x] which prompt is most inconsistent by model? (across rt, category)
- [DONE] [x] which category is most inconsistent by model? (across rt, pt)
- [] ?? which rt is most inconsistent by model? (across category, pt)

set 3
- [x] which prompt is more/less consistent by model by rt? (across category)
- [DONE][x] which prompt is more/less consistent by model by category? (across rt)

In [None]:
# which prompt is more/less consistent by model by rt? (across category)
# NOTE(review): .sum() is applied to every remaining column, including the string
# columns model_size and tags; how those are handled depends on the pandas version.
pt_conflict_by_rt = pt_conflict.groupby(['model', 'response_type']).sum() #['p0_conflict_w_p1', 'p0_conflict_w_p2', 'p0_conflict_w_p3']

# Total number of disagreements with P0 over the three prompts P1, P2 and P3.
pt_conflict_by_rt["all_pt_conflict"] = pt_conflict_by_rt["p0_conflict_w_p1"] \
                                        + pt_conflict_by_rt["p0_conflict_w_p2"] \
                                        + pt_conflict_by_rt["p0_conflict_w_p3"]

# "total" is the number of statements compared for one prompt, hence the factor 3 overall.
pt_conflict_by_rt["conflict_percent"] = pt_conflict_by_rt["all_pt_conflict"]*100/(3*pt_conflict_by_rt["total"]) # 3 for P1, P2, P3
pt_conflict_by_rt["p1_conflict_percent"] = pt_conflict_by_rt["p0_conflict_w_p1"]*100/pt_conflict_by_rt["total"]
pt_conflict_by_rt["p2_conflict_percent"] = pt_conflict_by_rt["p0_conflict_w_p2"]*100/pt_conflict_by_rt["total"]
pt_conflict_by_rt["p3_conflict_percent"] = pt_conflict_by_rt["p0_conflict_w_p3"]*100/pt_conflict_by_rt["total"]

# pt_conflict_by_rt = pt_conflict_by_rt.sort_values("conflict_percent")
display(pt_conflict_by_rt.head(10))

In [None]:
# One row per model, one overall conflict-percentage column per response type.
df = pt_conflict_by_rt[['conflict_percent']].reset_index()#.sort_values("conflict_percent")
df = df.pivot_table(index="model", columns="response_type", values="conflict_percent")
# NOTE: a bare expression that is not the last statement of the cell is not displayed.
df
print(df.to_latex(index=True,float_format="{:.1f}".format))

In [None]:
# which model is most inconsistent? (by model across rt, category, pt)
pt_conflict_by_model = pt_conflict.groupby(['model_size', 'model']).sum()

# which prompt is most inconsistent by model? (acorss rt, category)
pt_conflict_by_model["all_pt_conflict"] = pt_conflict_by_model["p0_conflict_w_p1"] \
                                        + pt_conflict_by_model["p0_conflict_w_p2"] \
                                        + pt_conflict_by_model["p0_conflict_w_p3"]

pt_conflict_by_model["all_bo"] = pt_conflict_by_model["p1_both_bad_output"] \
                                        + pt_conflict_by_model["p2_both_bad_output"] \
                                        + pt_conflict_by_model["p3_both_bad_output"]

pt_conflict_by_model = pt_conflict_by_model.sort_values("all_pt_conflict")
display(pt_conflict_by_model.head(10))

In [52]:
# Convert the summed counts into percentages. "total" counts the statements compared
# for a single prompt, so the figures pooled over P1, P2 and P3 are divided by 3 * total.
pt_conflict_by_model = pt_conflict_by_model.assign(
    bo_percent=lambda t: t["all_bo"] * 100 / (3 * t["total"]),
    conflict_percent=lambda t: t["all_pt_conflict"] * 100 / (3 * t["total"]),
    p1_conflict_percent=lambda t: t["p0_conflict_w_p1"] * 100 / t["total"],
    p2_conflict_percent=lambda t: t["p0_conflict_w_p2"] * 100 / t["total"],
    p3_conflict_percent=lambda t: t["p0_conflict_w_p3"] * 100 / t["total"],
)

In [None]:
# Per-model inconsistency summary, most consistent model first.
df = pt_conflict_by_model[['p1_conflict_percent', 'p2_conflict_percent', 'p3_conflict_percent', 'conflict_percent', 'bo_percent']].sort_values("conflict_percent")
df

In [None]:
# LaTeX source of the per-model inconsistency table (one decimal).
print(df.to_latex(index=True,float_format="{:.1f}".format))

In [None]:
# which category is most consistent and inconsistent? (by category across model, rt, pt)
# (not computed yet; draft kept below)
# pt_conflict_by_category = pt_conflict.groupby(['tags']).sum()

# pt_conflict_by_category["all_pt_conflict"] = pt_conflict_by_category["p0_conflict_w_p1"] \
#                                         + pt_conflict_by_category["p0_conflict_w_p2"] \
#                                         + pt_conflict_by_category["p0_conflict_w_p3"]

# pt_conflict_by_category = pt_conflict_by_category.sort_values("all_pt_conflict")
# display(pt_conflict_by_category.head(10))


# which category is most inconsistent by model? (across rt, pt)
pt_conflict_by_category_by_model = pt_conflict.groupby(['model', 'tags']).sum()

# Total number of disagreements with P0 over the three prompts P1, P2 and P3.
pt_conflict_by_category_by_model["all_pt_conflict"] = pt_conflict_by_category_by_model["p0_conflict_w_p1"] \
                                        + pt_conflict_by_category_by_model["p0_conflict_w_p2"] \
                                        + pt_conflict_by_category_by_model["p0_conflict_w_p3"]

# The pooled figure is divided by 3 * total because it sums over three prompts.
pt_conflict_by_category_by_model["conflict_percent"] = pt_conflict_by_category_by_model["all_pt_conflict"]*100/(pt_conflict_by_category_by_model["total"]*3)
pt_conflict_by_category_by_model["p1_conflict_percent"] = pt_conflict_by_category_by_model["p0_conflict_w_p1"]*100/pt_conflict_by_category_by_model["total"]
pt_conflict_by_category_by_model["p2_conflict_percent"] = pt_conflict_by_category_by_model["p0_conflict_w_p2"]*100/pt_conflict_by_category_by_model["total"]
pt_conflict_by_category_by_model["p3_conflict_percent"] = pt_conflict_by_category_by_model["p0_conflict_w_p3"]*100/pt_conflict_by_category_by_model["total"]

display(pt_conflict_by_category_by_model.head(12)) 

In [58]:
# Reshape to one row per (model, conflict measure) with one column per category.
df = pt_conflict_by_category_by_model[['p1_conflict_percent', 'p2_conflict_percent','p3_conflict_percent', 'conflict_percent']]\
    .reset_index()

DF = pd.DataFrame()

for model, group_df in df.groupby("model"):
    # drop() returns a new frame, so the groupby slice itself is not modified.
    # (The former `group_df.set_index("tags")` call discarded its result and is removed.)
    group_df = group_df.drop(columns="model")
    # With only `columns` given, pivot_table puts the measures on the rows and the
    # categories on the columns; there is one value per cell, so the default
    # mean aggregation leaves it unchanged.
    group_df = group_df.pivot_table(columns=["tags"])
    group_df["model"] = model
    DF = pd.concat([DF, group_df])

# The measure names come out of reset_index() as the "index" column; call it "conflict".
df = DF.reset_index().rename(columns={"index":"conflict"}).set_index(["model"])

In [None]:
# Per-prompt rows only (p1/p2/p3_conflict_percent); the pooled conflict_percent row is excluded.
d = df[df["conflict"]!="conflict_percent"][["conflict", "Fact", "Conspiracy", "Controversy", "Misconception", "Stereotype", "Fiction"]]
print(d.to_latex(index=True,float_format="{:.1f}".format))

In [None]:
cats = ["Fact", "Conspiracy", "Controversy", "Misconception", "Stereotype", "Fiction"]
for cat in cats:
    # display(df.reset_index().set_index("conflict").groupby("model")[cat].idxmin()) # which prompt has the least conflict in each category
    # NOTE(review): df still contains the pooled "conflict_percent" row, so idxmax can
    # return that label instead of a single prompt — confirm this is intended.
    display(df.reset_index().set_index("conflict").groupby("model")[cat].idxmax()) # which prompt has the most conflict in each category

In [None]:
# Pooled conflict percentage (over P1, P2, P3) per model and category.
d = df[df["conflict"]=="conflict_percent"][["Fact", "Conspiracy", "Controversy", "Misconception", "Stereotype", "Fiction"]]
print(d.to_latex(index=True,float_format="{:.1f}".format))

In [69]:
# Turn the (model, tags) index into columns for the idxmax lookup in the next cell.
# NOTE: the variable is overwritten, so running this cell twice adds an extra "index" column.
pt_conflict_by_category_by_model = pt_conflict_by_category_by_model.reset_index()

In [None]:
## analyze this: is there a common most conflict category for most models?
# For each model, take the row (category) with the highest pooled conflict percentage.
common_category = pt_conflict_by_category_by_model.loc[
    pt_conflict_by_category_by_model.groupby(['model'])["conflict_percent"].idxmax()][['model', "tags", "conflict_percent"]
]
display(common_category)

display(common_category["tags"].value_counts())

# Recorded result of the value_counts above.
# NOTE(review): the counts add up to 37, which matches the 26 + 7 + 4 models printed when
# loading, so this was presumably recorded from a run over all models rather than the
# pruned subset — confirm before quoting.
"""
Fiction          15
Misconception     8
Fact              7
Stereotype        7
"""
# This means Fiction is the category with the most conflict in 15 models.
# Conspiracy and Controversy are never the 'most conflict' category in any model.

### Only prompt conflict in Yes in Fiction

In [None]:
# Keep only the hand-picked models. .copy() so the assignment below does not write into
# a slice of global_pt_conflict_df_yes_in_fiction (SettingWithCopyWarning).
df = global_pt_conflict_df_yes_in_fiction[global_pt_conflict_df_yes_in_fiction["model"].isin(pruned_models)].copy()
df["model"] = df["model"].apply(lambda x: x.split("--")[-1])

# Long -> wide: one column per (measure, prompt type) pair.
df = df.pivot(
    index = ['model_size', 'model', 'response_type', 'tags', 'total', 'p0_bad_output'],
    columns = ["prompt_type"],
    values = ["p0_conflict", 'bad_output', 'both_bad_output'],
)

# Flatten the column MultiIndex. The names are assigned positionally and assume the
# pivot keeps the measure order given above, each with prompts P1, P2, P3.
df = df.reset_index()
df.columns = ['model_size', 'model', 'response_type', 'tags', 'total', 'p0_bad_output', 
            'p0_conflict_w_p1','p0_conflict_w_p2','p0_conflict_w_p3', 
            'p1_bad_output', 'p2_bad_output', 'p3_bad_output',
            'p1_both_bad_output', 'p2_both_bad_output', 'p3_both_bad_output',
            ]
display(df.head(2))


# which model is most inconsistent? (by model across rt, category, pt)
# NOTE: this rebinds pt_conflict_by_model, replacing the all-statements version computed earlier.
pt_conflict_by_model = df.groupby(['model']).sum()

# Total number of disagreements with P0 over the three prompts P1, P2 and P3.
pt_conflict_by_model["all_pt_conflict"] = pt_conflict_by_model["p0_conflict_w_p1"] \
                                        + pt_conflict_by_model["p0_conflict_w_p2"] \
                                        + pt_conflict_by_model["p0_conflict_w_p3"]

# Number of rows where both the P0 answer and the P1/P2/P3 answer were "Bad Output".
pt_conflict_by_model["all_bo"] = pt_conflict_by_model["p1_both_bad_output"] \
                                        + pt_conflict_by_model["p2_both_bad_output"] \
                                        + pt_conflict_by_model["p3_both_bad_output"]

pt_conflict_by_model = pt_conflict_by_model.sort_values("all_pt_conflict")
display(pt_conflict_by_model.head(10))

pt_conflict_by_model["bo_percent"] = pt_conflict_by_model["all_bo"]*100/(3*pt_conflict_by_model["total"]) # 3 for P1, P2, P3
pt_conflict_by_model["conflict_percent"] = pt_conflict_by_model["all_pt_conflict"]*100/(3*pt_conflict_by_model["total"]) # 3 for P1, P2, P3
pt_conflict_by_model["p1_conflict_percent"] = pt_conflict_by_model["p0_conflict_w_p1"]*100/pt_conflict_by_model["total"]
pt_conflict_by_model["p2_conflict_percent"] = pt_conflict_by_model["p0_conflict_w_p2"]*100/pt_conflict_by_model["total"]
pt_conflict_by_model["p3_conflict_percent"] = pt_conflict_by_model["p0_conflict_w_p3"]*100/pt_conflict_by_model["total"]

df = pt_conflict_by_model[['p1_conflict_percent', 'p2_conflict_percent', 'p3_conflict_percent', 'conflict_percent', 'bo_percent']].sort_values("conflict_percent")
display(df)

# The LaTeX table omits bo_percent and keeps the all_pt_conflict ordering.
df = pt_conflict_by_model[['p1_conflict_percent', 'p2_conflict_percent', 'p3_conflict_percent', 'conflict_percent']]
print(df.to_latex(index=True,float_format="{:.1f}".format))

## Voting across prompts and categories
- voting across prompts (2, 3, 4 votes)
- vote analysis across categories

Qs:
- [TODO] [ ] per model
- [x] vote per model, per rt, per category
- [TODO] [x] per model per category
- [x] per category (across models and rt) 

In [65]:
## Prompt consistency

# One frame per (model, response type), holding the P0..P4 responses side by side.
vote_frames = []

# Same definition as load_df in the "P0 vs P1-3" section; repeated so this cell
# can run on its own.
def load_df(filename):
    """Read a classification-response CSV and expose the response as ``new_response``."""
    df = pd.read_csv(filename, index_col=0)
    df["new_response"] = df[get_response_column(df.columns)]
    return df

for model_size in ["small", "large", "openai"]:
    if model_size == "openai":
        directory = f"model_responses/{model_size}"
        response_types = ["2_options", "3_options", "4_options"]
        include_probs = False
    else:
        directory = f"model_responses/{model_size}_model_runs/processed_model_responses_cls"
        response_types = ["2_options", "3_options", "4_options", "4_options/option_probs"]
        include_probs = True

    model_list = get_model_list(directory)
    proper_model_list = get_cls_proper_model_list(directory, model_list, include_probs)
    print(model_size, len(proper_model_list))

    for model in proper_model_list:
        for response_type in response_types:
            cols = ['text', 'tags', 'sub_tags', 'ground_truth', 'paper link', 'data_source']
            # .copy() so the per-prompt columns added below are written to an independent
            # frame rather than a column slice (SettingWithCopyWarning).
            intermediate_df = load_df(f"{directory}/{model}/{response_type}/classification_response_P0.csv")[cols].copy()

            for prompt_type in ["P0", "P1", "P2", "P3", "P4"]:
                df = load_df(f"{directory}/{model}/{response_type}/classification_response_{prompt_type}.csv")
                # Every prompt file must describe exactly the same statements.
                assert len(intermediate_df[cols].compare(df[cols])) == 0 # no difference
                intermediate_df[prompt_type + "_response"] = df["new_response"]

            intermediate_df["model_size"] = model_size
            intermediate_df["model"] = model
            intermediate_df["response_type"] = response_type

            vote_frames.append(intermediate_df)

# Concatenate once instead of growing a DataFrame inside the loops.
global_vote_df = pd.concat(vote_frames, ignore_index=True) if vote_frames else pd.DataFrame()

small 26
large 7
openai 4


In [66]:
def count_vote(row):
    """Return how many of the P0..P3 responses share the most common value.

    `row` is looked up by the keys `P0_response` .. `P3_response`; the result is the
    size of the largest group of identical entries among those four.
    """
    tally = {}
    for prompt_col in [f"P{i}_response" for i in range(4)]:
        answer = row[prompt_col]
        tally[answer] = tally.get(answer, 0) + 1

    return max(tally.values())

# Apply count_vote to every row and store the result in a new "vote" column.
global_vote_df["vote"] = global_vote_df.apply(count_vote, axis=1)
# vote = 1 means each of the prompts P0-P3 produced a different output!

In [67]:
# Write the vote table to disk (without the index); later cells read it back from this path.
global_vote_df.to_csv("analysis_files/vote_across_prompts.csv", index=False)

In [15]:
# Read the saved vote table and print its row count before filtering.
global_vote_df = pd.read_csv("analysis_files/vote_across_prompts.csv")
print(len(global_vote_df))

# Drop rows whose ground truth is "Yes in Fiction"; the last expression shows the remaining row count.
global_vote_df = global_vote_df[global_vote_df["ground_truth"]!="Yes in Fiction"]
len(global_vote_df)

127440


122112

In [17]:
# Keep only the models listed in `pruned_models` and shorten "org--name" identifiers to "name".
# .copy() makes pruned_df an independent frame, so the column assignment below does not
# write into a slice of global_vote_df (pandas SettingWithCopyWarning).
pruned_df = global_vote_df[global_vote_df["model"].isin(pruned_models)].copy()
pruned_df["model"] = pruned_df["model"].apply(lambda x: x.split("--")[-1])

In [None]:
# per category (across models and rt)
# Number of rows for every (vote, tags) pair, stored in a single "count" column.
per_cat = pruned_df.groupby(["vote", "tags"]).agg({"text":"count"}).rename(columns={"text":"count"})
# display(per_cat)

# NOTE(review): the original note here reads "sort by category, rename legend title" —
# looks like a pending TODO; only the category order is fixed (via hue_order) below.
# Grouped bar chart: x = vote, bar height = count, one bar colour per tag.
g = sns.catplot(
    data=per_cat.reset_index(), kind="bar",
    x="vote", y="count", hue="tags",
    hue_order=["Fact", "Conspiracy", "Controversy", "Misconception", "Fiction", "Stereotype"],
)

In [None]:
# Per-model distribution of vote values, as a percentage of that model's rows.
# reindex guarantees columns 1..4 exist (filled with 0) even when some vote value never
# occurs in pruned_df; without it the loop below raises KeyError on the missing column.
df = (
    pruned_df.groupby(["model", "vote"]).size()
    .unstack(fill_value=0)
    .reindex(columns=[1, 2, 3, 4], fill_value=0)
)
df["total"] = df.sum(axis=1)
for i in range(1,5):
    df[i] = df[i]*100/df["total"]

df

In [None]:
# Print columns 1-4 of `df` as a LaTeX table, rows sorted by column 4 in descending order,
# floats formatted with one decimal.
print(df[[1,2,3,4]].sort_values(4, ascending=False).to_latex(index=True,float_format="{:.1f}".format))

In [58]:
# per model per category
# Row count for every (model_size, model, vote, tags) combination ...
per_cat_per_model = pruned_df.groupby(["model_size", "model", "vote", "tags"]).agg({"text":"count"}).rename(columns={"text":"Count"})
# ... converted to a percentage of the (model_size, model, tags) total.
# transform("sum") returns the group total aligned to the original rows, so the index stays
# [model_size, model, vote, tags]. The previous groupby.apply(lambda x: 100 * x / float(x.sum()))
# relied on float() of a one-element Series and on apply not prepending the group keys.
group_totals = per_cat_per_model.groupby(["model_size", "model", "tags"]).transform("sum")
per_cat_per_model = 100 * per_cat_per_model / group_totals

## sort by category, rename legend title
def draw_per_model_per_cat_vote(model_size, col_order):
    """Draw the per-category vote bar chart with one facet per model of `model_size`.

    Reads the notebook-level `per_cat_per_model` table. `col_order` is passed to
    seaborn as the order of the model facets. Nothing is returned.
    """
    display_names = {"tags":"Categories", "vote":"Vote Count", "model":"Model"}
    plot_df = per_cat_per_model.reset_index().rename(columns=display_names)
    plot_df = plot_df.loc[plot_df["model_size"] == model_size]

    category_order = ["Fact", "Conspiracy", "Controversy", "Misconception", "Stereotype", "Fiction"]
    grid = sns.catplot(
        data=plot_df,
        kind="bar",
        x="Vote Count",
        y="Count",
        hue="Categories",
        col="Model",
        hue_order=category_order,
        col_order=col_order,
    )
    # y axis is fixed to 0..100 on every facet
    grid.set(ylim=(0, 100))
    grid.set_axis_labels(y_var="% vote frequency by category")

In [None]:
# One figure per model-size group; col_order lists the shortened model names in facet order.
draw_per_model_per_cat_vote("openai", col_order=['text-davinci-003','gpt-3.5-turbo','gpt-4','gpt-4-1106-preview'])
draw_per_model_per_cat_vote("large", col_order=['Llama-2-13b-chat-hf','Llama-2-70b-chat-hf','Platypus2-70B-instruct'])
draw_per_model_per_cat_vote("small", col_order=['Mistral-7B-v0.1','Mistral-7B-OpenOrca','zephyr-7b-alpha'])

In [None]:
# vote per model, per rt, per category
# Row count (column "Count") for every (model_size, model, tags, response_type, vote) combination.
per_cat_per_model_per_rt = pruned_df.groupby(["model_size", "model", "tags", "response_type", "vote"]).agg({"text":"count"}).rename(columns={"text":"Count"})
per_cat_per_model_per_rt.head(2)

In [110]:
def draw_per_rt_per_cat_vote(model):
    """Plot and save the per-category vote counts of one model, one facet per response type.

    Reads the notebook-level `per_cat_per_model_per_rt` table, keeps the rows whose model
    equals `model`, titles the figure with the model name and writes it as a PNG under
    analysis_files/figs/per_rt_vote/. Nothing is returned.
    """
    df = per_cat_per_model_per_rt.reset_index().rename(columns={"tags":"Categories", "vote":"Vote Count", "model":"Model", "response_type":"Response Type"})
    df = df[df["Model"]==model]
    g = sns.catplot(
        data=df, kind="bar",
        x="Vote Count", y="Count", hue="Categories", col="Response Type", #row="model",
        hue_order=["Fact", "Conspiracy", "Controversy", "Misconception", "Stereotype", "Fiction"],
    )
    # leave room above the facets for the figure-level title
    g.figure.subplots_adjust(top=0.8)
    g.figure.suptitle(model)
    # savefig does not create missing directories; make sure the target folder exists
    # so the cell also works on a fresh checkout.
    out_dir = "analysis_files/figs/per_rt_vote"
    os.makedirs(out_dir, exist_ok=True)
    g.savefig(f"{out_dir}/per_model_per_rt_per_cat_vote_{model}.png")

In [None]:
# Draw one figure per pruned model, using the part of the identifier after the last "--".
for model in pruned_models:
    model = model.split("--")[-1]
    draw_per_rt_per_cat_vote(model);

In [None]:
# per model per category count of 4 votes

# Row counts per (model, tags) split by vote value. reindex guarantees columns 1..4 exist
# (filled with 0) even when a vote value never occurs; otherwise df[i] below raises KeyError.
df = (
    pruned_df.groupby(["model","tags", "vote"]).size()
    .unstack(fill_value=0)
    .reindex(columns=[1, 2, 3, 4], fill_value=0)
)
df["total"] = df.sum(axis=1)
for i in range(1,5):
    df[i] = df[i]*100/df["total"]

# Keep only the vote == 4 percentage and lay it out as a model x category table.
df = df[[4]].reset_index().pivot_table(index="model", columns="tags", values=4)[["Fact", "Conspiracy", "Controversy", "Misconception", "Stereotype", "Fiction"]]
df

In [None]:
# Print the current `df` as a LaTeX table, floats formatted with one decimal.
print(df.to_latex(index=True,float_format="{:.1f}".format))

In [None]:
# per model per rt count of 4 votes

# Row counts per (model, response_type) split by vote value. reindex guarantees columns 1..4
# exist (filled with 0) even when a vote value never occurs; otherwise df[i] below raises KeyError.
df = (
    pruned_df.groupby(["model","response_type", "vote"]).size()
    .unstack(fill_value=0)
    .reindex(columns=[1, 2, 3, 4], fill_value=0)
)
df["total"] = df.sum(axis=1)
for i in range(1,5):
    df[i] = df[i]*100/df["total"]

# Keep only the vote == 4 percentage and lay it out as a model x response-type table.
df = df[[4]].reset_index().pivot_table(index="model", columns="response_type", values=4)
df

In [None]:
# Print the current `df` as a LaTeX table, floats formatted with one decimal.
print(df.to_latex(index=True,float_format="{:.1f}".format))

### P3 vs P4 (Did it flip?)
From p3->p4
- Neither, C, D, should remain the same
- BO in p3 or p4 should be ignored
- Yes/A should become No/B and vice versa

"flipped" -> higher is better

In [60]:
# Reload the saved vote table and drop rows whose ground truth is "Yes in Fiction".
global_vote_df = pd.read_csv("analysis_files/vote_across_prompts.csv")
global_vote_df = global_vote_df[global_vote_df["ground_truth"]!="Yes in Fiction"]

# Keep only the models in `pruned_models` and shorten "org--name" identifiers to "name".
# .copy() makes the filtered frame independent, so the column assignment below does not
# write into a slice of the frame read above (pandas SettingWithCopyWarning).
global_vote_df = global_vote_df[global_vote_df["model"].isin(pruned_models)].copy()
global_vote_df["model"] = global_vote_df["model"].apply(lambda x: x.split("--")[-1])

HANDLE BO. So far:
- Accuracy: BO is ignored. Only yes/no responses are considered. *This means that if a model produces all BO and 1/2 correct answers, it gets to be 100% accurate, which is not good.*
- Consistency: BO is NOT ignored. This means BO--BO was considered consistent, while Yes/No--BO is inconsistent. *This means a model gets points for producing BO--BO, which is not good? maybe?*
- Resolution: BO is ignored. Only yes/no is considered. *This means that if a model produces all BO and 1/2 yes-no, it is considered 100% correct, which is not good.*

In [None]:
# ignore BO
# Work on an independent copy holding only the columns needed for the P3-vs-P4 comparison.
flip_df = global_vote_df[["model_size", "model", "response_type", "tags", "ground_truth", "P3_response", "P4_response"]].copy(deep=True)
# Keep a row only when neither the P3 nor the P4 response is "Bad Output".
flip_df = flip_df[(flip_df["P3_response"]!="Bad Output") & (flip_df["P4_response"]!="Bad Output")]
## ALSO SHOW THE NUMBER OF BO, because we don't want to give a model points if it keeps producing
# BO while we penalize a model for producing a reasonable but incorrect answer.

def is_cd_good(p3, p4):
    """Return True when P3 answered "Neither", "C" or "D" and P4 gave the same answer."""
    stable_answers = ("Neither", "C", "D")
    if p3 not in stable_answers:
        return False
    return p3 == p4

def is_ab_good(p3, p4):
    """Return True when P3 and P4 are opposite yes/no-style answers.

    "A", "YES" and "Yes" count as yes; "B", "NO" and "No" count as no.
    Any other value in either argument gives False.
    """
    canonical = dict.fromkeys(("A", "YES", "Yes"), "Yes")
    canonical.update(dict.fromkeys(("B", "NO", "No"), "No"))

    if p3 not in canonical:
        return False
    if p4 not in canonical:
        return False
    # opposite answers <=> the pair covers both canonical values
    return {canonical[p3], canonical[p4]} == {"Yes", "No"}

# Row-wise flags: cd_good = P3/P4 kept the same C/D/Neither answer, ab_good = P3/P4 gave opposite yes/no answers.
flip_df["cd_good"] = flip_df.apply(lambda r: is_cd_good(r["P3_response"], r["P4_response"]), axis=1)
flip_df["ab_good"] = flip_df.apply(lambda r: is_ab_good(r["P3_response"], r["P4_response"]), axis=1)

# The two checks accept disjoint sets of P3 answers, so at most one flag is True per row;
# their sum cast to int is the combined 0/1 "flipped" indicator.
flip_df["flipped"] = (flip_df["cd_good"]+flip_df["ab_good"]).astype(int)
flip_df

In [None]:
# per model, per rt, per tag flip
# flipped_count = sum of the 0/1 "flipped" flag; total_count = number of rows in the group,
# i.e. the total among which "flipped" counts as correct.
df = (
    flip_df
    .groupby(["model_size", "model", "response_type", "tags"])
    .agg(flipped_count=("flipped", "sum"), total_count=("flipped", "count"))
    .reset_index()
)
df.to_csv("analysis_files/p3_flipped.csv", index=False)
df

In [None]:
# per model
# flipped_sum = rows flagged as flipped, should_flip_count = all rows of the model in flip_df.
per_model_flip = flip_df.groupby(["model"]).agg(
    flipped_sum=("flipped", "sum"),
    should_flip_count=("flipped", "count"),
)
per_model_flip["percent"] = per_model_flip["flipped_sum"].mul(100).div(per_model_flip["should_flip_count"])

per_model_flip.reset_index().to_csv("analysis_files/p3_flipped_by_model.csv", index=False)

per_model_flip.sort_values("percent", ascending=False)

In [None]:
# Find percent of BO
# A row counts as BO when the P3 or the P4 response is "Bad Output";
# bo_sum = number of such rows per model, bo_count = all rows of that model.
bo_df = (
    global_vote_df[["model_size", "model", "response_type", "tags", "ground_truth", "P3_response", "P4_response"]]
    .copy(deep=True)
    .assign(bo=lambda d: (d["P3_response"]=="Bad Output") | (d["P4_response"]=="Bad Output"))
    .groupby(["model"])
    .agg(bo_sum=("bo", "sum"), bo_count=("bo", "count"))
)
bo_df["bo_percent"] = bo_df["bo_sum"].mul(100).div(bo_df["bo_count"])
bo_df

In [None]:
# Attach the BO percentage as column "bo_output" (pandas aligns the values on the index).
# NOTE(review): `bo_df` is reassigned with other groupings in later cells; this line
# presumably expects the per-model version — confirm the cell run order.
per_model_flip["bo_output"] = bo_df["bo_percent"]
per_model_flip

In [None]:
# LaTeX table of flip percentage and BO percentage per model, sorted by flip percentage (descending).
df = per_model_flip[["percent", "bo_output"]].sort_values("percent", ascending=False)
print(df.to_latex(index=True,float_format="{:.1f}".format))

In [None]:
# per model, per tag flip
# Turn "tags" into a categorical with a fixed category order. This overwrites the column in flip_df.
flip_df["tags"] = pd.Categorical(flip_df["tags"], ["Fact", "Conspiracy", "Controversy", "Misconception", "Fiction", "Stereotype"])
# flipped_sum = rows flagged as flipped, should_flip_count = rows in the (model, tags) group.
per_model_per_tag_flip = flip_df.groupby(["model", "tags"]).agg({"flipped":["sum", "count"]})
per_model_per_tag_flip.columns = ["flipped_sum", "should_flip_count"]
per_model_per_tag_flip["percent"] = per_model_per_tag_flip["flipped_sum"]*100/per_model_per_tag_flip["should_flip_count"]
# per_model_per_tag_flip = per_model_per_tag_flip.sort_values("percent", ascending=False)

# Find percent of BO
# Computed from global_vote_df (which still contains the "Bad Output" rows), grouped by the same keys.
bo_df = global_vote_df[["model_size", "model", "response_type", "tags", "ground_truth", "P3_response", "P4_response"]].copy(deep=True)
bo_df["bo"] = (bo_df["P3_response"]=="Bad Output") | (bo_df["P4_response"]=="Bad Output")
bo_df = bo_df.groupby(["model", "tags"]).agg({"bo":["sum", "count"]})
# Flatten the two-level column labels into "bo_sum" and "bo_count".
bo_df.columns = ['_'.join(col).strip() for col in bo_df.columns.values]
bo_df["bo_percent"] = bo_df["bo_sum"]*100/bo_df["bo_count"]

# Copy the BO statistics next to the flip statistics (values are aligned on the index).
per_model_per_tag_flip["bo_percent"] = bo_df["bo_percent"]
per_model_per_tag_flip["bo_count"] = bo_df["bo_sum"]
per_model_per_tag_flip["total_response_count"] = bo_df["bo_count"]
# bo_sum + should_flip_count == total_response_count

per_model_per_tag_flip.head(2)

In [None]:
# Format every (model, tag) cell as "<flip %> (<BO %>)" and pivot to a model x tag table.
# .copy() makes `df` an independent frame, so adding the "string" column does not write
# into a slice of per_model_per_tag_flip (pandas SettingWithCopyWarning).
df = per_model_per_tag_flip[["percent", "bo_percent"]].copy()
# The f-string already yields str values, so no extra astype(str) is needed.
df["string"] = df.apply(lambda r: f"{r['percent']:.1f} ({r['bo_percent']:.1f})", axis=1)
df = df[["string"]]
# Each (model, tags) pair holds exactly one string, so the identity aggfunc just passes it through.
df = df.reset_index().pivot_table(index="model", columns="tags", aggfunc=lambda x: x)
df

In [None]:
# Print the current `df` as a LaTeX table.
print(df.to_latex(index=True,float_format="{:.1f}".format))

In [None]:
# per model, per rt flip
# flipped_sum = rows flagged as flipped, should_flip_count = rows in the (model, response_type) group.
per_model_per_rt_flip = flip_df.groupby(["model", "response_type"]).agg({"flipped":["sum", "count"]})
per_model_per_rt_flip.columns = ["flipped_sum", "should_flip_count"]
per_model_per_rt_flip["percent"] = per_model_per_rt_flip["flipped_sum"]*100/per_model_per_rt_flip["should_flip_count"]

# Find percent of BO
# Computed from global_vote_df (which still contains the "Bad Output" rows), grouped by the same keys.
bo_df = global_vote_df[["model_size", "model", "response_type", "tags", "ground_truth", "P3_response", "P4_response"]].copy(deep=True)
bo_df["bo"] = (bo_df["P3_response"]=="Bad Output") | (bo_df["P4_response"]=="Bad Output")
bo_df = bo_df.groupby(["model", "response_type"]).agg({"bo":["sum", "count"]})
# Flatten the two-level column labels into "bo_sum" and "bo_count".
bo_df.columns = ['_'.join(col).strip() for col in bo_df.columns.values]
bo_df["bo_percent"] = bo_df["bo_sum"]*100/bo_df["bo_count"]

# Copy the BO statistics next to the flip statistics (values are aligned on the index).
per_model_per_rt_flip["bo_percent"] = bo_df["bo_percent"]
per_model_per_rt_flip["bo_count"] = bo_df["bo_sum"]
per_model_per_rt_flip["total_response_count"] = bo_df["bo_count"]
# bo_sum + should_flip_count == total_response_count

# Format every (model, response_type) cell as "<flip %> (<BO %>)".
# .copy() makes `df` an independent frame, so adding the "string" column does not write
# into a slice of per_model_per_rt_flip (pandas SettingWithCopyWarning).
df = per_model_per_rt_flip[["percent", "bo_percent"]].copy()
# The f-string already yields str values, so no extra astype(str) is needed.
df["string"] = df.apply(lambda r: f"{r['percent']:.1f} ({r['bo_percent']:.1f})", axis=1)
df = df[["string"]]
# Each (model, response_type) pair holds exactly one string, so the identity aggfunc passes it through.
df = df.reset_index().pivot_table(index="model", columns="response_type", aggfunc=lambda x: x)
display(df.head())
print(df.to_latex(index=True,float_format="{:.1f}".format))

## Model confidence (MCQ prob)
Only consider models that did not randomize and/or have relatively good combined probability.
- How do confidence scores change for responses that remained the same. "Number of statements with ≥20% points absolute change of confidence as compared to prompt 0."