In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score
import pandas as pd
import numpy as np
from glob import glob
import statistics
import joblib

def calc_scores(preds, average):
    """Score one bootstrap replicate of ``preds``.

    The frame is resampled with replacement to its original size
    (``frac=1``), then the three sklearn metrics are evaluated on the
    resampled rows.

    Parameters
    ----------
    preds : pandas.DataFrame
        Must provide a ``"completion"`` column (used as ``y_true``) and a
        ``"preds"`` column (used as ``y_pred``).
    average : str
        Forwarded unchanged as the ``average=`` argument of each metric.

    Returns
    -------
    tuple
        ``(f1, recall, precision)`` for the resampled rows.
    """
    resampled = preds.sample(frac=1, replace=True)
    y_true = resampled["completion"]
    y_pred = resampled["preds"]

    # Metric order fixes the position of each score in the returned tuple.
    metrics = (f1_score, recall_score, precision_score)
    return tuple(metric(y_true, y_pred, average=average) for metric in metrics)


def bootstrapped_ci(preds, model, regime, average="weighted", n_replicate=10000, n_cores=10):
    """Point estimates and percentile bootstrap intervals for F1/recall/precision.

    Parameters
    ----------
    preds : pandas.DataFrame
        Needs a ``"completion"`` column (truth) and a ``"preds"`` column.
    model, regime
        Copied verbatim into the ``model`` / ``regime`` columns of the result.
    average : str, default "weighted"
        Passed as ``average=`` to every metric call.
    n_replicate : int, default 10000
        Number of ``calc_scores`` replicates to draw.
    n_cores : int, default 10
        ``n_jobs`` handed to ``joblib.Parallel``.

    Returns
    -------
    pandas.DataFrame
        A single row: ``model``, ``regime`` and, for each metric, the
        full-sample estimate plus the 2.5th and 97.5th percentiles of the
        replicate scores.
    """
    y_true, y_pred = preds["completion"], preds["preds"]
    f1_estimate = f1_score(y_true, y_pred, average=average)
    recall_estimate = recall_score(y_true, y_pred, average=average)
    precision_estimate = precision_score(y_true, y_pred, average=average)

    # One row per replicate; columns follow calc_scores' (f1, recall, precision).
    jobs = (joblib.delayed(calc_scores)(preds, average) for _ in range(n_replicate))
    replicates = np.vstack(joblib.Parallel(n_jobs=n_cores)(jobs))

    # Column-wise percentiles: lower[k] / upper[k] bound metric k.
    lower, upper = np.percentile(replicates, [2.5, 97.5], axis=0)

    row = {
        "model": [model],
        "regime": [regime],
        "f1_estimate": [f1_estimate],
        "f1_ci.low": [lower[0]],
        "f1_ci.high": [upper[0]],
        "recall_estimate": [recall_estimate],
        "recall_ci.low": [lower[1]],
        "recall_ci.high": [upper[1]],
        "precision_estimate": [precision_estimate],
        "precision_ci.low": [lower[2]],
        "precision_ci.high": [upper[2]],
    }
    return pd.DataFrame(row)


def thread_to_df(obj):
    """Flatten per-thread JSON predictions into one row per comment/reply.

    Each entry of ``obj['preds']`` is expected to hold a JSON object with a
    ``"comment"`` mapping and, optionally, a ``"replies"`` list of mappings.
    The text is cleaned with a few literal substitutions (stray ``", {"``,
    doubled ``}}``, markdown code fences), passed through
    ``json_repair.repair_json`` and parsed.

    Parameters
    ----------
    obj : pandas.DataFrame
        Must contain a ``"preds"`` column of JSON-like strings. The frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        One row per comment (``reply_id == 0``) and per reply, with a
        1-based ``post_id`` giving the position of the source row in ``obj``
        and a ``preds`` column equal to
        ``stanceTrump + ", " + stanceClinton``. The index is a fresh
        ``RangeIndex``. An empty input yields an empty frame carrying the
        expected columns.

    Raises
    ------
    KeyError
        If a parsed thread has no ``"comment"`` key, or the flattened rows
        lack ``stanceTrump`` / ``stanceClinton``.
    """
    import json
    from json_repair import repair_json

    # Clean into a separate Series so the caller's frame gains no helper column.
    cleaned = obj['preds'].replace({", {":", ",
                                    "}}":"}",
                                    "```json":"",
                                    "```":""}, regex=True).str.strip()

    frames = []
    for post_id, raw in enumerate(cleaned, start=1):
        # Parse once and reuse the result for both the comment and its replies.
        parsed = json.loads(repair_json(raw))

        comment = pd.DataFrame.from_dict([parsed['comment']])
        comment['reply_id'] = 0
        parts = [comment]

        # A thread without (or with empty) replies contributes only its comment.
        replies = parsed.get('replies')
        if replies:
            parts.append(pd.DataFrame.from_dict(replies))

        output = pd.concat(parts, axis=0)
        output['post_id'] = post_id
        frames.append(output)

    if not frames:
        return pd.DataFrame(columns=["stanceTrump", "stanceClinton", "reply_id", "post_id", "preds"])

    # Single concat instead of growing a frame inside the loop.
    preds = pd.concat(frames, axis=0, ignore_index=True)
    preds["preds"] = preds["stanceTrump"] + ", " + preds["stanceClinton"]

    return preds


def thread_ci(test_cleaned, preds_cleaned, model, regime):
    """Bootstrap score rows for the thread task at several granularities.

    Rows are produced, in this order, for:

    1. all rows (``reply_set="overall"``, ``reply_set_id="overall"``);
    2. for each ``reply`` in 0..5, the threads whose reply ids sum to
       ``reply * (reply + 1) / 2`` (i.e. 0 + 1 + ... + ``reply``), first as a
       whole and then once per ``reply_id`` 0..``reply`` within that subset;
    3. for each ``reply_id`` in 0..5 across all threads
       (``reply_set="overall"``).

    Parameters
    ----------
    test_cleaned : pandas.DataFrame
        Needs ``post_id``, ``reply_id`` and ``completion`` columns.
    preds_cleaned : pandas.DataFrame
        Needs ``post_id``, ``reply_id`` and ``preds`` columns.
    model, regime
        Forwarded to ``bootstrapped_ci``.

    Returns
    -------
    pandas.DataFrame
        The ``bootstrapped_ci`` rows stacked vertically, each tagged with
        ``reply_set``, ``reply_set_id`` and ``dataset="thread"``.

    Notes
    -----
    Both input frames receive a ``reply_id_sum`` column as a side effect.
    Truth and predictions are paired by index label when each scoring frame
    is built.
    """
    # Per-thread sum of reply ids, written onto both inputs.
    for frame in (test_cleaned, preds_cleaned):
        frame["reply_id_sum"] = frame.groupby("post_id")["reply_id"].transform("sum")

    def rows_where(frame, column, value):
        # Row filter that keeps every column.
        return frame.loc[frame[column] == value, :]

    def score(truth_rows, pred_rows, reply_set, reply_set_id):
        # Pair truth with predictions, score, and tag the resulting row.
        paired = pd.DataFrame({"completion": truth_rows["completion"],
                               "preds": pred_rows["preds"]})
        ci = bootstrapped_ci(preds=paired, model=model, regime=regime)
        ci["reply_set"] = reply_set
        ci["reply_set_id"] = reply_set_id
        ci["dataset"] = "thread"
        return ci

    # Seeded with an empty frame, mirroring an accumulate-by-concat build.
    pieces = [pd.DataFrame()]

    pieces.append(score(test_cleaned, preds_cleaned, "overall", "overall"))

    for reply in range(6):
        reply_id_sum = reply * (reply + 1) / 2
        truth_set = rows_where(test_cleaned, "reply_id_sum", reply_id_sum)
        pred_set = rows_where(preds_cleaned, "reply_id_sum", reply_id_sum)

        pieces.append(score(truth_set, pred_set, reply, "overall"))

        for r in range(reply + 1):
            pieces.append(score(rows_where(truth_set, "reply_id", r),
                                rows_where(pred_set, "reply_id", r),
                                reply, r))

    for r in range(6):
        pieces.append(score(rows_where(test_cleaned, "reply_id", r),
                            rows_where(preds_cleaned, "reply_id", r),
                            "overall", r))

    return pd.concat(pieces, axis=0)


def combine_prediction(df):
    """Collapse per-comment predictions into one joint label per thread.

    The frame is pivoted to one row per ``post_id`` with one column per
    ``reply_id``; the labels in reply slots 0..4 are then joined with
    ``", "`` into a single ``preds`` string. Slots a thread does not have
    contribute an empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``post_id``, ``reply_id`` and ``preds`` columns, with at most
        one row per (``post_id``, ``reply_id``). The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``post_id``, ``reply_set`` (largest ``reply_id`` in the
        thread), one column per reply id, and the joint ``preds`` string.
    """
    # NOTE(review): thread_ci in this notebook iterates reply ids 0..5, but
    # only slots 0..4 are joined here (unchanged) -- confirm whether a sixth
    # slot should be part of the joint label.
    joined_slots = range(5)

    # assign() returns a new frame, so the caller's df gains no column.
    tagged = df.assign(reply_set=df.groupby('post_id')['reply_id'].transform('max'))
    wide = tagged.pivot(index=['post_id', 'reply_set'], columns='reply_id', values='preds')

    # Guarantee every joined slot exists even when no thread is that long;
    # otherwise the join below raises KeyError on short datasets.
    all_slots = sorted(set(wide.columns) | set(joined_slots))
    wide = wide.reindex(columns=all_slots).reset_index().fillna("")

    joint = wide[joined_slots[0]]
    for slot in joined_slots[1:]:
        joint = joint + ", " + wide[slot]
    wide['preds'] = joint

    return wide

# Model identifiers to score. Only the part after the last "/" is used when
# building prediction file names in the cells below.
target = [
    "bert-base-uncased",
    # The comma after this entry matters: without it Python concatenates this
    # literal with the next one and all-mpnet-base-v2 is never matched.
    "sentence-transformers/all-mpnet-base-v2",
    "microsoft/deberta-v3-base",
    "DeBERTa-v3-base-MNLI",
    "google/flan-t5-xxl",
    "mistralai/Mistral-7B-v0.3",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Meta-Llama-3-70B",
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Meta-Llama-3-70B-Instruct",
    "davinci",
    "ada",
    "gpt4o"
]

# Prompt Engineering

In [None]:
# Display names for the three prompt variants; also drives the prompt loop below.
x_label = {"prompt1":"Minimal",
           "prompt2":"Sentence",
           "prompt3":"Context"
           }

# Seeded with an empty frame so the stacked result matches an
# accumulate-by-concat build.
ci_frames = [pd.DataFrame()]

for task in ["fb", "semeval"]:
    for num_data in x_label:  # prompt1, prompt2, prompt3 (insertion order)
        for t in ["Trump", "Clinton", "Joint"]:
            # Re-read per target: the frame is narrowed/overwritten below.
            preds = pd.read_csv(f"predicted_labels/{task}_davinci_zero_{num_data}.csv")

            if t != "Joint" and task == "fb":
                # Labels are "<first>, <second>" pairs; keep the half for this
                # target (Trump -> position 0, Clinton -> position 1).
                idx = ["Trump", "Clinton"].index(t)
                for column in ("completion", "preds"):
                    preds[column] = preds[column].apply(lambda x:x.split(", ")[idx])
            elif t != "Joint" and task == "semeval":
                # Keep only rows whose completion starts with the target name.
                preds = preds.loc[preds["completion"].str.startswith(f"{t.title()}"),:]

            ci = bootstrapped_ci(preds, "davinci", num_data)
            ci["target"] = t
            ci["dataset"] = task
            ci_frames.append(ci)

cis = pd.concat(ci_frames, axis=0)

# Swap the promptN codes for their display names before saving.
cis['regime'] = cis['regime'].replace(x_label, regex=True)
cis.to_csv("tables-and-plots/raw_scores/prompt_engineering.csv", index=False)

# Bootstrapping for Tasks 1 and 2: Twitter and Facebook

In [None]:
# Dataset prefix used to build prediction file names and stored in the
# "dataset" column of the scores; switch to "semeval" and re-run to score the
# other task.
task = "fb" # "semeval"
# Directory containing the prediction CSVs; used both as the glob root and as
# the prefix when reading each matched file.
PATH = "predicted_labels/"

In [None]:
cis = pd.DataFrame()

# File names use only the part of each model id after the last "/". It is
# computed outside the f-strings because reusing double quotes inside an
# f-string replacement field is a SyntaxError before Python 3.12.

## Zero shot
for m in target:
    model_name = m.split("/")[-1]
    # glob returns an empty list when a model has no zero-shot file, so
    # missing models are skipped silently.
    for f in glob(f"{task}_{model_name}_zero.csv", root_dir=PATH):
        preds = pd.read_csv(PATH+f)

        ci = bootstrapped_ci(preds=preds, model=model_name, regime="zero-shot")
        ci["dataset"] = task
        cis = pd.concat([cis, ci], axis=0)

## One shot
for m in target:
    model_name = m.split("/")[-1]
    one_shot_files = glob(f"{task}_{model_name}_one_eg_*.csv", root_dir=PATH)
    if not one_shot_files:
        # No one-shot runs for this model: nothing to pool. Without this
        # guard the mode of an empty pool would be indexed, or a frame left
        # over from an earlier iteration would be scored.
        continue

    runs = [pd.read_csv(PATH+f) for f in one_shot_files]

    # One column per run; the row-wise mode is the majority prediction
    # (first mode on ties).
    pooled = pd.concat(
        [run["preds"].rename(f"preds_{i}") for i, run in enumerate(runs)],
        axis=1,
    )
    preds_mode = pooled.mode(axis=1)[0]

    # Score against the completion column of the last run read
    # (presumably identical across runs -- TODO confirm).
    preds = runs[-1].copy()
    preds["preds"] = preds_mode

    ci = bootstrapped_ci(preds, model=model_name, regime="one-shot")
    ci["dataset"] = task
    cis = pd.concat([cis, ci], axis=0)

## Fine tuned
regime = ["10", "100", "1000", "all"]
for m in target:
    model_name = m.split("/")[-1]
    for num_data in regime:
        for f in glob(f"{task}_{model_name}_{num_data}.csv", root_dir=PATH):
            preds = pd.read_csv(PATH+f)

            ci = bootstrapped_ci(preds, model=model_name, regime=num_data)
            ci["dataset"] = task
            cis = pd.concat([cis, ci], axis=0)

cis.to_csv("tables-and-plots/raw_scores/tasks_1_2.csv", index=False)

# Bootstrapping for Task 3: Facebook comment threads

In [None]:
# Reference frame for the thread task. The cells below pass it to thread_ci,
# which scores its "completion" column against model predictions, and to
# combine_prediction for the threadwise joint labels.
test_cleaned = pd.read_csv("data/thread_test_cleaned.csv")

## JSON zero-shot and instruction-tuned

In [None]:
# Seeded with an empty frame so the stacked result matches an
# accumulate-by-concat build.
ci_frames = [pd.DataFrame()]

for num_data in ["zero", "instruction_tuned"]:
    for m in ["Meta-Llama-3-8B-Instruct", "Meta-Llama-3-70B-Instruct", "gpt4o"]:
        # The gpt4o / instruction_tuned combination is not processed.
        if num_data == "instruction_tuned" and m == "gpt4o":
            continue

        # Raw JSON predictions -> one row per comment/reply.
        preds = pd.read_csv(f"predicted_labels/thread_{m}_{num_data}.csv")
        preds_cleaned = thread_to_df(preds)
        preds_cleaned.to_csv(f"predicted_labels/thread_{m}_{num_data}_cleaned.csv", index=False)

        # Per-thread joint labels, saved for the threadwise joint score cell.
        joint = combine_prediction(preds_cleaned)
        joint.to_csv(f"predicted_labels/thread_{m}_{num_data}_thread_joint_score.csv", index=False)

        ci = thread_ci(test_cleaned, preds_cleaned, m, num_data)
        ci_frames.append(ci)

cis = pd.concat(ci_frames, axis=0)
cis.to_csv("tables-and-plots/raw_scores/task3.csv", index=False)

## GPT-4o baseline zero-shot

In [None]:
# Baseline predictions are read as-is (no thread_to_df step here).
preds_baseline = pd.read_csv("predicted_labels/thread_gpt4o_baseline.csv")

# Save the per-thread joint labels for the threadwise joint score cell.
baseline_joint = combine_prediction(preds_baseline)
baseline_joint.to_csv("predicted_labels/thread_gpt4o_baseline_thread_joint_score.csv", index=False)

# Bootstrap scores for the baseline.
cis = thread_ci(test_cleaned, preds_baseline, "gpt4o", "baseline")
cis.to_csv("tables-and-plots/raw_scores/task3_baseline.csv", index=False)

## Threadwise joint score

In [None]:
# Joint reference label per thread, built by treating the truth column as
# "preds" so combine_prediction can be reused.
test_baseline_thread_joint = combine_prediction(test_cleaned.rename(columns={"completion":"preds"}))
cis = pd.DataFrame()
for num_data in ["zero", "instruction_tuned", "baseline"]:
    for m in ["Meta-Llama-3-8B-Instruct", "Meta-Llama-3-70B-Instruct", "gpt4o"]:
        try:
            preds_cleaned = pd.read_csv(f"predicted_labels/thread_{m}_{num_data}_thread_joint_score.csv")
        except FileNotFoundError:
            # Not every model/regime combination has a joint-score file.
            continue

        # Truth and predictions are paired by row label.
        preds = pd.DataFrame({"completion": test_baseline_thread_joint["preds"],
                              "preds": preds_cleaned["preds"]})

        # bootstrapped_ci requires model and regime; they also label the row.
        ci = bootstrapped_ci(preds, model=m, regime=num_data)
        ci["reply_set"] = "overall"
        ci["dataset"] = "thread"
        cis = pd.concat([cis, ci], axis=0)

        # Same score restricted to threads whose largest reply id is `reply`.
        for reply in [0, 1, 2, 3, 4, 5]:
            test_baseline_thread_joint_subset = test_baseline_thread_joint.loc[test_baseline_thread_joint['reply_set']==reply,:]
            preds_cleaned_subset = preds_cleaned.loc[preds_cleaned['reply_set']==reply,:]

            preds = pd.DataFrame({"completion": test_baseline_thread_joint_subset["preds"],
                                "preds": preds_cleaned_subset["preds"]})

            ci = bootstrapped_ci(preds, model=m, regime=num_data)
            ci["reply_set"] = reply
            ci["dataset"] = "thread"
            cis = pd.concat([cis, ci], axis=0)

cis.to_csv("tables-and-plots/raw_scores/task3_threadwise_joint.csv", index=False)