# Set-Up

In [None]:
from experiment import load_dataset, clean_datasets, generate_args
from src.logger import Logger
from src.builder import DatasetBuilder
from datasets import Dataset
import pandas as pd
import os

# Shared notebook state: the results logger, a seeded dataset builder and the
# per-dataset settings dictionary.
logger = Logger(log_filename="performances_climateFEVER")
dataset_builder = DatasetBuilder(seed=42)
# `args` is indexed by dataset name further down, e.g. args[name]['label_columns'].
args = generate_args(dataset_builder, ['climateFEVER_evidence', 'climateFEVER_evidence_climabench', 'climateFEVER_claims'], logger)

# Prefix of every model identifier built below (f"{HF_REPO}/<model name>").
# NOTE(review): "anonymous" looks like a placeholder — confirm the real namespace before running.
HF_REPO = "anonymous"

In [None]:
def prepare_dataset(dataset_name="climateFEVER_evidence"):
    """Load a dataset and return its test split.

    The train and dev splits returned by ``load_dataset`` (imported from the
    project's ``experiment`` module) are discarded. Earlier versions also
    sliced label/input columns through ``args`` into locals that were never
    used; that dead code has been removed.

    Args:
        dataset_name: Name of the dataset to load.

    Returns:
        The test split exactly as returned by ``load_dataset``.
    """
    _train, test, _dev = load_dataset(dataset_name)
    return test

In [None]:
def compute_macro_label(df):
    """Collapse evidence-level predictions into one label per claim.

    Rows are grouped by (``claim``, ``claim_label``) and the values of
    ``pred_pair_label`` inside each group are reduced as follows:

    * both SUPPORTS and REFUTES present -> "DISPUTED"
    * SUPPORTS present                  -> "SUPPORTS"
    * REFUTES present                   -> "REFUTES"
    * anything else                     -> "NOT_ENOUGH_INFO"

    Returns:
        Series of aggregated labels indexed by (claim, claim_label).
    """

    def _aggregate(pair_labels):
        seen = set(pair_labels)
        has_support = "SUPPORTS" in seen
        has_refute = "REFUTES" in seen
        if has_support and has_refute:
            return "DISPUTED"
        if has_support:
            return "SUPPORTS"
        if has_refute:
            return "REFUTES"
        # Only NOT_ENOUGH_INFO or unrecognised labels remain.
        return "NOT_ENOUGH_INFO"

    per_claim = df.groupby(['claim', 'claim_label'])["pred_pair_label"]
    return per_claim.apply(_aggregate)

# Finetuned

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

def evaluate_climatefever(test_dataset, model=f"{HF_REPO}/climateFEVER_evidence_climabench_42_distilRoBERTa", max_length=512):
    """Classify every (evidence, claim) pair and aggregate the predictions per claim.

    Args:
        test_dataset: DataFrame with ``text`` and ``query`` columns (passed to
            the pipeline as ``text`` / ``text_pair``) plus the ``claim`` and
            ``claim_label`` columns required by ``compute_macro_label``.
            A ``pred_pair_label`` column is added to it in place.
        model: Identifier of the fine-tuned text-classification model.
        max_length: Length inputs are padded/truncated to.

    Returns:
        DataFrame (reset index) with one row per (claim, claim_label) and the
        aggregated prediction in ``pred_pair_label``.

    Raises:
        KeyError: If the ``HUB_TOKEN`` environment variable is not set.
    """
    pipe = pipeline("text-classification", model=model, token=os.environ['HUB_TOKEN'],  padding="max_length", truncation=True, max_length=max_length, device=0)

    # Build the sentence-pair inputs column-wise instead of row by row.
    inputs_list = [
        {'text': text, 'text_pair': query}
        for text, query in zip(test_dataset['text'], test_dataset['query'])
    ]

    outputs_list = pipe.predict(inputs_list)

    # Keep only the predicted class name of each pipeline output.
    test_dataset['pred_pair_label'] = [output['label'] for output in outputs_list]

    return compute_macro_label(test_dataset).reset_index()

In [None]:
# DistilRoBERTa trained on the ClimaBench variant: classify the evidence pairs of
# the test split, aggregate them per claim, then hand gold vs. aggregated labels
# to the logger and save. Rebinds the notebook globals `test` and `macro_labels`.
test = prepare_dataset("climateFEVER_evidence_climabench")
macro_labels = evaluate_climatefever(test, model=f"{HF_REPO}/climateFEVER_evidence_climabench_42_distilRoBERTa", max_length=512)
logger.add_precomputed_f1_score(
    y_test=macro_labels['claim_label'], 
    y_pred=macro_labels['pred_pair_label'],
    dataset_name="climateFEVER_claim_climabench_agg", 
    model_type="distilRoBERTa", 
    n_labels=4,
)
logger.save()

In [None]:
# DistilRoBERTa trained on the default evidence split: same procedure, logged
# under "climateFEVER_claim_agg". Rebinds `test` and `macro_labels`.
test = prepare_dataset("climateFEVER_evidence")
macro_labels = evaluate_climatefever(test, model=f"{HF_REPO}/climateFEVER_evidence_42_distilRoBERTa", max_length=512)
logger.add_precomputed_f1_score(
    y_test=macro_labels['claim_label'], 
    y_pred=macro_labels['pred_pair_label'],
    dataset_name="climateFEVER_claim_agg", 
    model_type="distilRoBERTa", 
    n_labels=4,
)
logger.save()

In [None]:
# Longformer on the default evidence split; inputs are padded/truncated to 4096
# instead of 512. Rebinds `test` and `macro_labels`.
test = prepare_dataset("climateFEVER_evidence")

macro_labels = evaluate_climatefever(test, model=f"{HF_REPO}/climateFEVER_evidence_42_longformer", max_length=4096)

logger.add_precomputed_f1_score(
    y_test=macro_labels['claim_label'], 
    y_pred=macro_labels['pred_pair_label'],
    dataset_name="climateFEVER_claim_agg", 
    model_type="longformer", 
    n_labels=4,
)
logger.save()

In [None]:
# Longformer on the ClimaBench variant (max_length=4096), logged under
# "climateFEVER_claim_climabench_agg". Rebinds `test` and `macro_labels`.
test = prepare_dataset("climateFEVER_evidence_climabench")

macro_labels = evaluate_climatefever(test, model=f"{HF_REPO}/climateFEVER_evidence_climabench_42_longformer", max_length=4096)
logger.add_precomputed_f1_score(
    y_test=macro_labels['claim_label'], 
    y_pred=macro_labels['pred_pair_label'],
    dataset_name="climateFEVER_claim_climabench_agg", 
    model_type="longformer", 
    n_labels=4,
)
logger.save()

# Human Baseline

In [None]:
import pandas as pd

In [None]:
# Generator (project `utils` module) provides the `load_dataset` call used for
# the human-baseline analysis below.
from utils import Generator
generator = Generator()

In [None]:
# Reload the evidence-level splits through Generator; the fourth returned value
# is discarded. This rebinds the notebook globals `train`, `test` and `dev`.
train, test, dev, _ = generator.load_dataset("climateFEVER_evidence")

In [None]:
import re
import numpy as np

def parse_evidences_votes(text):
    """Extract the annotator votes from the first ``array([...])`` literal in *text*.

    Args:
        text: String containing something like
            ``array(['SUPPORTS', None, 'REFUTES'], dtype=object)``; the
            literal may be wrapped over several lines.

    Returns:
        A list with one entry per vote: the label without its surrounding
        single quotes, or ``None`` for a missing vote. An empty array yields
        an empty list. ``np.nan`` is returned when no ``array([`` literal is
        found.
    """
    # DOTALL lets the capture run across line-wrapped array reprs.
    match = re.search(r"array\(\[(.*?)\]", text, re.DOTALL)
    if match is None:
        return np.nan

    body = match.group(1).strip()
    if not body:
        return []

    votes = []
    for raw_item in body.split(","):
        item = raw_item.strip()
        # Compare only after stripping: a `None` that follows a line wrap carries
        # leading whitespace and must still become a real None, not the text "None".
        votes.append(None if item == "None" else item.strip("'"))
    return votes

# Parse the stringified `evidences` field of every row into its list of votes.
test['votes'] = test['evidences'].apply(parse_evidences_votes)

In [None]:
# Spread each row's vote list over positional columns (0, 1, 2, ...) aligned on
# the original index. `pop` removes the temporary 'votes' column from `test`
# before the join runs, so it does not appear in `exploded_df`.
exploded_df = test.join(pd.DataFrame(test.pop('votes').tolist(), index=test.index))

In [None]:
# Give the positional vote columns readable names (column 0 -> annotator1, ...).
exploded_df.rename(columns={
    0:"annotator1",
    1:"annotator2",
    2:"annotator3",
    3:"annotator4",
    4:"annotator5",
}, inplace=True)

In [None]:
from sklearn.metrics import classification_report

# Evidence-level human baseline: score each annotator's votes against the gold
# `label`, using only the rows that annotator actually voted on, and collect
# the macro-averaged F1 per annotator.
f1_scores = []
for i in (1, 2, 3, 5):
    annotator = f"annotator{i}"

    answered = exploded_df[annotator].notna()
    y_true = exploded_df.loc[answered, 'label']
    y_pred = exploded_df.loc[answered, annotator]

    report = classification_report(
        y_true=y_true,
        y_pred=y_pred,
        output_dict=True,
        zero_division=0.0,
    )
    print(report)

    f1_scores.append(report['macro avg']['f1-score'])

In [None]:
import numpy as np
import scipy.stats as stats

# Summarise the per-annotator evidence-level macro-F1 scores as a mean with a
# 95% confidence interval (mean +/- z * standard error).
# NOTE(review): the normal quantile is used although `f1_scores` holds only four
# values; a Student-t quantile would give a wider interval — confirm this is intended.

# Data
data = f1_scores

# Step 1: Compute the mean
mean = np.mean(data)

# Step 2: Compute the standard deviation
std_dev = np.std(data, ddof=1)  # ddof=1 for sample standard deviation

# Step 3: Sample size
n = len(data)

# Step 4: Compute the Standard Error of the Mean (SEM)
sem = std_dev / np.sqrt(n)

# Step 5: Determine the confidence level (95% -> Z = 1.96)
confidence_level = 0.95
z_score = stats.norm.ppf((1 + confidence_level) / 2)

# Step 6: Calculate the Margin of Error (ME)
margin_of_error = z_score * sem

# Step 7: Compute the confidence interval
confidence_interval = (mean - margin_of_error, mean + margin_of_error)

# Last expression of the cell: displayed as the cell output.
mean, confidence_interval


## Claims

In [None]:
def compute_macro_label_annotator(df, annotator):
    """Collapse one annotator's evidence-level votes into one label per claim.

    Rows are grouped by (``claim``, ``claim_label``) and the values found in
    the column named by *annotator* are reduced as follows:

    * both SUPPORTS and REFUTES present -> "DISPUTED"
    * SUPPORTS present                  -> "SUPPORTS"
    * REFUTES present                   -> "REFUTES"
    * anything else                     -> "NOT_ENOUGH_INFO"

    Args:
        df: DataFrame holding ``claim``, ``claim_label`` and the vote column.
        annotator: Name of the vote column to aggregate.

    Returns:
        Series of aggregated labels indexed by (claim, claim_label).
    """

    def _aggregate(votes):
        seen = set(votes)
        has_support = "SUPPORTS" in seen
        has_refute = "REFUTES" in seen
        if has_support and has_refute:
            return "DISPUTED"
        if has_support:
            return "SUPPORTS"
        if has_refute:
            return "REFUTES"
        # Only NOT_ENOUGH_INFO or unrecognised votes remain.
        return "NOT_ENOUGH_INFO"

    per_claim = df.groupby(['claim', 'claim_label'])[annotator]
    return per_claim.apply(_aggregate)

In [None]:
# Claim-level human baseline: aggregate each annotator's votes per claim and
# score the aggregated label against the gold `claim_label`, collecting the
# macro-averaged F1 per annotator.
claims_f1 = []
for i in (1, 2, 3, 5):
    annotator = f"annotator{i}"

    # Restrict to the rows this annotator voted on.
    subset_test = exploded_df[exploded_df[annotator].notna()].copy()

    macro_labels = compute_macro_label_annotator(subset_test, annotator).reset_index()

    report = classification_report(
        y_true=macro_labels['claim_label'],
        y_pred=macro_labels[annotator],
        output_dict=True,
        zero_division=0.0,
    )
    print(report)

    claims_f1.append(report['macro avg']['f1-score'])

In [None]:
import numpy as np
import scipy.stats as stats

# Summarise the per-annotator claim-level macro-F1 scores as a mean with a
# 95% confidence interval (mean +/- z * standard error).
# NOTE(review): the normal quantile is used although `claims_f1` holds only four
# values; a Student-t quantile would give a wider interval — confirm this is intended.

# Data
data = claims_f1

# Step 1: Compute the mean
mean = np.mean(data)

# Step 2: Compute the standard deviation
std_dev = np.std(data, ddof=1)  # ddof=1 for sample standard deviation

# Step 3: Sample size
n = len(data)

# Step 4: Compute the Standard Error of the Mean (SEM)
sem = std_dev / np.sqrt(n)

# Step 5: Determine the confidence level (95% -> Z = 1.96)
confidence_level = 0.95
z_score = stats.norm.ppf((1 + confidence_level) / 2)

# Step 6: Calculate the Margin of Error (ME)
margin_of_error = z_score * sem

# Step 7: Compute the confidence interval
confidence_interval = (mean - margin_of_error, mean + margin_of_error)

# Last expression of the cell: displayed as the cell output.
mean, confidence_interval

# LLM

In [None]:
import os
import json
import re

from zero_shot import load_dict, extract_prompt, update_question, map_lobbymap_stance, prepare_content


# Load the task descriptions from llm/mappings/task_description.json.
with open(os.path.join("llm", "mappings", "task_description.json"), 'r', encoding='utf-8') as file:
    task_descriptions = json.load(file)

# Load the label mapping; `find_errors` reads
# label_readable_mapping[dataset_name]['labels'] to map readable names back to label keys.
with open(os.path.join("llm", "mappings", "label_annotation.json"), 'r', encoding='utf-8') as file:
    label_readable_mapping = json.load(file)

# Prompts loaded with the project's `load_dict` helper
# (presumably parses the JSON file into a dict — confirm in zero_shot).
prompts = load_dict("llm/prompts.json")

import re

def parse_label_explanation(text):
    """Split a model response into its ``Label:`` and ``Explanation:`` parts.

    Args:
        text: Raw response text.

    Returns:
        ``(label, explanation)``. ``label`` is the rest of the line following
        ``Label:`` with square brackets removed and surrounding whitespace
        stripped; ``explanation`` is everything after ``Explanation:`` (it may
        span several lines), stripped. Either element is ``None`` when its
        marker is absent from *text*.
    """
    label_match = re.search(r'Label:\s*(.*)', text)
    # DOTALL: the explanation runs to the end of the response, across newlines.
    explanation_match = re.search(r'Explanation:\s*(.*)', text, re.DOTALL)

    label = None
    if label_match:
        # Guarded so that a response without a "Label:" line yields None instead
        # of raising AttributeError on None.replace(...).
        label = label_match.group(1).replace('[', "").replace(']', "").strip()

    explanation = explanation_match.group(1).strip() if explanation_match else None

    return label, explanation

def find_errors(dataset_name, gpt4o=False):
    """Attach the stored LLM answers for *dataset_name* to its test rows.

    Reads one JSON object per line from the results file, takes the content
    of the first choice's message from each response, parses it with
    ``parse_label_explanation`` and stores the results as new columns.

    Args:
        dataset_name: Base name of both the ``.jsonl`` results file and the
            data file.
        gpt4o: When true, read from ``llm/outputs/gpt-4o/`` instead of
            ``llm/outputs/full/``. The output column names stay
            ``gpt-4o-mini_*`` either way.

    Returns:
        The test DataFrame with ``gpt-4o-mini_label`` and
        ``gpt-4o-mini_explanation`` columns added. If *dataset_name* has an
        entry in ``label_readable_mapping``, the label column is lower-cased
        and mapped from readable names back to label keys.
    """
    if gpt4o:
        result_file_name = f"llm/outputs/gpt-4o/{dataset_name}.jsonl"
    else:
        # Previously considered location: llm/outputs/{dataset_name}.jsonl
        result_file_name = f"llm/outputs/full/{dataset_name}.jsonl"

    # Explicit UTF-8 (as for the other files opened in this notebook) so decoding
    # does not depend on the platform default; blank lines are skipped rather
    # than crashing json.loads.
    with open(result_file_name, 'r', encoding='utf-8') as file:
        results = [
            json.loads(line.strip())["response"]['body']["choices"][0]['message']['content']
            for line in file
            if line.strip()
        ]

    parsed = [parse_label_explanation(result) for result in results]
    labels = [label for label, _ in parsed]
    explanations = [explanation for _, explanation in parsed]

    # Previously considered location: doccano/random/parquet/{dataset_name}.pkl
    # NOTE(review): read_parquet is applied to a ".pkl" path — confirm the file
    # really is Parquet (otherwise pd.read_pickle would be the matching reader).
    test = pd.read_parquet(os.path.join("data", "llm_green_nlp_tasks", f"{dataset_name}.pkl"))

    # One answer per test row is required; pandas raises on a length mismatch.
    test['gpt-4o-mini_label'] = labels
    test['gpt-4o-mini_explanation'] = explanations

    if dataset_name in label_readable_mapping:
        label2id = {v.lower(): k for k, v in label_readable_mapping[dataset_name]['labels'].items()}
        test['gpt-4o-mini_label'] = test['gpt-4o-mini_label'].str.lower().map(label2id)

    return test

In [None]:
import pandas as pd
# Load the stored LLM answers for the evidence-level task (default gpt4o=False,
# i.e. the files under llm/outputs/full/). Rebinds the notebook global `test`.
test = find_errors("climateFEVER_evidence")

In [None]:
# LLM variant: no model is run here, the stored labels are aggregated directly.
def evaluate_climatefever(test_dataset):
    """Aggregate the stored LLM evidence-level labels into claim-level labels.

    Note that this rebinds ``evaluate_climatefever``, replacing the
    pipeline-based definition from the "Finetuned" section for the rest of
    the notebook.

    Args:
        test_dataset: DataFrame with ``gpt-4o-mini_label``, ``claim`` and
            ``claim_label`` columns. A ``pred_pair_label`` column is added to
            it in place.

    Returns:
        DataFrame (reset index) with one row per (claim, claim_label) and the
        aggregated label in ``pred_pair_label``.
    """
    # Read from the argument itself; the previous version read the notebook
    # global `test`, ignoring whatever frame the caller passed in.
    test_dataset['pred_pair_label'] = test_dataset["gpt-4o-mini_label"].copy()

    return compute_macro_label(test_dataset).reset_index()

In [None]:
# Claim-level scoring of the LLM labels held in `test`: aggregate per claim,
# hand gold vs. aggregated labels to the logger under model_type "gpt-4o-mini"
# and save.
macro_labels = evaluate_climatefever(test)
logger.add_precomputed_f1_score(
    y_test=macro_labels['claim_label'], 
    y_pred=macro_labels['pred_pair_label'],
    dataset_name="climateFEVER_claim_agg", 
    model_type="gpt-4o-mini", 
    n_labels=4,
)
logger.save()