# Set Up

In [None]:
import ast

import pandas as pd
import numpy as np
from utils import Generator
import os
# Dataset metadata sheet; the table-building cells read its "name" and "source" columns.
paper_list_path = os.path.join(os.getcwd(), "paper_utils", "other", "paper_list.xlsx")
info_df = pd.read_excel(paper_list_path, sheet_name="datasets")

In [None]:
# Shared Generator instance; the cells below use its `dataset_list`, `args` and dataset loaders.
generator = Generator()

# Dataset sizes

In [None]:
# Collect, per dataset, the number of rows in each raw split and the largest
# `token_counts` value found across the three splits.
sizes = {}
excluded = {"ClimaINS_ours", "climateFEVER_evidence_climabench"}

for dataset_name in set(generator.dataset_list) - excluded:
    train, test, dev = generator.loading_raw_datasets(dataset_name)
    # The concatenated frame is only needed for this single lookup, so it is
    # never bound to a name and can be released right away.
    max_tokens = pd.concat([train, test, dev])['token_counts'].max()

    sizes[dataset_name] = {
        "train": len(train),
        "test": len(test),
        "dev": len(dev),
        "token": max_tokens,
    }

# Order the datasets by ascending test-split size.
sorted_by_values = dict(sorted(sizes.items(), key=lambda item: item[1]["test"]))

# One LaTeX row per dataset: name+citation & train & dev & test & max tokens.
for dataset_name, split_sizes in sorted_by_values.items():
    latex_name = dataset_name.replace("_", "\\_").replace("&", "\\&")
    citation = info_df[info_df["name"] == dataset_name]["source"].values[0].replace("citet", "cite")
    cells = [latex_name + citation]
    cells.extend(str(split_sizes[key]) for key in ("train", "dev", "test", "token"))
    line = " & ".join(cells) + "\\\\"

    print(line)

# Dataset Imbalance Ratio

In [None]:
import ast

In [None]:
from collections import Counter

def imbalance_ratio(df, label_column):
    """Return the majority-to-minority class count ratio of a label column.

    Args:
        df: pandas DataFrame holding the labels.
        label_column: Name of the column in ``df`` whose values are the
            class labels.

    Returns:
        float: Number of rows of the most frequent label divided by the
        number of rows of the least frequent label (1.0 means perfectly
        balanced).

    Raises:
        ValueError: If the column contains no non-missing label.
    """
    # value_counts() ignores missing labels. Counting with a Counter made
    # every NaN yielded by a float column its own "class" of size 1, which
    # silently inflated the ratio.
    class_counts = df[label_column].value_counts()
    # Categorical columns also report unused categories with a count of 0;
    # only classes that actually occur take part in the ratio.
    class_counts = class_counts[class_counts > 0]
    if class_counts.empty:
        raise ValueError(
            f"cannot compute an imbalance ratio: column {label_column!r} has no labels"
        )
    return float(class_counts.max() / class_counts.min())

In [None]:
# Majority/minority class ratios per dataset for three views of the data: the
# raw training split, and the train and test splits returned by
# load_nlp_tasks_dataset. The label counts of the raw training split are kept too.
imbalance_ratios = {}
test_imbalance_ratios = {}
raw_imbalance_ratios = {}
summary_count_values = {}

for dataset_name in set(generator.dataset_list) - {"ClimaINS_ours", "climateFEVER_evidence_climabench"}:
    print(dataset_name)
    raw_train, _, _ = generator.loading_raw_datasets(dataset_name)
    train, test, _, _ = generator.load_nlp_tasks_dataset(dataset_name)

    if dataset_name in ("logicClimate", "lobbymap_query"):
        if dataset_name == "logicClimate":
            # Labels are stored as Python-literal strings for this dataset and
            # are parsed in place before being flattened.
            for frame in (raw_train, test, train):
                frame['label'] = frame['label'].apply(ast.literal_eval)
        # explode() turns every list-like label cell into one row per label.
        raw_train, test, train = (
            frame[['label']].explode('label') for frame in (raw_train, test, train)
        )

    label_col = generator.args[dataset_name]['label_columns']
    raw_imbalance_ratios[dataset_name] = imbalance_ratio(raw_train, label_col)
    imbalance_ratios[dataset_name] = imbalance_ratio(train, label_col)
    test_imbalance_ratios[dataset_name] = imbalance_ratio(test, label_col)
    summary_count_values[dataset_name] = raw_train[label_col].value_counts()
    

In [None]:
# Order the datasets by ascending imbalance ratio of the raw training split.
sorted_by_values = dict(sorted(raw_imbalance_ratios.items(), key=lambda item: item[1]))

In [None]:
# One LaTeX row per dataset:
# name+citation & raw-train ratio & train ratio & test ratio & weighted-loss mark.
for dataset_name in sorted_by_values:
    latex_name = dataset_name.replace("_", "\\_").replace("&", "\\&")
    citation = info_df[info_df["name"] == dataset_name]["source"].values[0].replace("citet", "cite")
    ratio_cells = [
        str(np.round(ratios[dataset_name], 1))
        for ratios in (raw_imbalance_ratios, imbalance_ratios, test_imbalance_ratios)
    ]
    # The last column stays empty unless the dataset is configured with a weighted loss.
    weighted_mark = "\\cmark" if generator.args[dataset_name]["weighted_loss"] else ""
    line = latex_name + citation + " & " + " & ".join(ratio_cells) + " & " + weighted_mark + " \\\\"
    print(line)

# Dataset Language and Noise Ratio

In [None]:
# NOTE(review): `dataset_name` is whatever value a previously executed loop left
# behind, so this cell depends on execution order. The loop in the following cell
# reloads the splits for every dataset and overwrites these three names.
train, test, dev = generator.loading_raw_datasets(dataset_name)

In [None]:
# LaTeX table describing the raw training split of every dataset: share of rows
# whose `language` tag is not 'en', and shares of rows whose `gibberish` tag is
# not 'clean', is 'noise', or is 'word salad'.
# The header ends with the LaTeX row break `\\`, like every data row below.
print("dataset & non-english & non-clean & noise & word salad \\\\")

for dataset_name in set(generator.dataset_list)-set(["ClimaINS_ours", "climateFEVER_evidence_climabench"]):
    train, test, dev = generator.loading_raw_datasets(dataset_name)

    # Count each column once. value_counts() skips missing values, so the
    # totals only cover rows that actually carry a tag.
    language_counts = train["language"].value_counts()
    gibberish_counts = train["gibberish"].value_counts()
    language_total = language_counts.sum()
    gibberish_total = gibberish_counts.sum()

    # .get(tag, 0): a dataset without a single row of a given tag contributes 0
    # instead of raising KeyError (previously only 'noise' and 'word salad'
    # were guarded this way).
    percentages = [
        100 * (language_total - language_counts.get('en', 0)) / language_total,
        100 * (gibberish_total - gibberish_counts.get('clean', 0)) / gibberish_total,
        100 * gibberish_counts.get('noise', 0) / gibberish_total,
        100 * gibberish_counts.get('word salad', 0) / gibberish_total,
    ]

    line = dataset_name.replace("_", "\\_").replace("&", "\\&")  + info_df[info_df["name"]==dataset_name]["source"].values[0].replace("citet", "cite") + " & "
    line += " \\%  & ".join(str(np.round(value, 2)) for value in percentages) + " \\%  \\\\"
    print(line)

# Duplicates and contamination

In [None]:
# NOTE(review): relies on the `dataset_name` left over from a previously executed
# loop; the duplicates loop further down rebinds train/test/dev for every dataset.
train, test, dev = generator.loading_raw_datasets(dataset_name)

In [None]:
# Display which dataset the leftover loop variable currently points at.
dataset_name

In [None]:
# LaTeX table of duplicated rows across train+test+dev of the archived cleaned
# datasets: duplicated raw text, duplicated cleaned text, and duplicated
# (cleaned text, label) pairs, followed by the total number of rows.
print("dataset & text duplicates & cleaned text duplicates & exact duplicates & Dataset Size \\\\")

for dataset_name in set(generator.dataset_list) - {"ClimaINS_ours", "climateFEVER_climabench", "lobbymap_query"}:
    split_dir = "data/cleaned_datasets_archive/" + dataset_name
    train = pd.read_parquet(split_dir + "/train.pkl")
    test = pd.read_parquet(split_dir + "/test.pkl")
    dev = pd.read_parquet(split_dir + "/dev.pkl")

    full_dataset = pd.concat([train, test, dev], ignore_index=True)

    # This dataset is looked up in the metadata sheet under a shorter name.
    if dataset_name == "climateFEVER_evidence_climabench":
        dataset_name = "climateFEVER_evidence"

    # keep=False flags every member of a duplicate group, not only the repeats.
    duplicate_counts = [
        int(full_dataset.duplicated(subset=columns, keep=False).sum())
        for columns in (["text"], ["clean_text"], ["clean_text", "label"])
    ]

    latex_name = dataset_name.replace("_", "\\_").replace("&", "\\&")
    citation = info_df[info_df["name"] == dataset_name]["source"].values[0].replace("citet", "cite")
    print(
        latex_name + citation, "&",
        duplicate_counts[0], "&",
        duplicate_counts[1], "&",
        duplicate_counts[2], "&",
        len(full_dataset),
        "\\\\"
    )

In [None]:

    # NOTE(review): detached copy of the row-building code of the language/noise
    # table. It uses whatever `dataset_name` and `train` the previously executed
    # cells left behind.

    # Count each column once. value_counts() skips missing values, so the
    # totals only cover rows that actually carry a tag.
    language_counts = train["language"].value_counts()
    gibberish_counts = train["gibberish"].value_counts()
    language_total = language_counts.sum()
    gibberish_total = gibberish_counts.sum()

    # .get(tag, 0): a missing tag contributes 0 instead of raising KeyError
    # (previously only 'noise' and 'word salad' were guarded this way).
    percentages = [
        100 * (language_total - language_counts.get('en', 0)) / language_total,
        100 * (gibberish_total - gibberish_counts.get('clean', 0)) / gibberish_total,
        100 * gibberish_counts.get('noise', 0) / gibberish_total,
        100 * gibberish_counts.get('word salad', 0) / gibberish_total,
    ]

    line = dataset_name.replace("_", "\\_").replace("&", "\\&")  + info_df[info_df["name"]==dataset_name]["source"].values[0].replace("citet", "cite") + " & "
    line += " \\%  & ".join(str(np.round(value, 2)) for value in percentages) + " \\%  \\\\"
    print(line)

In [None]:
# Inspect the raw climatext training rows whose `gibberish` tag is "noise".
train, test, dev = generator.loading_raw_datasets("climatext")

train.loc[train["gibberish"] == "noise"]

In [None]:
import os
import pandas as pd
from experiment import load_dataset

path = os.path.join(os.getcwd(), "experiment_results", "cartography", 'distilRoBERTa')

# Names of the tab-separated (.tsv) files present in the cartography directory.
tsv_names = {name for name in os.listdir(path) if name.endswith('.tsv')}

# Keep, in dataset_list order, the datasets that have a
# "<dataset>_train_42.tsv" file, as (dataset name, file path) pairs.
csv_files = []
for dataset_name in generator.dataset_list:
    expected_name = f"{dataset_name}_train_42.tsv"
    if expected_name in tsv_names:
        csv_files.append((dataset_name, os.path.join(path, expected_name)))

# Print the number of rows of each cartography file that was found.
for dataset_name, file in csv_files:
    print("Processing file: ", dataset_name)

    # The training split is loaded alongside; only the cartography table is measured here.
    train, _, _ = load_dataset(dataset_name)

    carto_df = pd.read_csv(file, sep="\t")

    print(len(carto_df))

In [None]:
# Print the number of rows of the training split that load_nlp_tasks_dataset
# returns for every dataset in the generator's list.
for dataset_name in generator.dataset_list:
    splits = generator.load_nlp_tasks_dataset(dataset_name)
    train, test, dev, _ = splits
    print("Processing file: ", dataset_name)
    print(len(train))

In [None]:
# LaTeX table of duplicated rows across train+test+dev of the datasets stored
# under data/green_nlp_tasks: duplicated text, duplicated (text, label) pairs,
# followed by the total number of rows.
print("dataset & text duplicates & exact duplicates & Dataset Size \\\\")

for dataset_name in set(generator.dataset_list) - {"ClimaINS_ours", "climateFEVER_climabench", "lobbymap_query"}:
    train, test, dev = (
        pd.read_parquet("data/green_nlp_tasks/" + dataset_name + "/" + split + ".pkl")
        for split in ("train", "test", "dev")
    )

    full_dataset = pd.concat([train, test, dev], ignore_index=True)

    # This dataset is looked up in the metadata sheet under a shorter name.
    if dataset_name == "climateFEVER_evidence_climabench":
        dataset_name = "climateFEVER_evidence"

    # keep=False flags every member of a duplicate group, not only the repeats.
    text_duplicates = int(full_dataset.duplicated(subset=["text"], keep=False).sum())
    exact_duplicates = int(full_dataset.duplicated(subset=["text", "label"], keep=False).sum())

    latex_name = dataset_name.replace("_", "\\_").replace("&", "\\&")
    citation = info_df[info_df["name"] == dataset_name]["source"].values[0].replace("citet", "cite")
    print(
        latex_name + citation, "&",
        text_duplicates, "&",
        exact_duplicates, "&",
        len(full_dataset),
        "\\\\"
    )