In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [17]:
def _simulate_baseline_metrics(train_labels, test_labels, n_runs, rng):
    """Average the metrics of two random baselines over ``n_runs`` simulations.

    Args:
        train_labels: Labels of the training split; they define the label set
            and the label frequencies.
        test_labels: Gold labels that the random predictions are scored against.
        n_runs: Number of independent simulations to average over.
        rng: Object exposing a NumPy-style ``choice`` (the ``np.random`` module
            or an ``np.random.RandomState``).

    Returns:
        Nested dict ``{"random" | "freq": {metric name: mean score}}``.
    """
    # Everything below is loop-invariant, so it is computed once up front.
    label_counts = Counter(train_labels)
    labels = list(label_counts.keys())
    counts = list(label_counts.values())
    total = sum(counts)
    prob_labels = [count / total for count in counts]
    n_test = len(test_labels)

    # Precision, recall and F1 are averaged per class, weighted by class support.
    metrics = {
        "Accuracy": accuracy_score,
        "Precision": lambda gold, pred: precision_score(gold, pred, average="weighted"),
        "Recall": lambda gold, pred: recall_score(gold, pred, average="weighted"),
        "F1": lambda gold, pred: f1_score(gold, pred, average="weighted"),
    }
    scores = {kind: {name: [] for name in metrics} for kind in ("random", "freq")}

    for _ in range(n_runs):
        # Draw order (uniform first, then frequency-weighted) is kept fixed so
        # that a given RNG state always yields the same pair of predictions.
        predictions = {
            "random": rng.choice(labels, size=n_test),
            "freq": rng.choice(labels, size=n_test, p=prob_labels),
        }
        for kind, predicted in predictions.items():
            for name, metric in metrics.items():
                scores[kind][name].append(metric(test_labels, predicted))

    return {
        kind: {name: np.mean(values) for name, values in per_metric.items()}
        for kind, per_metric in scores.items()
    }


def get_random_and_freq_baseline(dataset: str, test: str, n_runs: int = 1000, seed=None):
    """Print mean scores of a uniform-random and a label-frequency baseline.

    Reads ``{dataset}/train.csv`` and ``{dataset}/{test}``; both files must have
    a ``label`` column. Two baselines are simulated ``n_runs`` times each:

    * "random": every test row gets a label drawn uniformly from the labels
      that occur in the training split;
    * "freq": every test row gets a label drawn with probability equal to that
      label's relative frequency in the training split.

    The mean accuracy, weighted precision, weighted recall and weighted F1 of
    each baseline are printed, followed by a separator line.

    Args:
        dataset: Directory that contains the CSV files.
        test: File name of the test split inside ``dataset``.
        n_runs: Number of simulations to average over (default 1000).
        seed: Optional seed. When given, a private ``np.random.RandomState`` is
            used so the printed numbers are reproducible; when ``None`` the
            global NumPy random state is used.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    train_df = pd.read_csv(f"{dataset}/train.csv")
    test_df = pd.read_csv(f"{dataset}/{test}")

    # Fall back to the global NumPy state so np.random.seed() set by the caller
    # keeps working when no explicit seed is passed.
    rng = np.random if seed is None else np.random.RandomState(seed)

    means = _simulate_baseline_metrics(
        train_df["label"].values, test_df["label"].values, n_runs, rng
    )

    for kind in ("random", "freq"):
        for name in ("Accuracy", "Precision", "Recall", "F1"):
            print(f"{name} {kind}: {means[kind][name]}")
    print('----' * 10)

In [20]:
# Each dataset is scored on its regular test split; every dataset except
# "bigbench" is additionally scored on "climate_test.csv" under the heading
# "climate".
for dataset in ("bigbench", "coarsegrained", "finegrained"):
    runs = [(dataset, "test.csv")]
    if dataset != "bigbench":
        runs.append(('climate', "climate_test.csv"))

    for heading, test_file in runs:
        print(heading)
        get_random_and_freq_baseline(dataset, test_file)

    # Separator between datasets.
    print('---' * 10)


bigbench
Accuracy random: 0.49970238095238095
Precision random: 0.5018756395795523
Recall random: 0.49970238095238095
F1 random: 0.4999438146148811
Accuracy freq: 0.5013571428571428
Precision freq: 0.5013693745253403
Recall freq: 0.5013571428571428
F1 freq: 0.5010694117475375
----------------------------------------
------------------------------
coarsegrained
Accuracy random: 0.24942105263157896
Precision random: 0.4135434995353205
Recall random: 0.24942105263157896
F1 random: 0.2986321743579108
Accuracy freq: 0.41536842105263155
Precision freq: 0.41346144306844373
Recall freq: 0.41536842105263155
F1 freq: 0.41375048085621596
----------------------------------------
climate
Accuracy random: 0.24927329192546582
Precision random: 0.5080008544018725
Recall random: 0.24927329192546582
F1 random: 0.32351433939685575
Accuracy freq: 0.4464223602484473
Precision freq: 0.5088130521486752
Recall freq: 0.4464223602484473
F1 freq: 0.4680894153139386
----------------------------------------
------