### This file creates the trivial baselines (human upper bound, majority, random) reported in Table 4 of the paper

In [None]:
import argparse
import os
import random
import warnings
import json
import wandb
import shutil
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from pathlib import Path
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

In [None]:
# Names of the annotated dimensions. Each name is used both as a column name in
# the corpus/annotator dataframes and as part of the keys written to the result
# files ('eval_<name>_precision', 'eval_<name>_recall', 'eval_<name>_macroF1').
DIMS = [
    "Inappropriateness",
    "Toxic Emotions",
    "Excessive Intensity",
    "Emotional Deception",
    "Missing Commitment",
    "Missing Seriousness",
    "Missing Openness",
    "Missing Intelligibility",
    "Unclear Meaning",
    "Missing Relevance",
    "Confusing Reasoning",
    "Other Reasons",
    "Detrimental Orthography",
    "Reason Unclassified",
]

In [None]:
# Root of the data directory. It must end with '/' because every path below is
# built by plain string concatenation (data_dir + 'subdir/file').
data_dir = "../../data/"

In [None]:
# Corpus table. The cells below read its 'post_id' column, its
# 'fold<repeat>.<k>' split columns and one label column per entry of DIMS.
df = pd.read_csv(
    '{}appropriateness-corpus/appropriateness_corpus_conservative_w_folds.csv'.format(data_dir)
)

#### Human upper bound

In [None]:
# Human upper bound: score the labels of annotators 6, 7 and 8 against the
# corpus labels in `df` on the TEST rows of every fold column, average the
# three annotators' scores and write one test_results.json per fold.

annotator_ids = [6,7,8]
# The annotator files do not depend on the fold, so read each of them once
# instead of once per (repeat, k) combination.
annotator_dfs = [
    pd.read_csv(data_dir+'appropriateness-corpus/appropriateness_corpus_annotator{}.csv'.format(i))
    for i in annotator_ids
]

for repeat in range(5):
    for k in range(5):
        fold_col = 'fold{}.{}'.format(repeat,k)
        # post_id -> split name for the current fold column of the corpus.
        split_dict = dict(zip(df['post_id'].tolist(), df[fold_col].tolist()))
        gold_test = df[df[fold_col]=='TEST']
        gold_ids = gold_test['post_id'].tolist()

        out_dicts = []
        for ann_id, df_ann in zip(annotator_ids, annotator_dfs):
            # Plain dict lookup on purpose: an annotator row whose post_id is
            # not in the corpus raises KeyError instead of being dropped.
            is_test = df_ann['post_id'].apply(lambda x: split_dict[x])=='TEST'
            ann_test = df_ann[is_test]

            # Labels and predictions are paired purely by row position, so both
            # tables must list the same posts in the same order. Fail loudly
            # rather than score mismatched rows.
            if ann_test['post_id'].tolist() != gold_ids:
                raise ValueError(
                    'TEST rows of annotator {} are not aligned with the corpus '
                    'for {} (post_id sequence differs)'.format(ann_id, fold_col)
                )

            out_dict = {}
            prec = 0
            rec = 0
            macroF1 = 0
            for dim in DIMS:
                # Corpus labels act as ground truth, annotator labels as predictions.
                scores = precision_recall_fscore_support(
                    gold_test[dim].tolist(),
                    ann_test[dim].tolist(),
                    average='macro',
                )
                prec += scores[0]
                rec += scores[1]
                macroF1 += scores[2]
                out_dict['eval_'+dim+'_precision'] = scores[0]
                out_dict['eval_'+dim+'_recall'] = scores[1]
                out_dict['eval_'+dim+'_macroF1'] = scores[2]
            # Unweighted mean over all dimensions.
            out_dict['eval_mean_precision'] = prec/len(DIMS)
            out_dict['eval_mean_recall'] = rec/len(DIMS)
            out_dict['eval_mean_F1'] = macroF1/len(DIMS)
            out_dicts.append(out_dict)

        # Average every metric over the annotators.
        d = {
            key: float(np.mean([result[key] for result in out_dicts]))
            for key in out_dicts[0]
        }

        out_dir = data_dir+'models/human-upperbound/fold{}.{}'.format(repeat,k)
        # makedirs also creates missing parent directories and tolerates an
        # already existing target, unlike a bare os.mkdir.
        os.makedirs(out_dir, exist_ok=True)
        with open(out_dir+'/test_results.json', 'w') as f:
            json.dump(d, f)

#### Majority baseline

In [None]:
# Majority baseline: for every fold column and every dimension, predict the
# single most frequent label for all TEST rows, score that prediction and
# write one test_results.json per fold.
for repeat in range(5):
    for k in range(5):
        fold_col = 'fold{}.{}'.format(repeat,k)
        split_dict = {x: y for x, y in zip(df['post_id'].tolist(), df[fold_col].tolist())}
        # Kept from the original cell: `df` gets a 'fold' column holding the
        # split of the fold currently being processed.
        df['fold'] = df['post_id'].apply(lambda x: split_dict[x])
        # The TEST rows are the same for every dimension, so filter once.
        test_rows = df[df['fold']=='TEST']

        out_dict = {}
        prec = 0
        rec = 0
        macroF1 = 0
        for dim in DIMS:
            labels = test_rows[dim].tolist()
            # NOTE(review): the majority class is determined from the TEST
            # labels themselves, not from the training rows - confirm that this
            # is the intended definition of the baseline.
            most_common = max(set(labels), key=labels.count)
            scores = precision_recall_fscore_support(
                labels,
                [most_common]*len(labels),
                average='macro',
            )
            prec += scores[0]
            rec += scores[1]
            macroF1 += scores[2]
            out_dict['eval_'+dim+'_precision'] = scores[0]
            out_dict['eval_'+dim+'_recall'] = scores[1]
            out_dict['eval_'+dim+'_macroF1'] = scores[2]
        # Unweighted mean over all dimensions.
        out_dict['eval_mean_precision'] = prec/len(DIMS)
        out_dict['eval_mean_recall'] = rec/len(DIMS)
        out_dict['eval_mean_F1'] = macroF1/len(DIMS)

        out_dir = data_dir+'models/majority-baseline/fold{}.{}'.format(repeat,k)
        # makedirs also creates missing parent directories and tolerates an
        # already existing target, unlike a bare os.mkdir.
        os.makedirs(out_dir, exist_ok=True)
        with open(out_dir+'/test_results.json', 'w') as f:
            json.dump(out_dict, f)

#### Random baseline

In [None]:
# Random baseline: for every fold column and every dimension, predict a label
# drawn uniformly at random for each TEST row, score that prediction and write
# one test_results.json per fold.
# NOTE(review): no seed is set in this cell, so the written scores change from
# run to run unless NumPy's global RNG is seeded elsewhere.
for repeat in range(5):
    for k in range(5):
        fold_col = 'fold{}.{}'.format(repeat,k)
        split_dict = {x: y for x, y in zip(df['post_id'].tolist(), df[fold_col].tolist())}
        # Kept from the original cell: `df` gets a 'fold' column holding the
        # split of the fold currently being processed.
        df['fold'] = df['post_id'].apply(lambda x: split_dict[x])
        # The TEST rows are the same for every dimension, so filter once.
        test_rows = df[df['fold']=='TEST']

        out_dict = {}
        prec = 0
        rec = 0
        macroF1 = 0
        for dim in DIMS:
            labels = test_rows[dim]
            # Draw from the label values that occur in the corpus column rather
            # than from range(number of distinct TEST labels): the latter is
            # only correct when the labels are exactly 0..n-1 and every class
            # happens to appear in the TEST rows.
            classes = df[dim].dropna().unique()
            predictions = np.random.choice(classes, size=len(labels))
            scores = precision_recall_fscore_support(labels, predictions, average='macro')
            prec += scores[0]
            rec += scores[1]
            macroF1 += scores[2]
            out_dict['eval_'+dim+'_precision'] = scores[0]
            out_dict['eval_'+dim+'_recall'] = scores[1]
            out_dict['eval_'+dim+'_macroF1'] = scores[2]
        # Unweighted mean over all dimensions.
        out_dict['eval_mean_precision'] = prec/len(DIMS)
        out_dict['eval_mean_recall'] = rec/len(DIMS)
        out_dict['eval_mean_F1'] = macroF1/len(DIMS)

        out_dir = data_dir+'models/random-baseline/fold{}.{}'.format(repeat,k)
        # makedirs also creates missing parent directories and tolerates an
        # already existing target, unlike a bare os.mkdir.
        os.makedirs(out_dir, exist_ok=True)
        with open(out_dir+'/test_results.json', 'w') as f:
            json.dump(out_dict, f)