In [None]:
import os
import ast
import pandas as pd
import seaborn as sns
# Global plot styling: seaborn defaults with the "paper" context and the
# "whitegrid" axes style, configured in a single call.
sns.set_theme(context="paper", style="whitegrid")

# 
# Datasets, grouped by the number of classes of the task.
bin_data_names = [
    'twitter-hate-speech-tsa',
    'civil-comments',
    'gibert-2018-shs',
    'us-election-2020',
    'cmsb-tsd',
]
multi_class_data_names = ['founta-2018-thas', 'davidson-thon', 'ami']

# Toggle: when True, only the civil-comments* datasets and a reduced list of
# variants are evaluated, and the output files get a "_cc" suffix.
cc = True

if cc:
    eval_data_names = [
        'civil-comments',
        'civil-comments-5k-7p5',
        'civil-comments-20k',
        'civil-comments-40k',
    ]
    check_variants = [
        'augmentation_abusive_lexicon',
        'augmentation_external_data_preprocessing',
        'baseline',
        'fl',
        'sampling_modifiedRS_oversampling',
        'sampling_modifiedRS_undersampling',
        'th',
    ]
else:
    eval_data_names = bin_data_names + multi_class_data_names
    check_variants = [
        "baseline",
        "sampling_modifiedRS_oversampling",
        "sampling_modifiedRS_undersampling",
        "sampling_weightedRS_combi",
        "augmentation_bert",
        "th",
        "wce",
        "fl",
        "wfl",
        "augmentation_external_data_preprocessing",
        "augmentation_abusive_lexicon",
    ]

# Column groups of the result tables.
data_cols = ["data_name", "num_classes", "data_type", "size", "rho", "distribution"]
variant_cols = [
    'variant',
    'sampling_modifiedRS_rho',
    'sampling_weightedRS_percentage',
    'augmentation_rho',
    'augmentation_percentage',
    'augmentation_top_k',
    'wce_alpha',
    'fl_gamma',
    "augmentation_bert_top_k",
]
metric_suffices = (
    ['f1_macro']
    + [f'f1_per_label_{label_id}' for label_id in range(5)]
    + ['accuracy', 'precision_macro', 'precision_weighted', 'recall_macro', 'recall_weighted', 'auprc']
)
# Every metric exists once per split, in the order train -> val -> test.
metrics_cols = [f'{split}_{suffix}' for split in ('train', 'val', 'test') for suffix in metric_suffices]

# Columns of the raw per-run table.
cols_raw = [*data_cols, *variant_cols, 'pl_seed', *metrics_cols, "best_epoch", 'mlflow_run_id']

# Columns of the seed-averaged table: the three headline columns come first,
# followed by every remaining metric (the last variant column is left out).
cols_seeds_mean = [
    *data_cols,
    *variant_cols[:-1],
    'val_f1_macro',
    'test_f1_macro',
    'test_f1_macro_std',
    *(
        metric
        for metric in metrics_cols
        if metric not in ('val_f1_macro', 'test_f1_macro', 'test_f1_macro_std')
    ),
]

# Output naming.
results_name = "results_cc" if cc else "results"
results_excel_path = f"results/{results_name}_all.xlsx"

# Columns of the compact "best per variant" table.
cols_results = [*data_cols, *variant_cols[:-1], 'test_f1_macro', 'test_f1_macro_std']

In [None]:
# Per-dataset statistics, keyed by dataset name; filled in by
# get_data_info_by_data_name().
data2distribution = dict()  # {"<label>_<category>": share of all examples}
data2size = dict()          # total number of labelled examples
data2rho = dict()           # largest class count / smallest class count

def read_from_csv(data_name, header=0, names=None):
    """Load a delimited text file into a DataFrame.

    The delimiter is derived from the extensions of the *file name* only
    (``.tsv`` -> tab, ``.csv`` -> comma). Directory names are ignored, so a
    path such as ``tsv_exports/data.csv`` is read as comma-separated.
    Compound names such as ``data.csv.gz`` are recognised as well.

    Args:
        data_name: Path of the file to read.
        header: Row number used as header, forwarded to ``pd.read_csv``.
        names: Optional column names, forwarded to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        NotImplementedError: If the file name has neither a ``tsv`` nor a
            ``csv`` extension.
    """
    # Everything after the first dot of the base name, e.g. "d.csv.gz" -> ["csv", "gz"].
    extensions = os.path.basename(str(data_name)).lower().split(".")[1:]
    if "tsv" in extensions:
        separator = "\t"
    elif "csv" in extensions:
        separator = ","
    else:
        raise NotImplementedError("Given data file type is not supported yet.")
    return pd.read_csv(data_name,
                       sep=separator,
                       encoding="utf-8",
                       engine="python",
                       header=header,
                       names=names)
def get_data_info_by_data_name(data_name, label_col, category2label, train_filename, val_filename=None, test_filename=None):
    """Compute size, class distribution and imbalance ratio of one dataset.

    Label counts are summed over the train file and, when given, the
    validation and test files (all read from ``./data/<data_name>/``). The
    results are stored in the module-level dictionaries ``data2size``,
    ``data2distribution`` and ``data2rho`` under the key ``data_name``.

    Args:
        data_name: Dataset directory name, also used as the dictionary key.
        label_col: Name of the column holding the class labels.
        category2label: Mapping from category name to label value.
        train_filename: File name of the training (or only) split.
        val_filename: Optional file name of the validation split.
        test_filename: Optional file name of the test split.

    Returns:
        None. The three module-level dictionaries are updated in place.
    """
    label_counts = read_from_csv(f"./data/{data_name}/{train_filename}")[label_col].value_counts()
    for extra_filename in (val_filename, test_filename):
        if not extra_filename:
            continue
        extra_counts = read_from_csv(f"./data/{data_name}/{extra_filename}")[label_col].value_counts()
        # Series addition aligns on the label index: a label missing from one
        # split must count as 0 there instead of turning the sum into NaN.
        label_counts = label_counts.add(extra_counts, fill_value=0)
    # fill_value arithmetic may upcast to float; counts are integers.
    label_counts = label_counts.astype(int)

    total = label_counts.sum()
    data2size[data_name] = total
    data2distribution[data_name] = {
        # A category that never occurs in the data gets a share of 0.
        f"{label}_{cat}": round(label_counts.get(label, 0) / total, 3)
        for cat, label in category2label.items()
    }
    data2rho[data_name] = round(label_counts.max() / label_counts.min(), 2)

# Arguments of every get_data_info_by_data_name() call, as
# (positional arguments, keyword arguments) pairs, processed in this order.
_dataset_specs = [
    (("twitter-hate-speech-tsa", "label", {"non-hate": 0, "hate": 1}, "data_clean.csv"), {}),
    (("civil-comments", "label", {"non-toxic": 0, "toxic": 1}, "data_clean.csv"), {}),
    (("civil-comments-5k-7p5", "label", {"non-toxic": 0, "toxic": 1}, "data_clean.csv"), {}),
    (("civil-comments-20k", "label", {"non-toxic": 0, "toxic": 1}, "data_clean.csv"), {}),
    (("civil-comments-40k", "label", {"non-toxic": 0, "toxic": 1}, "data_clean.csv"), {}),
    (("gibert-2018-shs", "label", {"no-hate": 0, "hate": 1}, "data_clean.csv"), {}),
    (("us-election-2020", "label", {"non-HOF": 0, "HOF": 1}, "train_clean.csv"),
     {"test_filename": "test_clean.csv"}),
    (("cmsb-tsd", "label", {"non-sexist": 0, "sexist": 1}, "data_clean.csv"), {}),
    (("waseem-and-hovy-2016", "label", {"neither": 0, "racism/sexism": 1}, "data_clean.csv"), {}),
    (("founta-2018-thas", "label_multi", {"normal": 0, "spam": 1, "abusive": 2, "hateful": 3},
      "data_clean.csv"), {}),
    (("ami", "label_multi",
      {"discredit": 0, "stereotype": 1, "dominance": 2, "sexual harassment": 3, "derailing": 4},
      "train_clean.csv"),
     {"test_filename": "test_clean.csv"}),
    (("davidson-thon", "label_multi", {"hate speech": 0, "offensive language": 1, "neither": 2},
      "data_clean.csv"), {}),
]
for _spec_args, _spec_kwargs in _dataset_specs:
    get_data_info_by_data_name(*_spec_args, **_spec_kwargs)


# Dataset directory name -> name shown in printed output.
# ('civil-comments' could alternatively be displayed as 'Civil-Comments'.)
data_name_orig2display = dict([
    ('twitter-hate-speech-tsa', 'Twitter-Hate-Speech'),
    ('civil-comments', 'CC-5k-rho=11.5'),
    ('civil-comments-20k', 'CC-20k-rho=11.5'),
    ('civil-comments-40k', 'CC-40k-rho=11.5'),
    ('civil-comments-5k-7p5', 'CC-5k-rho=7.5'),
    ('gibert-2018-shs', 'Gibert-2018'),
    ('us-election-2020', 'US-Election-2020'),
    ('cmsb-tsd', 'CMSB'),
    ('waseem-and-hovy-2016', 'Waseem-and-Hovy-2016'),
    ('founta-2018-thas', 'Founta-2018'),
    ('davidson-thon', 'Davidson-2017'),
    ('ami', 'AMI-2018'),
])

# One tab-separated line per evaluated dataset: display name, size, distribution, rho.
for data_name in eval_data_names:
    print(
        data_name_orig2display[data_name],
        data2size[data_name],
        data2distribution[data_name],
        data2rho[data_name],
        sep="\t",
    )

In [None]:
def get_value_by_run_id(data_dir, run_id, key):
    """Look up one logged value of a run.

    The directory tree below ``data_dir + run_id`` is searched for a file
    named ``key``; the first match is used. The value is taken from the last
    non-empty line of that file, with surrounding whitespace removed.

    Processing of the raw text:
      * For keys containing "train", "val" or "test", the second
        whitespace-separated field of the line is used (presumably the
        MLflow "timestamp value step" layout; confirm against the logs).
      * "sampling_modifiedRS" / "sampling_weightedRS" get the suffix that was
        not specified when the variant was logged.
      * Anything containing "tensor" is replaced by "-".
      * The text is parsed with ``ast.literal_eval`` when possible; a parsed
        list becomes a tuple of values rounded to 2 decimals.

    Args:
        data_dir: Directory containing the run directories (with trailing separator).
        run_id: Name of the run directory.
        key: File name of the value to look up.

    Returns:
        The parsed value. If nothing usable was found: 0 for "per_label"
        train/val/test keys and "-" for keys that are not train/val/test keys.

    Raises:
        RuntimeError: If a train/val/test key without "per_label" in its
            name cannot be found.
    """
    is_split_metric = "train" in key or "val" in key or "test" in key
    value = None
    for root, _, files in os.walk(data_dir + run_id):
        if key not in files:
            continue
        with open(f"{root}/{key}", "r") as f:
            content_lines = [line.strip() for line in f]
        content_lines = [line for line in content_lines if line]
        # An empty file is treated like a missing one instead of crashing.
        if content_lines:
            value = content_lines[-1]
            if is_split_metric:
                value = value.split()[1]
            if value == "sampling_modifiedRS":  # Forgot to specify in the variant value
                value += "_oversampling"
            if value == "sampling_weightedRS":
                value += "_combi"
            if "tensor" in value:
                value = "-"
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                # Not a Python literal: keep the plain string.
                pass
            else:
                if isinstance(value, list):
                    value = tuple(round(v, 2) for v in value)
        break
    if value is None:
        value = "-"
        if is_split_metric:
            if "per_label" in key:
                value = 0
            else:
                raise RuntimeError(f"This run log {data_dir + run_id} does not have test metrics. ")
    return value

def get_best_epoch_by_run_id(data_dir, run_id):
    """Read the epoch number from a run's model checkpoint file name.

    Checkpoints are looked up in
    ``<data_dir><run_id>/artifacts/model_checkpoints/``. Among the entries
    whose name contains "epoch=", the alphabetically first one is used, and
    the whole run of digits following "epoch=" is parsed, so "epoch=03" gives
    3 and "epoch=12" gives 12.

    Args:
        data_dir: Directory containing the run directories (with trailing separator).
        run_id: Name of the run directory.

    Returns:
        The epoch encoded in the checkpoint file name.

    Raises:
        RuntimeError: If the directory holds no entry containing "epoch=".
    """
    ckpt_dir = f"{data_dir}{run_id}/artifacts/model_checkpoints/"
    # Sorted so the choice is deterministic; entries without the marker are skipped.
    ckpt_names = sorted(name for name in os.listdir(ckpt_dir) if "epoch=" in name)
    if not ckpt_names:
        raise RuntimeError(f"No model checkpoint with an 'epoch=' marker found in {ckpt_dir}")
    after_marker = ckpt_names[0].split("epoch=")[1]
    digits = after_marker[:len(after_marker) - len(after_marker.lstrip("0123456789"))]
    if digits:
        return int(digits)
    # No digit directly after the marker: fall back to the former behaviour
    # of evaluating the second character after "epoch=".
    return ast.literal_eval(after_marker[1:2])

def get_log_by_data_name(data_name, logs_root='/mounts/data/proj/zhangyaq/imbalanced_text_classification_logs/'):
    """Collect the logged values of all runs of one dataset into a DataFrame.

    Every sub-directory of ``<logs_root>/<data_name>/`` is treated as one run.
    For each run, all columns of ``cols_raw`` except the last two are read
    with ``get_value_by_run_id``; ``best_epoch`` comes from the checkpoint
    file name and ``mlflow_run_id`` is the run directory name.

    Post-processing:
      * runs logged as variant "wfl" whose ``wce_alpha`` is "-" are relabelled "fl";
      * ``augmentation_bert_top_k`` is merged into ``augmentation_top_k``
        (used only where the latter is "-") and then dropped.

    Args:
        data_name: Dataset name, i.e. the sub-directory of ``logs_root``.
        logs_root: Directory holding one sub-directory per dataset. Defaults
            to the original log location.

    Returns:
        A DataFrame with one row per run.
    """
    # Joining with "" keeps the trailing separator the helper functions rely on.
    data_runs_dir = os.path.join(logs_root, data_name, "")
    # Only directories are runs; sorting makes the row order reproducible.
    run_ids = sorted(
        entry for entry in os.listdir(data_runs_dir)
        if os.path.isdir(os.path.join(data_runs_dir, entry))
    )
    rows = []
    for run_id in run_ids:
        run_result = {col_name: get_value_by_run_id(data_runs_dir, run_id, col_name) for col_name in cols_raw[:-2]}
        run_result['best_epoch'] = get_best_epoch_by_run_id(data_runs_dir, run_id)
        run_result['mlflow_run_id'] = run_id
        rows.append(run_result)
    df = pd.DataFrame(columns=cols_raw, data=rows)
    #  - add wfl(alpha=1.0) => fl
    df.loc[(df["variant"] == "wfl") & (df["wce_alpha"] == "-"), "variant"] = "fl"

    #  - combine these two columns
    def fix_aug_top_k(row):
        # Prefer the generic column; fall back to the BERT-specific one
        # (which is "-" itself when neither was logged).
        if row["augmentation_top_k"] != "-":
            return row["augmentation_top_k"]
        return row["augmentation_bert_top_k"]

    df["augmentation_top_k"] = df.apply(fix_aug_top_k, axis=1)
    df = df.drop('augmentation_bert_top_k', axis=1)
    return df

def aggregate_results(data_name, writer):
    """Aggregate the runs of one dataset over seeds and pick the best setting per variant.

    Steps:
      1. Load the raw per-run logs and write them to sheet ``<data_name>_raw``.
      2. Group the runs by the variant columns (without the last one) and
         average every metric; ``test_f1_macro_std`` is the standard
         deviation of ``test_f1_macro`` within the group.
      3. Express metrics as percentages with two decimals.
      4. Overwrite ``size``, ``distribution`` and ``rho`` with the values
         from ``data2size`` / ``data2distribution`` / ``data2rho``.
      5. Write the result to sheet ``<data_name>_seeds-mean``.
      6. Keep only the variants in ``check_variants`` and, per variant, the
         row with the highest ``val_f1_macro``.

    Args:
        data_name: Name of the dataset to aggregate.
        writer: Excel writer that receives the two sheets.

    Returns:
        Tuple ``(df, df_seeds_mean, df_val_f1_max)``: the raw runs, the
        seed-averaged rows restricted to ``check_variants``, and the best
        row per variant.
    """
    # Get raw mlflow logs:
    df = get_log_by_data_name(data_name)
    df.to_excel(writer, sheet_name=f"{data_name}_raw")
    assert len(df) % 3 == 0, f"{data_name}: expected a multiple of 3 runs, found {len(df)}"
    # Aggregate results from 3 seeds
    agg_target = {metric: 'mean' for metric in metrics_cols}
    agg_target.update({col: 'first' for col in data_cols})
    # NOTE: this helper column is also part of the raw frame that is returned.
    df['test_f1_macro_std'] = df.loc[:, 'test_f1_macro']
    agg_target['test_f1_macro_std'] = 'std'
    df_seeds_mean = df.groupby(variant_cols[:-1], as_index=False).agg(agg_target)

    #  - change metric to be 00.00
    def to_percent(x):
        # float() parses the same text as before but also accepts "nan",
        # which is the std of a group holding a single run.
        return float("{0:.2f}".format(x * 100))

    for metric in metrics_cols + ['test_f1_macro_std']:
        df_seeds_mean[metric] = df_seeds_mean[metric].apply(to_percent)

    def convert_str2number(x):
        if isinstance(x, str) and x != "-":
            return ast.literal_eval(x)
        return x

    for var in variant_cols[1:-1]:
        df_seeds_mean[var] = df_seeds_mean[var].apply(convert_str2number)
    df_seeds_mean.loc[:, "size"] = [data2size[data_name]] * len(df_seeds_mean)
    df_seeds_mean.loc[:, "distribution"] = [data2distribution[data_name]] * len(df_seeds_mean)
    df_seeds_mean.loc[:, "rho"] = [data2rho[data_name]] * len(df_seeds_mean)
    #  - re-order the columns
    df_seeds_mean = df_seeds_mean[cols_seeds_mean]
    if data_name == "founta-2018-thas":
        # For this dataset the "_oversampling" runs replace the plain
        # "augmentation_external_data" runs: drop the latter, then rename.
        is_this_data = df_seeds_mean["data_name"] == data_name
        is_plain = df_seeds_mean["variant"] == "augmentation_external_data"
        # .copy() so that the assignment below works on an independent frame.
        df_seeds_mean = df_seeds_mean[~(is_this_data & is_plain)].copy()
        is_oversampled = (
            (df_seeds_mean["data_name"] == data_name)
            & (df_seeds_mean["variant"] == "augmentation_external_data_oversampling")
        )
        df_seeds_mean.loc[is_oversampled, "variant"] = "augmentation_external_data"
    df_seeds_mean.to_excel(writer, sheet_name=f"{data_name}_seeds-mean")
    df_seeds_mean = df_seeds_mean[df_seeds_mean["variant"].isin(check_variants)]
    df_val_f1_max = df_seeds_mean.loc[df_seeds_mean.groupby(["variant"])['val_f1_macro'].idxmax()].reset_index(drop=True)
    return df, df_seeds_mean, df_val_f1_max

In [None]:
# Aggregate every evaluated dataset and export the raw / seed-mean / best
# tables as CSV files plus one Excel workbook.
data_names = eval_data_names

# Make sure the output directory exists before anything is written.
os.makedirs(os.path.dirname(results_excel_path) or ".", exist_ok=True)

# Each list starts with an empty frame that fixes the leading column order;
# the per-dataset frames are appended and concatenated once after the loop.
raw_frames = [pd.DataFrame(columns=cols_raw)]
mean_frames = [pd.DataFrame(columns=cols_seeds_mean)]
agg_frames = [pd.DataFrame(columns=cols_results)]

# The context manager saves and closes the workbook even if a step fails.
with pd.ExcelWriter(results_excel_path) as writer:
    for data_name in data_names:
        print(f"Extracting logs for {data_name} ...")
        df, df_seeds_mean, df_val_f1_max = aggregate_results(data_name, writer)
        raw_frames.append(df)
        mean_frames.append(df_seeds_mean)
        agg_frames.append(df_val_f1_max)

    df_all_raw_results = pd.concat(raw_frames, ignore_index=True)
    df_all_mean_results = pd.concat(mean_frames, ignore_index=True)
    df_all_agg_results = pd.concat(agg_frames, ignore_index=True)

    df_all_raw_results.to_csv(f"results/raw_{results_name}.csv", index=False)
    df_all_mean_results.to_csv(f"results/mean_{results_name}.csv", index=False)
    df_all_agg_results = df_all_agg_results.sort_values(by=['data_name', 'test_f1_macro']).reset_index(drop=True)
    df_all_agg_results.to_csv(f"results/agg_{results_name}.csv", index=False)
    df_all_agg_results.to_excel(writer, sheet_name="best_of_all")

    # Summary table: one row per variant, one column per dataset, each cell
    # formatted as "<test F1 macro> (<std>)".
    variants = df_all_agg_results.variant.unique()
    df_table = pd.DataFrame(columns=data_names)
    for variant in variants:
        for data_name in data_names:
            condition = (df_all_agg_results["variant"] == variant) & (df_all_agg_results["data_name"] == data_name)
            best_rows = df_all_agg_results[condition]
            if len(best_rows) > 0:
                # Values are pulled out first: reusing the outer quote
                # character inside an f-string needs Python 3.12 or newer.
                f1_mean = best_rows["test_f1_macro"].values[0]
                f1_std = best_rows["test_f1_macro_std"].values[0]
                df_table.loc[variant, data_name] = f"{f1_mean} ({f1_std})"
    df_table = df_table.reindex(check_variants)

    df_table.to_excel(writer, sheet_name="Table")

In [None]:
# Display the summary table (rows: variants in check_variants order, columns: datasets).
df_table

In [None]:
# Restrict the "best per variant" table to a fixed list of variants, re-sort
# it and overwrite the aggregated CSV with the reduced table.
df_all_agg_results = (
    df_all_agg_results
    .loc[
        df_all_agg_results["variant"].isin([
            'augmentation_abusive_lexicon',
            'augmentation_external_data_preprocessing',
            'baseline',
            'fl',
            'sampling_modifiedRS_oversampling',
            'sampling_modifiedRS_undersampling',
            'th',
        ])
    ]
    .sort_values(by=['data_name', 'test_f1_macro'])
    .reset_index(drop=True)
)
df_all_agg_results.to_csv(f"results/agg_{results_name}.csv", index=False)