In [16]:
import os
import ast
import pandas as pd

# 'gao-2018-fhc', 
bin_data_names = ['bretschneider-th-main', 'twitter-hate-speech-tsa', 'bretschneider-th-school', 'gibert-2018-shs', 'us-election-2020', 'cmsb-tsd', 'waseem-and-hovy-2016']
multi_class_data_names = ['founta-2018-thas', 'ami', 'davidson-thon']
data_names = bin_data_names + multi_class_data_names

data_cols = ["data_name", "num_classes", "data_type", "size", "rho", "distribution"]
variant_cols = ['variant', 'sampling_modifiedRS_rho', 'sampling_weightedRS_percentage', 'augmentation_rho', 'wce_alpha', 'fl_gamma']
metric_suffices = ['f1_macro', 'f1_per_label_0', 'f1_per_label_1', 'f1_per_label_2', 'f1_per_label_3', 'f1_per_label_4', 'accuracy', 'precision_macro', 'precision_weighted', 'recall_macro', 'recall_weighted', 'auprc']
metrics_cols = [f'train_{suffix}' for suffix in metric_suffices] + [f'val_{suffix}' for suffix in metric_suffices] + [f'test_{suffix}' for suffix in metric_suffices]
cols_raw = data_cols + variant_cols + ['pl_seed'] + metrics_cols + ['mlflow_run_id']
                
cols_seeds_mean = data_cols + variant_cols + ['val_f1_macro', 'test_f1_macro', 'test_f1_macro_std'] + [f'test_{suffix}' for suffix in metric_suffices[1:]]

results_excel_path = "results_all.xlsx"
cols_results = data_cols + variant_cols + ['test_f1_macro', 'test_f1_macro_std']

In [9]:
data2distribution = {}
data2size = {}
data2rho = {}

def read_from_csv(data_name, header=0, names=None):
    if "tsv" in data_name:
        data = pd.read_csv(data_name,
                            sep='\t',
                            encoding = "utf-8",
                            engine = "python",
                            header = header,
                            names = names)
    elif "csv" in data_name:
        data = pd.read_csv(data_name,
                        encoding = "utf-8",
                        engine = "python",
                        header = header,
                        names = names)
    else:
        raise NotImplementedError("Given data file type is not supported yet.")
    return data
def get_data_info_by_data_name(data_name, label_col, category2label, train_filename, val_filename=None, test_filename=None):
    data = read_from_csv(f"./data/{data_name}/{train_filename}")
    label_counts = data[label_col].value_counts()
    if val_filename:
        val_data = read_from_csv(f"./data/{data_name}/{val_filename}")
        label_counts += val_data[label_col].value_counts()
    if test_filename:
        test_data = read_from_csv(f"./data/{data_name}/{test_filename}")
        label_counts += test_data[label_col].value_counts()
    data2size[data_name] = sum(label_counts.values)
    data2distribution[data_name] = {f"{label}_{cat}": round(label_counts[label]/data2size[data_name], 3) for cat, label in category2label.items()}
    data2rho[data_name] = round(max(label_counts.values) / min(label_counts.values), 2)

get_data_info_by_data_name("ami", "label_multi", {"discredit": 0, "stereotype": 1, "dominance": 2, "sexual harassment": 3, "derailing": 4}, "train_clean.csv", test_filename="test_clean.csv")
get_data_info_by_data_name("bretschneider-th-main", "label", {"neutral": 0, "harassment": 1}, "main_data_clean.csv")
get_data_info_by_data_name("bretschneider-th-school", "label", {"neutral": 0, "harassment": 1}, "school_data_clean.csv")
get_data_info_by_data_name("cmsb-tsd", "label", {"non-sexist": 0, "sexist": 1}, "data_clean.csv")
get_data_info_by_data_name("davidson-thon", "label_multi", {"hate speech": 0, "offensive language": 1, "neither": 2}, "data_clean.csv")
get_data_info_by_data_name("founta-2018-thas", "label_multi", {"normal": 0, "spam": 1, "abusive": 2, "hateful": 3}, "data_clean.csv")
get_data_info_by_data_name("gao-2018-fhc", "label", {"non-hateful": 0, "hateful": 1}, "data_clean.csv")
get_data_info_by_data_name("gibert-2018-shs", "label", {"no-hate": 0, "hate": 1}, "data_clean.csv")
get_data_info_by_data_name("twitter-hate-speech-tsa", "label", {"non-hate": 0, "hate": 1}, "data_clean.csv")
get_data_info_by_data_name("us-election-2020", "label", {"non-HOF": 0, "HOF": 1}, "train_clean.csv", test_filename="test_clean.csv")
get_data_info_by_data_name("waseem-and-hovy-2016", "label", {"neither": 0, "racism/sexism": 1}, "data_clean.csv")
data2distribution, data2size, data2rho

({'ami': {'0_discredit': 0.514,
   '1_stereotype': 0.142,
   '2_dominance': 0.121,
   '3_sexual harassment': 0.176,
   '4_derailing': 0.046},
  'bretschneider-th-main': {'0_neutral': 0.936, '1_harassment': 0.064},
  'bretschneider-th-school': {'0_neutral': 0.92, '1_harassment': 0.08},
  'cmsb-tsd': {'0_non-sexist': 0.867, '1_sexist': 0.133},
  'davidson-thon': {'0_hate speech': 0.058,
   '1_offensive language': 0.774,
   '2_neither': 0.168},
  'founta-2018-thas': {'0_normal': 0.717,
   '1_spam': 0.163,
   '2_abusive': 0.085,
   '3_hateful': 0.035},
  'gao-2018-fhc': {'0_non-hateful': 0.715, '1_hateful': 0.285},
  'gibert-2018-shs': {'0_no-hate': 0.888, '1_hate': 0.112},
  'twitter-hate-speech-tsa': {'0_non-hate': 0.93, '1_hate': 0.07},
  'us-election-2020': {'0_non-HOF': 0.883, '1_HOF': 0.117},
  'waseem-and-hovy-2016': {'0_neither': 0.735, '1_racism/sexism': 0.265}},
 {'ami': 2245,
  'bretschneider-th-main': 2898,
  'bretschneider-th-school': 1396,
  'cmsb-tsd': 13631,
  'davidson-tho

In [10]:
def get_value_by_run_id(data_dir, run_id, key):
    value = None
    for root, _, files in os.walk(data_dir + run_id):  
        if key in files:
            with open(f"{root}/{key}", "r") as f:
                value = f.readlines()[-1]
                if "train" in key or "val" in key or "test" in key:
                    value = value.split()[1]
                if value == "sampling_modifiedRS": # Forgot to specify in the variant value
                    value += "_oversampling"
                if value == "sampling_weightedRS":
                    value += "_combi"
                if "tensor" in value:
                    value = "-"
                try:
                    value = ast.literal_eval(value)
                    if isinstance(value, list):
                        value = [round(v, 2) for v in value]
                        value = tuple(value)
                except (ValueError, SyntaxError):
                    pass
                break
    if value is None:
        value = "-"
        if "train" in key or "val" in key or "test" in key:
            value = 0
    return value

def get_log_by_data_name(data_name):
    data_runs_dir = f'./logs/{data_name}/'
    run_ids = os.listdir(data_runs_dir)
    rows = []
    for run_id in run_ids:
        run_result = {col_name: get_value_by_run_id(data_runs_dir, run_id, col_name) for col_name in cols_raw[:-1]}
        run_result['mlflow_run_id'] = run_id
        rows.append(run_result)
    df = pd.DataFrame(columns=cols_raw, data=rows)
    #  - add wfl(alpha=1.0) => fl
    df.loc[(df["variant"] == "wfl") & (df["wce_alpha"] == "-"),  "variant"] = "fl" 
    return df

def aggregate_results(data_name, writer):
    # Get raw mlflow logs:
    df = get_log_by_data_name(data_name)
    df.to_excel(writer, sheet_name=f"{data_name}_raw")
    assert len(df) % 3 == 0
    # Aggregate results from 3 seeds
    agg_target = {metric: 'mean' for metric in metrics_cols}
    agg_target.update({col: 'first' for col in data_cols})
    df['test_f1_macro_std'] = df.loc[:, 'test_f1_macro']
    agg_target.update({'test_f1_macro_std': 'std'})
    df_seeds_mean = df.groupby(variant_cols, as_index=False).agg(agg_target)
    #  - change metric to be 00.00
    for metric in metrics_cols + ['test_f1_macro_std']:
        df_seeds_mean[metric] = df_seeds_mean[metric].apply(lambda x: ast.literal_eval("{0:.2f}".format(x*100)))
    
    #  - re-order the columns
    df_seeds_mean.loc[:, "size"] = [data2size[data_name]] * len(df_seeds_mean)
    df_seeds_mean.loc[:, "distribution"] = [data2distribution[data_name]] * len(df_seeds_mean)
    df_seeds_mean.loc[:, "rho"] = [data2rho[data_name]] * len(df_seeds_mean)
    
    df_seeds_mean = df_seeds_mean[cols_seeds_mean]
    df_seeds_mean.to_excel(writer, sheet_name=f"{data_name}_seeds-mean")
    df_val_f1_max = df_seeds_mean.loc[df_seeds_mean.groupby(["variant"])['val_f1_macro'].idxmax()].reset_index(drop=True)
    return df, df_seeds_mean, df_val_f1_max

In [15]:
writer = pd.ExcelWriter(results_excel_path)
df_all_raw_results = pd.DataFrame(columns=cols_raw)
df_all_mean_results = pd.DataFrame(columns=cols_seeds_mean)
df_all_agg_results = pd.DataFrame(columns=cols_results)
for data_name in data_names:
    print(f"Extracting logs for {data_name} ...")
    df, df_seeds_mean, df_val_f1_max = aggregate_results(data_name, writer)
    df_all_raw_results = pd.concat([df_all_raw_results, df], ignore_index=True)
    df_all_mean_results = pd.concat([df_all_mean_results, df_seeds_mean], ignore_index=True)
    df_all_agg_results = pd.concat([df_all_agg_results, df_val_f1_max], ignore_index=True)
df_all_agg_results = df_all_agg_results.sort_values(by=['data_name', 'test_f1_macro']).reset_index(drop=True)
df_all_agg_results.to_excel(writer, sheet_name=f"best_of_all")

variants = df_all_agg_results.variant.unique()
df_table = pd.DataFrame(columns=data_names)
for variant in variants:
    for data_name in data_names:
        condition = (df_all_agg_results["variant"] == variant) & (df_all_agg_results["data_name"] == data_name)
        if len(df_all_agg_results[condition]) > 0:
            df_table.loc[variant, data_name] = df_all_agg_results[condition]["test_f1_macro"].values[0]
df_table.to_excel(writer, sheet_name=f"Table")
writer.close()

Extracting logs for bretschneider-th-main ...


  df_all_raw_results = pd.concat([df_all_raw_results, df], ignore_index=True)
  df_all_mean_results = pd.concat([df_all_mean_results, df_seeds_mean], ignore_index=True)
  df_all_agg_results = pd.concat([df_all_agg_results, df_val_f1_max], ignore_index=True)


Extracting logs for bretschneider-th-school ...




Extracting logs for cmsb-tsd ...
Extracting logs for gao-2018-fhc ...
Extracting logs for gibert-2018-shs ...
Extracting logs for twitter-hate-speech-tsa ...




Extracting logs for us-election-2020 ...
Extracting logs for waseem-and-hovy-2016 ...
Extracting logs for ami ...
Extracting logs for davidson-thon ...
Extracting logs for founta-2018-thas ...


In [12]:
df_all_agg_results[["data_name", "variant", "test_f1_macro"]]

Unnamed: 0,data_name,variant,test_f1_macro
0,ami,baseline,54.65
1,bretschneider-th-main,baseline,68.89
2,bretschneider-th-main,fl,69.11
3,bretschneider-th-main,th,70.60
4,bretschneider-th-main,sampling_weightedRS_oversampling,70.68
...,...,...,...
58,waseem-and-hovy-2016,baseline,86.72
59,waseem-and-hovy-2016,th,86.82
60,waseem-and-hovy-2016,wfl,86.82
61,waseem-and-hovy-2016,sampling_modifiedRS_oversampling,87.16


In [13]:
df_table

Unnamed: 0,bretschneider-th-main,bretschneider-th-school,cmsb-tsd,gao-2018-fhc,gibert-2018-shs,twitter-hate-speech-tsa,us-election-2020,waseem-and-hovy-2016,ami,davidson-thon,founta-2018-thas
baseline,68.89,69.82,84.36,70.06,76.89,87.21,75.62,86.72,54.65,74.7,62.7
fl,69.11,71.08,84.72,,77.45,,74.44,86.62,,74.74,63.55
th,70.6,74.11,84.91,,,,77.06,86.82,,74.33,61.9
sampling_weightedRS_oversampling,70.68,73.95,,,,,76.51,87.46,,74.77,
sampling_weightedRS_combi,71.46,74.65,,,,,73.39,86.35,,74.66,62.46
wce,71.75,74.87,,,,,77.02,86.62,,75.48,64.33
wfl,71.83,73.02,84.84,,76.39,,74.73,86.82,,75.52,64.22
sampling_modifiedRS_oversampling,73.72,76.23,84.83,,77.25,,73.49,87.16,,75.48,63.98
sampling_modifiedRS_undersampling,,,,,,,,,,76.57,64.38
augmentation_wordnet,,,,,,,69.49,,,,


In [5]:
# mlrun_ids = []

# with open("../outputs/bretschneider-th-school_sampling_weightedRS_seed0_output.txt", "r") as f:
#     log_contents = f.readlines()
#     mlrun_ids += [log.split("/")[-1][:-2] for log in log_contents if "MLflow Saved Child Search" in log]

# print(mlrun_ids)

In [20]:
import os
bin_data_names = ['bretschneider-th-main', 'twitter-hate-speech-tsa', 'bretschneider-th-school', 'gibert-2018-shs', 'us-election-2020', 'cmsb-tsd', 'waseem-and-hovy-2016']
multi_class_data_names = ['founta-2018-thas', 'ami', 'davidson-thon']

for data_name in data_names:
    data_runs_dir = f'./logs/{data_name}/'
    run_ids = os.listdir(data_runs_dir)
    logs = dict()
    for run_id in run_ids:
        ckpt_dir = f"{data_runs_dir}{run_id}/artifacts/model_checkpoints/"
        try:
            ckpt = os.listdir(ckpt_dir)[0]
            if "wfl" in ckpt:
                logs[run_id] = ckpt
                print(f"{run_id}\t{ckpt}")
        except NotADirectoryError:
            pass
len(logs)

53ef7d25b587488694e5515b97c394bb	bretschneider-th-main-wfl-Trial_4-wce_alpha=0.9-fl_gamma=2.0-seed21-epoch=08-val_f1_macro=0.79.ckpt
b9a97dc883fa40138e793c6fb4f40652	bretschneider-th-main-wfl-Trial_0-wce_alpha=0.936-fl_gamma=0.2-seed21-epoch=03-val_f1_macro=0.79.ckpt
7b65afb9523f4e5d94c1b07704a2e196	bretschneider-th-main-wfl-Trial_31-wce_alpha=0.99-fl_gamma=5.0-seed21-epoch=03-val_f1_macro=0.68.ckpt
60e8481df0384370bd80f1b75be357a1	bretschneider-th-main-wfl-Trial_23-wce_alpha=0.1-fl_gamma=2.0-seed42-epoch=00-val_f1_macro=0.48.ckpt
c4d6c282478e4cb496cc63e22e77dae9	bretschneider-th-main-wfl-Trial_5-wce_alpha=0.75-fl_gamma=5.0-seed0-epoch=02-val_f1_macro=0.80.ckpt
230f4a654e9349a1bbf9a9269babdbca	bretschneider-th-main-wfl-Trial_45-wce_alpha=0.99-fl_gamma=0.5-seed21-epoch=03-val_f1_macro=0.72.ckpt
6924f5f531c141abbc1bd10652b96c7e	bretschneider-th-main-wfl-Trial_12-wce_alpha=0.99-fl_gamma=0.5-seed42-epoch=03-val_f1_macro=0.78.ckpt
d7da2578c2a24c1da120b04415a05b65	bretschneider-th-main-wfl-T

90