# SameSideStanceClassification Shared Task Ground-Truth Evaluation

- [eval script source](https://git.webis.de/code-research/arguana/same-side-classification/-/blob/master/evaluation/evaluation.py)


In [None]:
%%bash

# Remove the artefacts of earlier evaluation runs (predictions and logs).
# -f keeps the cell from failing when a directory does not exist yet,
# e.g. on a fresh checkout or when the cell is executed twice.
rm -rf output_emnlp21/
rm -rf output_emnlp21_logs/

In [2]:
# Shared default parameters.
# Note: the loops in later cells rebind these names per run while unpacking `data`.
num_epoch = 3  # int(run_name.rsplit("_", 1)[-1])
acc_steps = 64
batch_size = 8

# Naming template of the run directories that the later cells parse:
# run_name = f"{model_name.replace('/', '-')}-{data_name}_{seq_len}_{batch_size}-acc{acc_steps}_{num_epoch}"

In [32]:
import json
import os
from pathlib import Path

# Directory that contains one sub-directory per run.
fn_base = Path("output")


def _parse_run_name(run_name):
    """Split a run directory name into its components.

    Expected layout:
    ``{model}-{data}_{seq_len}_{batch_size}-acc{acc_steps}_{num_epoch}``

    The name is split from the right, so underscores inside the model part
    do not shift the numeric fields.

    Returns:
        tuple: (model_name, data_name, seq_len, batch_size, acc_steps, num_epoch)

    Raises:
        ValueError: if the name does not follow the layout above.
    """
    prefix, seq_len, batch_and_acc, num_epoch = run_name.rsplit("_", 3)
    model_name, data_name = prefix.rsplit("-", 1)
    batch_size, acc_steps = batch_and_acc.split("-acc")
    return model_name, data_name, int(seq_len), int(batch_size), int(acc_steps), int(num_epoch)


# One tuple per usable run:
# (run_name, model_name, data_name, seq_len, batch_size, acc_steps, num_epoch)
data = list()
for fn in sorted(fn_base.iterdir()):
    run_name = fn.name

    # Skip entries that have no evaluation result file.
    sfn = fn / "eval_results_same-b.json"
    if not sfn.exists():
        continue

    try:
        run_fields = _parse_run_name(run_name)
    except ValueError:
        # Report and skip instead of aborting the whole scan.
        print(f"### Unparsable run name! Run: {run_name} ###")
        continue

    data_name = run_fields[1]
    if data_name not in ("within", "cross"):
        print(f"### Unknown dataset! Run: {run_name}, supposed data: {data_name} ###")
        continue

    data.append((run_name, *run_fields))

In [4]:
# Width of every column = longest string representation of that field.
# Built column-wise via zip(), so an empty `data` list yields an empty
# width list (and an empty table) instead of an IndexError on data[0].
pad_length = [max(len(str(value)) for value in column) for column in zip(*data)]

for row in data:
    # row[0] (run_name) is intentionally not printed
    print(
        # model_name, data_name
        f"{row[1]:<{pad_length[1]}}  {row[2]:<{pad_length[2]}}  "
        # seq_len, batch_size
        f"{row[3]:>{pad_length[3]}}  {row[4]:>{pad_length[4]}}  "
        # acc_steps, num_epoch
        f"{row[5]:>{pad_length[5]}}  {row[6]:>{pad_length[6]}}"
    )

albert-base-v1                               cross   256  16  64   3
albert-base-v1                               within  256  16  64   3
albert-base-v2                               cross   128  32  64   3
albert-base-v2                               cross   256  12  64   3
albert-base-v2                               cross   256  16  64  10
albert-base-v2                               cross   512   8  64   3
albert-base-v2                               within  128  32  64   3
albert-base-v2                               within  256  16  64  10
albert-base-v2                               within  256  16  64   3
albert-base-v2                               within  512   8  64   3
albert-large-v2                              within  256   8  64   3
bert-base-cased                              cross   256  16  64   3
bert-base-cased                              cross   512   8  64   3
bert-base-cased                              within  256  16  64   3
bert-base-cased                   

In [5]:
# Value passed as CUDA_VISIBLE_DEVICES to the trainer.py call in the next cell.
cuda_devs = "1"

In [None]:
# For every collected run: call trainer.py with --do_test --do_pred on the
# ground-truth data of the run's task, loading the model from ./output/{run_name}
# and writing to ./output_emnlp21/{run_name}.
# Only run_name, data_name, seq_len and batch_size are used by the command.
for run_name, model_name, data_name, seq_len, batch_size, acc_steps, num_epoch in data:
    load_name = f"./output/{run_name}"
    log_dir = f"./output_emnlp21_logs/{run_name}"

    # create the per-run log folder (no error if it already exists)
    ! mkdir -p {log_dir}

    # The leading VAR=value pairs are environment variables for this one command.
    # stdout and stderr stay visible in the notebook and are also appended
    # (tee -a) to stdout.log / stderr.log in log_dir.
    # NOTE(review): `>(...)` is process substitution, so the shell used by `!`
    # must support it (e.g. bash) - confirm on the target machine.
    ! \
        PYTHONASYNCIODEBUG=1 \
        HF_MLFLOW_LOG_ARTIFACTS=TRUE \
        MLFLOW_EXPERIMENT_NAME=same-stance-test \
        CUDA_VISIBLE_DEVICES={cuda_devs} \
        python trainer.py \
        --do_test --do_pred \
        --model_name_or_path {load_name} \
        --task_name same-b \
        --data_dir ./data/argmining/ground_truth/{data_name} \
        --output_dir ./output_emnlp21/{run_name} \
        --run_name {run_name} \
        --max_seq_length {seq_len} \
        --per_device_eval_batch_size {batch_size} \
        --overwrite_output_dir \
        --overwrite_cache \
        --logging_steps 100 \
        > >(tee -a {log_dir}/stdout.log) \
        2> >(tee -a {log_dir}/stderr.log >&2)

## Compute metrics and rank

In [None]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#from tqdm import tqdm_notebook as tqdm
from tqdm.notebook import tqdm

# Register the pandas `progress_apply` helpers on the tqdm class itself.
# Calling it on the class avoids creating (and rendering) an empty progress bar.
tqdm.pandas()

Load gold labels

In [9]:
# Pickle file with the gold labels of both tasks.
# NOTE: pickle can execute arbitrary code on load - only use a trusted file.
fn_ground_truth_p = "data/ground_truth.p"

# The file holds two consecutively pickled objects; the read order matters:
# the first load yields the "within" data, the second the "cross" data.
with open(fn_ground_truth_p, mode="rb") as gold_file:
    within_test_df, cross_test_df = pickle.load(gold_file), pickle.load(gold_file)

Load predictions and compute metrics

In [35]:
# run_name -> {"precision", "recall", "f1", "accuracy"}
metrics = {}

for run_name, model_name, data_name, seq_len, batch_size, acc_steps, num_epoch in data:
    # Tab-separated prediction file: one header line, then "<id>\t<label>" rows.
    pred_path = Path("./output_emnlp21") / run_name / "pred_results_same-b.txt"
    records = []
    with pred_path.open("r") as pred_file:
        next(pred_file, None)  # skip the header line
        for line in pred_file:
            id_, label = line.rstrip().split("\t")
            records.append((int(id_), int(label)))
    preds = pd.DataFrame.from_records(records, columns=["id", "label"], index="id")

    # Gold labels of the task this run belongs to.
    if data_name == "within":
        gold = within_test_df
    else:
        gold = cross_test_df

    # NOTE(review): both arrays are compared position by position; this assumes the
    # prediction rows are in the same order as the gold rows - confirm.
    y_true = gold["is_same_side"].astype("bool").to_numpy()
    y_pred = preds["label"].astype("bool").to_numpy()

    scores = {
        name: score_fn(y_true, y_pred, average="binary")
        for name, score_fn in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1", f1_score),
        )
    }
    scores["accuracy"] = accuracy_score(y_true, y_pred)
    metrics[run_name] = scores

Rank and print result tables

In [45]:
# Width of every column = longest string representation of that field.
# Built column-wise via zip(), so an empty `data` list yields an empty
# width list instead of an IndexError on data[0]; in that case only the
# task headers are printed.
pad_length = [max(len(str(value)) for value in column) for column in zip(*data)]

for task in ("within", "cross"):
    # runs of the current task, in the order of `data`
    subdata = [x for x in data if x[2] == task]

    print()
    print("-" * 30)
    print(f"Task: {task.upper()}")
    print("-" * 30)
    for run_name, model_name, data_name, seq_len, batch_size, acc_steps, num_epoch in subdata:
        run_metric = metrics[run_name]

        # columns: model, seq_len | precision, recall, f1, accuracy (in percent)
        print(
            f"{model_name:<{pad_length[1]}}"
            f"  {seq_len:>{pad_length[3]}}"
            f"  |  {run_metric['precision'] * 100:>6.02f}%"
            f" {run_metric['recall'] * 100:>6.02f}%"
            f" {run_metric['f1'] * 100:>6.02f}%"
            f" {run_metric['accuracy'] * 100:>6.02f}%"
        )


------------------------------
Task: WITHIN
------------------------------
albert-base-v1                               256  |   75.00%  54.76%  63.30%  68.25%
albert-base-v2                               128  |   62.96%  26.98%  37.78%  55.56%
albert-base-v2                               256  |   70.31%  71.43%  70.87%  70.63%
albert-base-v2                               256  |   72.84%  46.83%  57.00%  64.68%
albert-base-v2                               512  |   77.27%  67.46%  72.03%  73.81%
albert-large-v2                              256  |   50.00% 100.00%  66.67%  50.00%
bert-base-cased                              256  |   78.43%  63.49%  70.18%  73.02%
bert-base-cased                              512  |   79.76%  53.17%  63.81%  69.84%
bert-base-uncased                            128  |   66.67%   9.52%  16.67%  52.38%
bert-base-uncased                            256  |   74.11%  65.87%  69.75%  71.43%
bert-base-uncased                            256  |   70.71%  55.56%  62.2

In [53]:
# Width of every column = longest string representation of that field.
# Built column-wise via zip(), so an empty `data` list yields an empty
# width list instead of an IndexError on data[0].
pad_length = [max(len(str(value)) for value in column) for column in zip(*data)]

# Sequence lengths that actually occur in the collected runs (ascending).
# Taken from the data instead of a fixed (128, 256, 512) list so that runs
# with any other length are not silently left out of the ranking.
seq_lengths = sorted({x[3] for x in data})

for task in ("within", "cross"):
    task_runs = [x for x in data if x[2] == task]

    print()
    print("=" * 79)
    print(f"Task: {task.upper()}")
    print("=" * 79)

    for seq_len in seq_lengths:
        print()
        print(f"Sequence length: {seq_len}")
        print("-" * 79)

        # best F1 first; runs with equal F1 keep their order from `data`
        ranked_runs = sorted(
            (x for x in task_runs if x[3] == seq_len),
            key=lambda x: metrics[x[0]]["f1"],
            reverse=True,
        )

        for run in ranked_runs:
            run_name, model_name = run[0], run[1]
            run_metric = metrics[run_name]

            # columns: model | precision, recall, f1, accuracy (in percent)
            print(
                f"{model_name:<{pad_length[1]}}"
                f"  |  {run_metric['precision'] * 100:>6.02f}%"
                f" {run_metric['recall'] * 100:>6.02f}%"
                f" {run_metric['f1'] * 100:>6.02f}%"
                f" {run_metric['accuracy'] * 100:>6.02f}%"
            )


Task: WITHIN

Sequence length: 128
-------------------------------------------------------------------------------
albert-base-v2                               |   62.96%  26.98%  37.78%  55.56%
bert-base-uncased                            |   66.67%   9.52%  16.67%  52.38%

Sequence length: 256
-------------------------------------------------------------------------------
albert-base-v2                               |   70.31%  71.43%  70.87%  70.63%
bert-base-cased                              |   78.43%  63.49%  70.18%  73.02%
bert-base-uncased                            |   74.11%  65.87%  69.75%  71.43%
albert-large-v2                              |   50.00% 100.00%  66.67%  50.00%
albert-base-v1                               |   75.00%  54.76%  63.30%  68.25%
bert-base-uncased                            |   70.71%  55.56%  62.22%  66.27%
albert-base-v2                               |   72.84%  46.83%  57.00%  64.68%
squeezebert-squeezebert-uncased              |   75.51%  29.37

---

### Verification of test labels with submitted predictions

- [leaderboard](https://webis.de/events/sameside-19/)

In [None]:
%%bash

# -p: do not fail when temp/ already exists (cell executed again).
mkdir -p temp
cd temp

# Fetch the submitted prediction files (URLs are pinned to a fixed commit).
# A file that is already present is kept, so re-running the cell does not
# leave "<name>.csv.1" duplicates behind.
[ -f new_within_results.csv ] || wget https://raw.githubusercontent.com/Querela/argmining19-same-side-classification/5e650104d86c347d6aceab38d728c805a7eb5f9c/data/within_traindev_proepi512_BCE_0.1/new_within_results.csv
[ -f cross_results.csv ] || wget https://raw.githubusercontent.com/Querela/argmining19-same-side-classification/5e650104d86c347d6aceab38d728c805a7eb5f9c/data/cross_traindev_proepi512_BCE/cross_results.csv

In [23]:
import pickle

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Submitted predictions (CSV files in temp/), indexed by "id".
within_pred_df = pd.read_csv("temp/new_within_results.csv", index_col='id')
cross_pred_df = pd.read_csv("temp/cross_results.csv", index_col='id')

# Gold labels: two consecutively pickled objects, "within" first, then "cross".
# NOTE: pickle can execute arbitrary code on load - only use a trusted file.
with open("data/ground_truth.p", "rb") as fp:
    within_test_df = pickle.load(fp)
    cross_test_df = pickle.load(fp)


# Pair every prediction with its gold label via the shared "id" key.
# An inner join keeps only ids that are present on both sides.
within_merged_df = within_pred_df.merge(within_test_df, how="inner", left_on="id", right_on="id")
cross_merged_df = cross_pred_df.merge(cross_test_df, how="inner", left_on="id", right_on="id")

for task, merged_df in (("within", within_merged_df), ("cross", cross_merged_df)):
    results_df = merged_df[["label", "is_same_side"]]

    labels_truth = results_df["is_same_side"].astype("bool").to_numpy()
    label_preds = results_df["label"].astype("bool").to_numpy()

    # Deliberately not named `metrics`: that notebook-level name holds the
    # per-run results used by the ranking cells and must not be overwritten.
    task_metrics = {
        "precision": precision_score(labels_truth, label_preds, average="binary"),
        "recall": recall_score(labels_truth, label_preds, average="binary"),
        "f1": f1_score(labels_truth, label_preds, average="binary"),
        "accuracy": accuracy_score(labels_truth, label_preds),
    }

    print(f"### {task.upper()} ###")
    for k, v in task_metrics.items():
        print(f"  {k:<10} {v*100:05.02f}")

#test_df.info()
#within_test_df.info()
#merged_df.info()

### WITHIN ###
  precision  79.31
  recall     73.02
  f1         76.03
  accuracy   76.98
### CROSS ###
  precision  72.32
  recall     72.29
  f1         72.30
  accuracy   72.31
