## Let's do the normal evaluation

In [30]:
# Run track_1_kp_matching.py on the unmodified prediction file.
# Positional arguments: gold data directory, prediction JSON file, subset name.
! python3 /workspace/project_git/keypoint-analysis-sharedtask/src-py/track_1_kp_matching.py /workspace/project_git/keypoint-analysis-sharedtask/data /workspace/ceph_data/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/siamese-data/preds/bert-base-uncased-kp_as_anchor-2021-05-24_15-58-35-our_valid-preds.json our_valid

mAP strict= 0.844355603203168 ; mAP relaxed = 0.947050810808719


---

## Let's trick the metric now
 - Load the evaluation code
 - Change the code so that only the top 25% of our predictions (ranked by score) are used for each topic-stance pair
 - Store the IDs of the arguments that remain in that top 25%
 - Delete all the other arguments from our prediction file
 - Save the new prediction file
 - Do the normal evaluation again

In [1]:
import sys
import pandas as pd
from sklearn.metrics import precision_recall_curve, average_precision_score
from matplotlib import pyplot
import numpy as np
import os
import json

In [4]:
def calc_mean_average_precision(df, label_column):
    """Average the per-group average precision over all (topic, stance) groups.

    Args:
        df: DataFrame with at least the columns "topic" and "stance", plus
            whatever columns `get_ap` reads.
        label_column: name of the column holding the gold labels to score against.

    Returns:
        The mean (via `np.mean`) of the values `get_ap` returns for each group.
    """
    per_group_ap = []
    for _, topic_stance_df in df.groupby(["topic", "stance"]):
        per_group_ap.append(get_ap(topic_stance_df, label_column))
    return np.mean(per_group_ap)

def evaluate_predictions(merged_df):
    """Print the strict and the relaxed mean average precision of `merged_df`.

    The strict score is computed first and the relaxed score second, each by
    calling `calc_mean_average_precision` on the corresponding label column.

    Args:
        merged_df: DataFrame containing the columns "label_strict" and
            "label_relaxed" in addition to what
            `calc_mean_average_precision` needs.
    """
    mAP_strict, mAP_relaxed = (
        calc_mean_average_precision(merged_df, label_column)
        for label_column in ("label_strict", "label_relaxed")
    )
    print(f"mAP strict= {mAP_strict} ; mAP relaxed = {mAP_relaxed}")

def load_kpm_data(gold_data_dir, subset):
    """Read the three gold CSV files that belong to one data subset.

    The files are expected at `<gold_data_dir>/arguments_<subset>.csv`,
    `<gold_data_dir>/key_points_<subset>.csv` and
    `<gold_data_dir>/labels_<subset>.csv`.

    Args:
        gold_data_dir: directory that contains the CSV files.
        subset: subset name used as the file-name suffix.

    Returns:
        A tuple `(arguments_df, key_points_df, labels_df)` of DataFrames, read
        in that order.
    """
    def read_subset_csv(prefix):
        # All three files share the "<prefix>_<subset>.csv" naming scheme.
        return pd.read_csv(os.path.join(gold_data_dir, f"{prefix}_{subset}.csv"))

    return (
        read_subset_csv("arguments"),
        read_subset_csv("key_points"),
        read_subset_csv("labels"),
    )


def get_predictions(predictions_file, labels_df, arg_df):
    """Join the best predicted key point of every argument with the gold labels.

    Args:
        predictions_file: path handed to `load_predictions`.
        labels_df: gold labels with the columns "arg_id", "key_point_id" and "label".
        arg_df: arguments with at least the columns "arg_id", "topic" and "stance".

    Returns:
        One row per argument (left-joined with the gold labels) with two extra
        columns: "label_strict", where unlabeled pairs count as 0, and
        "label_relaxed", where unlabeled pairs count as 1.
    """
    argument_info = arg_df[["arg_id", "topic", "stance"]]
    best_matches = load_predictions(predictions_file)

    # Left join: every arg_id keeps a row even if the file has no prediction for it.
    predictions_df = argument_info.merge(best_matches, how="left", on="arg_id")

    # Arguments without a matching key point get a placeholder id and a score of 0.
    predictions_df = predictions_df.assign(
        key_point_id=predictions_df["key_point_id"].fillna("dummy_id"),
        score=predictions_df["score"].fillna(0),
    )

    # Attach the gold label of each (argument, predicted key point) pair.
    merged_df = predictions_df.merge(labels_df, how="left", on=["arg_id", "key_point_id"])

    # A placeholder match is always counted as wrong.
    is_placeholder = merged_df["key_point_id"] == "dummy_id"
    merged_df.loc[is_placeholder, "label"] = 0

    gold_label = merged_df["label"]
    merged_df["label_strict"] = gold_label.fillna(0)
    merged_df["label_relaxed"] = gold_label.fillna(1)
    return merged_df


def load_predictions(predictions_dir):
    """Choose the best key point for each argument and return the matches.

    The file is expected to hold a JSON object that maps every argument id to
    an object of `{key_point_id: score}` candidates. For each argument the
    candidate with the highest score is kept.

    Arguments whose candidate object is empty are skipped instead of raising
    `ValueError` from `max()` on an empty sequence.

    Args:
        predictions_dir: path of the prediction JSON file (despite the name,
            this is a file path, not a directory).

    Returns:
        DataFrame with the columns "arg_id", "key_point_id" and "score", one
        row per argument that has at least one candidate.
    """
    arg_ids = []
    key_point_ids = []
    scores = []

    with open(predictions_dir, "r") as f_in:
        predictions = json.load(f_in)

    for arg_id, kps in predictions.items():
        if not kps:
            # No candidates for this argument: leave it out.
            continue
        best_kp_id, best_score = max(kps.items(), key=lambda item: item[1])
        arg_ids.append(arg_id)
        key_point_ids.append(best_kp_id)
        scores.append(best_score)

    return pd.DataFrame({"arg_id": arg_ids, "key_point_id": key_point_ids, "score": scores})

In [5]:
# Directory that holds the gold arguments_/key_points_/labels_<subset>.csv files.
gold_data_dir = "/workspace/project_git/keypoint-analysis-sharedtask/data"
# Prediction JSON file that is evaluated and later manipulated.
predictions_file = "/workspace/ceph_data/data-in-progress/data-research/arguana/arg-generation/keypoint-analysis-sharedtask/siamese-data/preds/bert-base-uncased-kp_as_anchor-2021-05-24_15-58-35-our_valid-preds.json"
# Subset suffix used to pick the gold CSV files.
subset_name = "our_valid"

In [40]:
# Load the raw prediction scores as a table: one column per argument id and
# one row per key point id, so whole arguments can be removed by dropping columns.
manipulate_predictions_df = pd.read_json(predictions_file)
manipulate_predictions_df.head()

Unnamed: 0,arg_4_0,arg_4_1,arg_4_3,arg_4_4,arg_4_5,arg_4_7,arg_4_9,arg_4_10,arg_4_11,arg_4_12,...,arg_15_224,arg_15_225,arg_15_132,arg_15_156,arg_15_197,arg_15_214,arg_15_215,arg_15_222,arg_15_120,arg_15_159
kp_4_0,-0.113464,0.05094,-0.027673,0.126287,0.339497,0.210822,0.342209,-0.199425,0.454778,0.408215,...,,,,,,,,,,
kp_4_1,0.833244,0.116367,-0.400722,0.130977,-0.081358,-0.195857,0.158396,0.970414,-0.230579,-0.154799,...,,,,,,,,,,
kp_4_2,0.000899,0.707487,0.422451,0.708077,-0.067458,0.657558,0.000632,-0.049008,0.277821,0.409436,...,,,,,,,,,,
kp_4_3,-0.082468,0.079495,0.775373,0.159886,0.238906,0.20804,-0.002771,-0.338597,0.226339,0.001494,...,,,,,,,,,,
kp_4_4,0.036106,0.001145,0.176084,0.181782,0.618537,0.414745,0.389428,0.012436,0.66542,0.391511,...,,,,,,,,,,


In [41]:
# IDs of the arguments that survive the truncation inside get_ap. The list is
# filled as a side effect of every get_ap call; because the evaluation calls
# get_ap once per label column, IDs can appear more than once.
args_in_top25 = []

def get_ap(df, label_column, top_percentile=0.5):
    """Average precision of the highest-scored rows of one group.

    The rows are ranked by "score" (descending) and cut down in two steps:
    first to the fraction `top_percentile`, then to half of what is left.
    With the default `top_percentile=0.5` this keeps the top 25% of the rows.
    The ids of the kept arguments are appended to the module-level list
    `args_in_top25`.

    Unlike the previous implementation, the frame is sorted only once, and at
    least one row is kept for a non-empty group, so small groups no longer
    hand an empty sample to `average_precision_score`.

    Args:
        df: rows of a single group; needs the columns "score", "arg_id" and
            `label_column`.
        label_column: name of the column with the binary gold labels.
        top_percentile: fraction (0..1) of rows kept by the first cut.

    Returns:
        The value of `average_precision_score` on the kept rows.
    """
    global args_in_top25

    # Second cut applied on top of `top_percentile` (0.5 * 0.5 -> top 25%).
    second_cut = 0.5

    ranked = df.sort_values('score', ascending=False)

    # Same two-step integer truncation as before, computed on the counts only.
    n_keep = int(int(len(ranked) * top_percentile) * second_cut)
    # Never evaluate on an empty slice when the group itself has rows.
    n_keep = max(1, n_keep)
    top_df = ranked.head(n_keep)

    args_in_top25 += top_df['arg_id'].tolist()

    return average_precision_score(y_true=top_df[label_column].tolist(), y_score=top_df["score"].tolist())

In [42]:
# Evaluate with the redefined get_ap. Besides printing the scores, this run
# fills args_in_top25 with the ids of the arguments that get_ap kept.
arg_df, kp_df, labels_df = load_kpm_data(gold_data_dir, subset=subset_name)
merged_df = get_predictions(predictions_file, labels_df, arg_df)
evaluate_predictions(merged_df)

mAP strict= 0.9126890639942158 ; mAP relaxed = 0.966407574792025


In [43]:
# Preview the score table once more before any argument columns are removed.
manipulate_predictions_df.head()

Unnamed: 0,arg_4_0,arg_4_1,arg_4_3,arg_4_4,arg_4_5,arg_4_7,arg_4_9,arg_4_10,arg_4_11,arg_4_12,...,arg_15_224,arg_15_225,arg_15_132,arg_15_156,arg_15_197,arg_15_214,arg_15_215,arg_15_222,arg_15_120,arg_15_159
kp_4_0,-0.113464,0.05094,-0.027673,0.126287,0.339497,0.210822,0.342209,-0.199425,0.454778,0.408215,...,,,,,,,,,,
kp_4_1,0.833244,0.116367,-0.400722,0.130977,-0.081358,-0.195857,0.158396,0.970414,-0.230579,-0.154799,...,,,,,,,,,,
kp_4_2,0.000899,0.707487,0.422451,0.708077,-0.067458,0.657558,0.000632,-0.049008,0.277821,0.409436,...,,,,,,,,,,
kp_4_3,-0.082468,0.079495,0.775373,0.159886,0.238906,0.20804,-0.002771,-0.338597,0.226339,0.001494,...,,,,,,,,,,
kp_4_4,0.036106,0.001145,0.176084,0.181782,0.618537,0.414745,0.389428,0.012436,0.66542,0.391511,...,,,,,,,,,,


In [45]:
# Remove every argument column whose id was not collected in args_in_top25.
kept_arg_ids = set(args_in_top25)
columns_to_remove = [
    column for column in manipulate_predictions_df.columns
    if column not in kept_arg_ids
]
manipulate_predictions_df.drop(columns=columns_to_remove, inplace=True)

In [46]:
# Replace the missing scores (NaN cells) with -1 before the table is saved.
# NOTE(review): presumably -1 is meant to lose against every real score when
# load_predictions picks the maximum - confirm that no real score is <= -1.
manipulate_predictions_df = manipulate_predictions_df.fillna(-1)

In [47]:
# Write the reduced table as the new prediction file. With to_json's default
# orientation for DataFrames the result is {column: {index: value}}, i.e.
# argument id -> key point id -> score.
manipulate_predictions_df.to_json('/workspace/trash/manipulated_scores.json')

In [48]:
# Run the unchanged track_1_kp_matching.py script again, this time on the
# manipulated prediction file (same gold data directory and subset name).
! python3 /workspace/project_git/keypoint-analysis-sharedtask/src-py/track_1_kp_matching.py /workspace/project_git/keypoint-analysis-sharedtask/data /workspace/trash/manipulated_scores.json our_valid

mAP strict= 0.9126890639942158 ; mAP relaxed = 0.966407574792025


## Profit!!