In [1]:
import numpy as np
import polars as pl
import sklearn.metrics

# Load data

In [2]:
# Validation examples. The string "class" label is recoded to an integer
# (is_event -> 1, not_event -> 0) so it can be used directly as a binary target.
val_df = pl.read_csv(
    "../train/results/bestepoch-bydrug-PMB-valid_14-ALL-125-all_222_24_25_1e-06_256_32-full.csv"
).with_columns(
    # Native replacement instead of a per-row Python callback (map_elements).
    # default=None keeps the earlier behaviour of producing null for any label
    # outside the mapping rather than raising.
    pl.col("class").replace_strict(
        {"is_event": 1, "not_event": 0}, default=None, return_dtype=pl.Int64
    )
)
print(val_df.shape)
val_df.head(2)

(23062, 10)


section,drug,tac,meddra_id,pt_meddra_id,source_method,class,pt_meddra_term,found_term,string
str,str,str,i64,i64,str,i64,str,str,str
"""AR""","""AMPYRA""","""train""",10013573,10013573,"""exact""",1,"""Dizziness""","""dizziness""","""dizziness exact at a rate grea…"
"""AR""","""AMPYRA""","""train""",10013573,10013573,"""exact""",1,"""Dizziness""","""dizziness""","""dizziness exact compared to pl…"


In [3]:
# Model outputs are stored without a header row, so polars assigns the
# default names column_1/column_2; give them explicit names here.
preds_path = "../train/results/bestepoch-bydrug-PMB-valid_14-ALL-125-all_222_24_25_1e-06_256_32.csv"
preds_df = pl.read_csv(preds_path, has_header=False).rename(
    {"column_1": "pred0", "column_2": "pred1"}
)
print(preds_df.shape)
preds_df.head(2)

(23062, 2)


pred0,pred1
f64,f64
0.0,5.881886
0.0,5.932092


In [4]:
# The prediction frame carries no key columns, so rows are matched to the
# validation frame purely by position. A horizontal concat by default pads the
# shorter frame with nulls instead of failing, so verify the row counts first.
assert val_df.height == preds_df.height, (
    f"row count mismatch: {val_df.height} validation rows "
    f"vs {preds_df.height} prediction rows"
)
full_df = pl.concat([val_df, preds_df], how="horizontal")
print(full_df.shape)
full_df.head(2)

(23062, 12)


section,drug,tac,meddra_id,pt_meddra_id,source_method,class,pt_meddra_term,found_term,string,pred0,pred1
str,str,str,i64,i64,str,i64,str,str,str,f64,f64
"""AR""","""AMPYRA""","""train""",10013573,10013573,"""exact""",1,"""Dizziness""","""dizziness""","""dizziness exact at a rate grea…",0.0,5.881886
"""AR""","""AMPYRA""","""train""",10013573,10013573,"""exact""",1,"""Dizziness""","""dizziness""","""dizziness exact compared to pl…",0.0,5.932092


# Evaluate and find the best threshold

In [5]:
def prdata(labels, preds, f1_threshold=None):
    """Summarise a precision-recall curve and one F1 operating point.

    Parameters
    ----------
    labels : array-like
        Ground-truth binary labels, forwarded to
        ``sklearn.metrics.precision_recall_curve``.
    preds : array-like
        Prediction scores, forwarded alongside ``labels``.
    f1_threshold : float, optional
        When given, F1/precision/recall are reported for this decision
        threshold. When omitted, the threshold that maximises F1 is used.

    Returns
    -------
    dict
        ``precision`` and ``recall`` (the full curve arrays), ``max_f1``,
        ``max_f1_threshold``, ``max_f1_precision`` and ``max_f1_recall``
        (the selected operating point), and ``pr_auc`` (the value of
        ``sklearn.metrics.auc(recall, precision)``).
    """
    precision, recall, thresholds = sklearn.metrics.precision_recall_curve(labels, preds)

    # F1 = 2PR / (P + R); left at 0 wherever P + R == 0 to avoid dividing 0 by 0.
    numerator = 2 * recall * precision
    denom = recall + precision
    f1_scores = np.divide(
        numerator, denom, out=np.zeros_like(denom), where=(denom != 0)
    )

    if f1_threshold is not None:
        # Per sklearn's documented contract, thresholds are increasing and
        # precision[i]/recall[i] describe predictions with score >= thresholds[i].
        # The operating point induced by a caller-supplied cut-off is therefore
        # the first threshold that is >= the cut-off. Picking the *nearest*
        # threshold could select one below the cut-off and count samples the
        # cut-off would reject. A cut-off above every score lands on the final
        # curve point (the one with no associated threshold).
        point = int(np.searchsorted(thresholds, f1_threshold, side="left"))
        max_f1_thresh = f1_threshold
    else:
        point = int(np.argmax(f1_scores))
        max_f1_thresh = thresholds[point]

    return {
        "precision": precision,
        "recall": recall,
        "max_f1": f1_scores[point],
        "max_f1_threshold": max_f1_thresh,
        "max_f1_precision": precision[point],
        "max_f1_recall": recall[point],
        "pr_auc": sklearn.metrics.auc(recall, precision),
    }


In [6]:
# Score the validation rows on pred1 and let prdata choose the threshold
# that maximises F1 (no f1_threshold supplied).
performance_data = prdata(
    labels=full_df["class"].to_numpy(),
    preds=full_df["pred1"].to_numpy(),
)

performance_data

{'precision': array([0.1439164 , 0.47467811, 0.47474603, ..., 1.        , 1.        ,
        1.        ], shape=(6989,)),
 'recall': array([1.00000000e+00, 9.99698704e-01, 9.99698704e-01, ...,
        6.02591142e-04, 3.01295571e-04, 0.00000000e+00], shape=(6989,)),
 'max_f1': np.float64(0.9174005466140298),
 'max_f1_threshold': np.float64(3.2583718299865723),
 'max_f1_precision': np.float64(0.9247015610651974),
 'max_f1_recall': np.float64(0.9102139198553781),
 'pr_auc': np.float64(0.9763298280707404)}

In [7]:
# Show just the F1-maximising threshold chosen by prdata above.
performance_data["max_f1_threshold"]

np.float64(3.2583718299865723)