In [1]:
import os
from pathlib import Path
import logging

import pandas as pd
import numpy as np

from capstone.config import CapstoneConfig
from capstone.data_access import DataClass
from capstone.features import Features
from capstone.evaluation import CustomEvaluation
from capstone.utils.constants import (
    TARGET,
    ORIGINAL_TEXT,
    SPLIT,
    DEVELOP,
    TEST,
    PREDICTION
)

# Absolute path of the directory one level above the current working directory.
PARENT_PATH = Path.cwd().parent.absolute()

# Log line layout: timestamp - logger name - level - message.
FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)

Load dataset

In [2]:
# Create the project configuration and anchor it at PARENT_PATH (the directory
# above the notebook's working directory).
config = CapstoneConfig()
config.CURRENT_PATH = PARENT_PATH

# Build the dataset. `data` is kept as a variable because a later cell reads
# `data.reports_path` to locate the saved model predictions.
data = DataClass(config)
df = data.build()
# Preview the first three rows.
df.head(3)

Unnamed: 0,Text,Standard mapping?
0,General Quality System Requirements […] Manage...,"[quality management, quality planning]"
1,General Quality System Requirements […] Manage...,[quality management]
2,General Quality System Requirements […] Manage...,[quality management]


In [3]:
# Build the feature table from the loaded dataset. The `features` object is
# kept because later cells read `features.mlb` (its `classes_` attribute and
# its `transform` method).
features = Features(config)
df_features = features.build(df)

2022-12-13 06:01:00,244 - capstone.utils.utils - INFO - func:clean took: 6.14 sec
2022-12-13 06:01:01,338 - capstone.utils.utils - INFO - func:fit took: 7.23 sec
2022-12-13 06:01:04,887 - capstone.utils.utils - INFO - func:clean took: 3.55 sec
2022-12-13 06:01:05,147 - capstone.utils.utils - INFO - func:transform took: 3.81 sec
2022-12-13 06:01:05,148 - capstone.utils.utils - INFO - func:fit_transform took: 11.04 sec
2022-12-13 06:01:05,576 - capstone.utils.utils - INFO - func:clean took: 0.43 sec
2022-12-13 06:01:05,608 - capstone.utils.utils - INFO - func:transform took: 0.46 sec
2022-12-13 06:01:05,628 - capstone.utils.utils - INFO - func:build took: 11.54 sec


Extract the test set (the development set is also extracted, but only to report per-label development sample counts in the label-wise evaluation)

In [4]:
# Label columns, in the order stored on the fitted binarizer.
label_columns = features.mlb.classes_

# Development rows, re-indexed from 0, and their label matrix.
dev_mask = df_features[SPLIT].isin([DEVELOP])
dev = df_features[dev_mask].reset_index(drop=True)
Y_dev = dev[label_columns].copy()

# Test rows, re-indexed from 0, and their label matrix.
test_mask = df_features[SPLIT].isin([TEST])
test = df_features[test_mask].reset_index(drop=True)
Y_test = test[label_columns].copy()

Load predictions from models to be ensembled

In [5]:
# Saved evaluation outputs of the three models, one pickle per model.
# NOTE(review): unpickling executes arbitrary code; only load files from a
# trusted reports directory.
reports_dir = data.reports_path

# Load each file and replace its index with a fresh 0..n-1 range.
bert = pd.read_pickle(os.path.join(reports_dir, "bert-evaluation.pkl")).reset_index(drop=True)
ada = pd.read_pickle(os.path.join(reports_dir, "ada-evaluation.pkl")).reset_index(drop=True)
curie = pd.read_pickle(os.path.join(reports_dir, "curie-evaluation.pkl")).reset_index(drop=True)

Ensemble based on majority vote

In [6]:
# Custom evaluation class
# NOTE(review): the name `eval` shadows Python's builtin `eval`. It is kept
# because the evaluation cells further down call `eval.compute_*`.
eval = CustomEvaluation()

# `Series + Series` aligns on index labels, not on position. If the three
# prediction frames do not share an identical index, the sum silently contains
# NaN rows and the ensemble is wrong, so fail loudly instead.
if not (bert.index.equals(ada.index) and bert.index.equals(curie.index)):
    raise ValueError(
        "Prediction frames are not row-aligned: "
        f"bert={len(bert)}, ada={len(ada)}, curie={len(curie)} rows"
    )

# Ensemble
# Pool the three models' predictions row by row.
# NOTE(review): presumably each PREDICTION entry is a list of labels, so `+`
# concatenates the three lists - confirm against how the pickles were written.
pooled_votes = bert[PREDICTION] + ada[PREDICTION] + curie[PREDICTION]
# Reduce each row's pooled votes with the project's majority-vote helper,
# called with minimum_votes=2.
combined = pooled_votes.apply(eval.majority_vote, minimum_votes=2)
# Encode the voted labels with the binarizer held by `features`.
ensembled_predictions = features.mlb.transform(combined)

Evaluate on test dataset

In [7]:
# Score the ensembled predictions against the test-set label matrix.
# NOTE(review): assumes the rows of `ensembled_predictions` are in the same
# order as the rows of `Y_test` - confirm against how the prediction pickles
# were produced.
eval.compute_sample_wise_metrics(y_true=Y_test.to_numpy(), y_pred=ensembled_predictions)

Hamming Loss    0.012109
Accuracy        0.752604
Precision       0.880469
Recall          0.839332
F1 Score        0.845102
F2 Score        0.838532
dtype: float64

Label-wise evaluation

In [8]:
# Test-set label matrix as a plain array.
y_test_matrix = Y_test.to_numpy()
# Column-wise sum of the development label matrix, one value per label column.
dev_label_counts = Y_dev.to_numpy().sum(axis=0).tolist()

# Per-label metrics for the ensembled predictions.
eval.compute_label_wise_metrics(
    y_true=y_test_matrix,
    y_pred=ensembled_predictions,
    labels=features.mlb.classes_,
    dev_samples=dev_label_counts,
)

Unnamed: 0,Precision,Recall,F1 Score,F2 Score,Development Samples,Test Samples
audit inspection,0.8,0.888889,0.842105,0.869565,86,9
calibration,0.833333,0.833333,0.833333,0.833333,78,6
capa,0.833333,0.75,0.789474,0.765306,202,20
change controls,0.875,0.777778,0.823529,0.795455,71,9
clinical research,0.965517,0.965517,0.965517,0.965517,269,29
controled substances,1.0,0.833333,0.909091,0.862069,40,6
csv,0.0,0.0,0.0,0.0,37,3
data management,0.764706,1.0,0.866667,0.942029,150,13
design controls,1.0,0.909091,0.952381,0.925926,132,11
documentation management,0.818182,0.692308,0.75,0.714286,117,13
