Improve classification metric function (#240)
* update

* Update installation.rst (#244)

* update Scaler transform function (fit_transform => transform)

* Update index.rst

Add mdl library maintenance announcement

* Update README.md

Update information about mdl library maintenance.

* update

* fix test case

Co-authored-by: Stephen Wu <39758581+stewu5@users.noreply.github.com>
TsumiNa and stewu5 committed Oct 3, 2021
1 parent 5b23efe commit ad04f74
Showing 2 changed files with 74 additions and 29 deletions.
12 changes: 7 additions & 5 deletions tests/models/test_extension.py
@@ -315,8 +315,8 @@ def predict(self, x_, y_):
     val.step_forward(trainer=_Trainer(), step_info=step_info)  # noqa
     assert step_info['val_mae'] == regression_metrics(y, x)['mae']
     assert set(step_info.keys()) == {
-        'i_epoch', 'val_mae', 'val_mse', 'val_rmse', 'val_r2', 'val_pearsonr', 'val_spearmanr',
-        'val_p_value', 'val_max_ae', 'train_loss'
+        'i_epoch', 'val_mae', 'val_mse', 'val_rmse', 'val_r2', 'val_pearsonr', 'val_spearmanr', 'val_p_value',
+        'val_max_ae', 'train_loss'
     }


@@ -345,10 +345,12 @@ def predict(self, x_, y_):  # noqa

     step_info = OrderedDict(train_loss=0, i_epoch=1)
     val.step_forward(trainer=_Trainer(), step_info=step_info)  # noqa
-    assert step_info['val_f1'] == classification_metrics(y, x)['f1']
+    print(step_info)
+    assert step_info['val_accuracy'] == classification_metrics(y, x)['accuracy']
     assert set(step_info.keys()) == {
-        'i_epoch', 'val_accuracy', 'val_f1', 'val_precision', 'val_recall', 'val_macro_f1',
-        'val_macro_precision', 'val_macro_recall', 'train_loss'
+        'i_epoch', 'val_accuracy', 'val_f1', 'val_precision', 'val_recall', 'val_macro_f1', 'val_macro_precision',
+        'val_macro_recall', 'val_weighted_f1', 'val_weighted_precision', 'val_weighted_recall', 'val_micro_f1',
+        'val_micro_precision', 'val_micro_recall', 'train_loss'
     }


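The reworked classification test pins down the new contract: the assertions imply that the validation extension copies every score returned by classification_metrics into step_info under a val_ prefix, now including the weighted and micro averages. Below is a minimal sketch of that behaviour outside the trainer machinery; the toy arrays and the prefixing loop are illustrative stand-ins, not part of the test, and the import path follows the module shown in this commit.

from collections import OrderedDict

import numpy as np

from xenonpy.model.utils.metrics import classification_metrics

y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 0, 0, 1])

# Mimic what the assertions describe: each metric key is prefixed with "val_".
step_info = OrderedDict(train_loss=0, i_epoch=1)
step_info.update({f'val_{name}': value for name, value in classification_metrics(y_true, y_pred).items()})

# step_info now holds i_epoch, train_loss, val_accuracy, val_f1, val_precision, val_recall,
# plus the val_weighted_*, val_micro_*, and val_macro_* averages checked by the test above.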
91 changes: 67 additions & 24 deletions xenonpy/model/utils/metrics.py
@@ -3,7 +3,7 @@
 # license that can be found in the LICENSE file.

 from collections import OrderedDict
-from typing import Union
+from typing import Union, List, Tuple

 import numpy as np
 import pandas as pd
@@ -14,8 +14,7 @@
 __all__ = ['regression_metrics', 'classification_metrics']


-def regression_metrics(y_true: Union[np.ndarray, pd.Series],
-                       y_pred: Union[np.ndarray, pd.Series]) -> OrderedDict:
+def regression_metrics(y_true: Union[np.ndarray, pd.Series], y_pred: Union[np.ndarray, pd.Series]) -> OrderedDict:
     """
     Calculate most common regression scores.
     See Also: https://scikit-learn.org/stable/modules/model_evaluation.html
@@ -63,26 +62,56 @@ def regression_metrics(y_true: Union[np.ndarray, pd.Series],


 def classification_metrics(
-        y_true: Union[np.ndarray, pd.DataFrame, pd.Series],
-        y_pred: Union[np.ndarray, pd.Series]) -> OrderedDict:
+    y_true: Union[np.ndarray, pd.DataFrame, pd.Series],
+    y_pred: Union[np.ndarray, pd.Series],
+    *,
+    average: Union[None, List[str], Tuple[str]] = ('weighted', 'micro', 'macro'),
+    labels=None,
+) -> dict:
     """
     Calculate most common classification scores.
-    See Also: https://scikit-learn.org/stable/modules/model_evaluation.html
+    See also: https://scikit-learn.org/stable/modules/model_evaluation.html

     Parameters
     ----------
     y_true
         True results.
     y_pred
         Predicted results.
+    average
+        This parameter is required for multiclass/multilabel targets. If None, the scores for each class are returned.
+        Otherwise, this determines the type of averaging performed on the data:
+        binary:
+            Only report results for the class specified by pos_label. This is applicable only if targets (y_{true,pred})
+            are binary.
+        micro:
+            Calculate metrics globally by counting the total true positives, false negatives and false positives.
+        macro:
+            Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into
+            account.
+        weighted:
+            Calculate metrics for each label, and find their average weighted by support (the number of true instances
+            for each label). This alters ``macro`` to account for label imbalance; it can result in an F-score that is
+            not between precision and recall.
+    labels
+        The set of labels to include when average != ``binary``, and their order if average is None.
+        Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority
+        negative class, while labels not present in the data will result in 0 components in a macro average.
+        For multilabel targets, labels are column indices.
+        By default, all labels in y_true and y_pred are used in sorted order.

     Returns
     -------
     OrderedDict
         An :class:`collections.OrderedDict` contains classification scores.
-        These scores will be calculated: ``accuracy``, ``f1``, ``precision``, ``recall``,
-        ``macro_f1``, ``macro_precision``, and ``macro_recall``
+        These scores will always contain ``accuracy``, ``f1``, ``precision`` and ``recall``.
+        For multilabel targets, based on the selection of the ``average`` parameter, the **weighted**, **micro**,
+        and **macro** scores of ``f1``, ``precision``, and ``recall`` will be calculated.
     """
+    if average is not None and len(average) == 0:
+        raise ValueError('need average')
+
     if len(y_true.shape) != 1:
         y_true = np.argmax(y_true, 1)
     if len(y_pred.shape) != 1:
@@ -92,19 +121,33 @@ def classification_metrics(
     y_true = y_true[mask]
     y_pred = y_pred[mask]

-    accuracy = accuracy_score(y_true, y_pred)
-    f1 = f1_score(y_true, y_pred, average='weighted')
-    precision = precision_score(y_true, y_pred, average='weighted')
-    recall = recall_score(y_true, y_pred, average='weighted')
-    macro_f1 = f1_score(y_true, y_pred, average='macro')
-    macro_precision = precision_score(y_true, y_pred, average='macro')
-    macro_recall = recall_score(y_true, y_pred, average='macro')
-    return OrderedDict(
-        accuracy=accuracy,
-        f1=f1,
-        precision=precision,
-        recall=recall,
-        macro_f1=macro_f1,
-        macro_precision=macro_precision,
-        macro_recall=macro_recall,
-    )
+    ret = dict(accuracy=accuracy_score(y_true, y_pred))
+
+    ret.update(
+        f1=f1_score(y_true, y_pred, average=None, labels=labels),
+        precision=precision_score(y_true, y_pred, average=None, labels=labels),
+        recall=recall_score(y_true, y_pred, average=None, labels=labels),
+    )
+
+    # Guard for ``average=None``: only the per-class scores above were requested,
+    # and the membership tests below would fail on None.
+    if average is None:
+        return ret
+
+    if 'weighted' in average:
+        ret.update(
+            weighted_f1=f1_score(y_true, y_pred, average='weighted', labels=labels),
+            weighted_precision=precision_score(y_true, y_pred, average='weighted', labels=labels),
+            weighted_recall=recall_score(y_true, y_pred, average='weighted', labels=labels),
+        )
+
+    if 'micro' in average:
+        ret.update(
+            micro_f1=f1_score(y_true, y_pred, average='micro', labels=labels),
+            micro_precision=precision_score(y_true, y_pred, average='micro', labels=labels),
+            micro_recall=recall_score(y_true, y_pred, average='micro', labels=labels),
+        )
+
+    if 'macro' in average:
+        ret.update(
+            macro_f1=f1_score(y_true, y_pred, average='macro', labels=labels),
+            macro_precision=precision_score(y_true, y_pred, average='macro', labels=labels),
+            macro_recall=recall_score(y_true, y_pred, average='macro', labels=labels),
+        )
+
+    return ret
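
For reference, a short usage sketch of the updated function. The arrays and label values below are made up for illustration, and the import path is taken from the file path shown in this diff.

import numpy as np

from xenonpy.model.utils.metrics import classification_metrics

y_true = np.array([0, 2, 1, 2, 0, 1])
y_pred = np.array([0, 1, 1, 2, 0, 2])

# Default averaging ('weighted', 'micro', 'macro'): per-class f1/precision/recall
# plus the three averaged variants of each.
scores = classification_metrics(y_true, y_pred)
print(sorted(scores))  # accuracy, f1, macro_*, micro_*, precision, recall, weighted_*

# Restrict the averaged scores to macro only, over an explicit label set.
macro_only = classification_metrics(y_true, y_pred, average=['macro'], labels=[0, 1, 2])
print(macro_only['macro_f1'])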
