
Commit b7815d8

add: random tie break for multilabel classification
1 parent bb3b579 commit b7815d8

File tree

6 files changed (+100, -49 lines)

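The change is the same across every query strategy in modAL/multilabel.py: each function gains an opt-in random_tie_break flag (default False, so existing behaviour is unchanged), and the example tests pin the random forest size. A minimal usage sketch for the new flag follows; the dataset, the OneVsRestClassifier(SVC(...)) estimator and the choice of avg_confidence are illustrative assumptions, not part of this commit:

# Illustrative sketch only; the estimator and data below are assumed, not taken from this commit.
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

from modAL.models import ActiveLearner
from modAL.multilabel import avg_confidence

# toy multilabel setup: 20 seed samples, 3 labels, both classes present per label
X_initial = np.random.rand(20, 5)
y_initial = np.vstack([np.zeros((10, 3), dtype=int), np.ones((10, 3), dtype=int)])
X_pool = np.random.rand(100, 5)

learner = ActiveLearner(
    estimator=OneVsRestClassifier(SVC(probability=True)),
    query_strategy=avg_confidence,
    X_training=X_initial, y_training=y_initial
)

# With random_tie_break=True the utility scores are shuffled before the argmax,
# so ties between equally scored samples are broken at random.
query_idx, query_instance = learner.query(X_pool, n_instances=3, random_tie_break=True)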

modAL/multilabel.py

Lines changed: 87 additions & 44 deletions
@@ -5,14 +5,13 @@
 
 from modAL.models import ActiveLearner
 from modAL.utils.data import modALinput
-from modAL.utils.selection import multi_argmax
+from modAL.utils.selection import multi_argmax, shuffled_argmax
 from typing import Tuple, Optional
 from itertools import combinations
 
 
 def _SVM_loss(multiclass_classifier: ActiveLearner,
-              X: modALinput,
-              most_certain_classes: Optional[int] = None) -> np.ndarray:
+              X: modALinput, most_certain_classes: Optional[int] = None) -> np.ndarray:
     """
     Utility function for max_loss and mean_max_loss strategies.
 
@@ -43,8 +42,8 @@ def _SVM_loss(multiclass_classifier: ActiveLearner,
     return cls_loss
 
 
-def SVM_binary_minimum(classifier: ActiveLearner,
-                       X_pool: modALinput) -> Tuple[np.ndarray, modALinput]:
+def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput,
+                       random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
     """
     SVM binary minimum multilabel active learning strategy. For details see the paper
         Klaus Brinker, On Active Learning in Multi-label Classification
@@ -53,23 +52,30 @@ def SVM_binary_minimum(classifier: ActiveLearner,
     Args:
         classifier: The multilabel classifier for which the labels are to be queried. Must be an SVM model
             such as the ones from sklearn.svm.
-        X: The pool of samples to query from.
+        X_pool: The pool of samples to query from.
+        random_tie_break: If True, shuffles utility scores to randomize the order. This
+            can be used to break the tie when the highest utility score is not unique.
 
     Returns:
-        The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled.
+        The index of the instance from X_pool chosen to be labelled;
+        the instance from X_pool chosen to be labelled.
     """
 
     decision_function = np.array([svm.decision_function(X_pool)
                                   for svm in classifier.estimator.estimators_]).T
 
     min_abs_dist = np.min(np.abs(decision_function), axis=1)
-    query_idx = np.argmin(min_abs_dist)
+
+    if not random_tie_break:
+        query_idx = np.argmin(min_abs_dist)
+    else:
+        query_idx = shuffled_argmax(min_abs_dist)
+
     return query_idx, X_pool[query_idx]
 
 
-def max_loss(classifier: OneVsRestClassifier,
-             X_pool: modALinput,
-             n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+def max_loss(classifier: OneVsRestClassifier, X_pool: modALinput,
+             n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
 
     """
     Max Loss query strategy for SVM multilabel classification.
@@ -82,24 +88,30 @@ def max_loss(classifier: OneVsRestClassifier,
         classifier: The multilabel classifier for which the labels are to be queried. Should be an SVM model
             such as the ones from sklearn.svm. Although the function will execute for other models as well,
             the mathematical calculations in Li et al. work only for SVM-s.
-        X: The pool of samples to query from.
+        X_pool: The pool of samples to query from.
+        random_tie_break: If True, shuffles utility scores to randomize the order. This
+            can be used to break the tie when the highest utility score is not unique.
 
     Returns:
-        The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled.
+        The index of the instance from X_pool chosen to be labelled;
+        the instance from X_pool chosen to be labelled.
     """
 
     assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)'
 
     most_certain_classes = classifier.predict_proba(X_pool).argmax(axis=1)
     loss = _SVM_loss(classifier, X_pool, most_certain_classes=most_certain_classes)
 
-    query_idx = multi_argmax(loss, n_instances)
+    if not random_tie_break:
+        query_idx = multi_argmax(loss, n_instances)
+    else:
+        query_idx = shuffled_argmax(loss, n_instances)
+
     return query_idx, X_pool[query_idx]
 
 
-def mean_max_loss(classifier: OneVsRestClassifier,
-                  X_pool: modALinput,
-                  n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+def mean_max_loss(classifier: OneVsRestClassifier, X_pool: modALinput,
+                  n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
     """
     Mean Max Loss query strategy for SVM multilabel classification.
 
@@ -111,22 +123,28 @@ def mean_max_loss(classifier: OneVsRestClassifier,
         classifier: The multilabel classifier for which the labels are to be queried. Should be an SVM model
             such as the ones from sklearn.svm. Although the function will execute for other models as well,
             the mathematical calculations in Li et al. work only for SVM-s.
-        X: The pool of samples to query from.
+        X_pool: The pool of samples to query from.
+        random_tie_break: If True, shuffles utility scores to randomize the order. This
+            can be used to break the tie when the highest utility score is not unique.
 
     Returns:
-        The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled.
+        The index of the instance from X_pool chosen to be labelled;
+        the instance from X_pool chosen to be labelled.
     """
 
     assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)'
     loss = _SVM_loss(classifier, X_pool)
 
-    query_idx = multi_argmax(loss, n_instances)
+    if not random_tie_break:
+        query_idx = multi_argmax(loss, n_instances)
+    else:
+        query_idx = shuffled_argmax(loss, n_instances)
+
     return query_idx, X_pool[query_idx]
 
 
-def min_confidence(classifier: OneVsRestClassifier,
-                   X_pool: modALinput,
-                   n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput,
+                   n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
     """
     MinConfidence query strategy for multilabel classification.
 
@@ -136,22 +154,28 @@ def min_confidence(classifier: OneVsRestClassifier,
 
     Args:
         classifier: The multilabel classifier for which the labels are to be queried.
-        X: The pool of samples to query from.
+        X_pool: The pool of samples to query from.
+        random_tie_break: If True, shuffles utility scores to randomize the order. This
+            can be used to break the tie when the highest utility score is not unique.
 
     Returns:
-        The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled.
+        The index of the instance from X_pool chosen to be labelled;
+        the instance from X_pool chosen to be labelled.
     """
 
     classwise_confidence = classifier.predict_proba(X_pool)
     classwise_min = np.min(classwise_confidence, axis=1)
-    query_idx = multi_argmax((-1)*classwise_min, n_instances)
+
+    if not random_tie_break:
+        query_idx = multi_argmax(-classwise_min, n_instances)
+    else:
+        query_idx = shuffled_argmax(-classwise_min, n_instances)
 
     return query_idx, X_pool[query_idx]
 
 
-def avg_confidence(classifier: OneVsRestClassifier,
-                   X_pool: modALinput,
-                   n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput,
+                   n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
     """
     AvgConfidence query strategy for multilabel classification.
 
@@ -161,22 +185,28 @@ def avg_confidence(classifier: OneVsRestClassifier,
 
     Args:
         classifier: The multilabel classifier for which the labels are to be queried.
-        X: The pool of samples to query from.
+        X_pool: The pool of samples to query from.
+        random_tie_break: If True, shuffles utility scores to randomize the order. This
+            can be used to break the tie when the highest utility score is not unique.
 
     Returns:
-        The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled.
+        The index of the instance from X_pool chosen to be labelled;
+        the instance from X_pool chosen to be labelled.
     """
 
     classwise_confidence = classifier.predict_proba(X_pool)
     classwise_mean = np.mean(classwise_confidence, axis=1)
-    query_idx = multi_argmax(classwise_mean, n_instances)
+
+    if not random_tie_break:
+        query_idx = multi_argmax(classwise_mean, n_instances)
+    else:
+        query_idx = shuffled_argmax(classwise_mean, n_instances)
 
     return query_idx, X_pool[query_idx]
 
 
-def max_score(classifier: OneVsRestClassifier,
-              X_pool: modALinput,
-              n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+def max_score(classifier: OneVsRestClassifier, X_pool: modALinput,
+              n_instances: int = 1, random_tie_break: bool = 1) -> Tuple[np.ndarray, modALinput]:
     """
     MaxScore query strategy for multilabel classification.
 
@@ -186,24 +216,30 @@ def max_score(classifier: OneVsRestClassifier,
 
     Args:
         classifier: The multilabel classifier for which the labels are to be queried.
-        X: The pool of samples to query from.
+        X_pool: The pool of samples to query from.
+        random_tie_break: If True, shuffles utility scores to randomize the order. This
+            can be used to break the tie when the highest utility score is not unique.
 
     Returns:
-        The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled.
+        The index of the instance from X_pool chosen to be labelled;
+        the instance from X_pool chosen to be labelled.
     """
 
     classwise_confidence = classifier.predict_proba(X_pool)
     classwise_predictions = classifier.predict(X_pool)
     classwise_scores = classwise_confidence*(classwise_predictions - 1/2)
     classwise_max = np.max(classwise_scores, axis=1)
-    query_idx = multi_argmax(classwise_max, n_instances)
+
+    if not random_tie_break:
+        query_idx = multi_argmax(classwise_max, n_instances)
+    else:
+        query_idx = shuffled_argmax(classwise_max, n_instances)
 
     return query_idx, X_pool[query_idx]
 
 
-def avg_score(classifier: OneVsRestClassifier,
-              X_pool: modALinput,
-              n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+def avg_score(classifier: OneVsRestClassifier, X_pool: modALinput,
+              n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
     """
     AvgScore query strategy for multilabel classification.
 
@@ -213,16 +249,23 @@ def avg_score(classifier: OneVsRestClassifier,
 
     Args:
         classifier: The multilabel classifier for which the labels are to be queried.
-        X: The pool of samples to query from.
+        X_pool: The pool of samples to query from.
+        random_tie_break: If True, shuffles utility scores to randomize the order. This
+            can be used to break the tie when the highest utility score is not unique.
 
     Returns:
-        The index of the instance from X chosen to be labelled; the instance from X chosen to be labelled.
+        The index of the instance from X_pool chosen to be labelled;
+        the instance from X_pool chosen to be labelled.
     """
 
     classwise_confidence = classifier.predict_proba(X_pool)
     classwise_predictions = classifier.predict(X_pool)
     classwise_scores = classwise_confidence*(classwise_predictions-1/2)
     classwise_mean = np.mean(classwise_scores, axis=1)
-    query_idx = multi_argmax(classwise_mean, n_instances)
+
+    if not random_tie_break:
+        query_idx = multi_argmax(classwise_mean, n_instances)
+    else:
+        query_idx = shuffled_argmax(classwise_mean, n_instances)
 
     return query_idx, X_pool[query_idx]
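The new branch in every strategy defers to shuffled_argmax from modAL.utils.selection, which this commit only imports. As a rough mental model (a sketch of the idea, not modAL's actual implementation), it amounts to permuting the candidates before taking the top n, so that when several samples share the maximal utility one of them wins at random:

# Sketch of the shuffled-argmax idea; modAL's own shuffled_argmax may differ in details.
import numpy as np

def shuffled_argmax_sketch(values: np.ndarray, n_instances: int = 1) -> np.ndarray:
    # randomly permute the candidate order ...
    shuffle = np.random.permutation(len(values))
    shuffled_values = values[shuffle]
    # ... take the n largest of the permuted scores ...
    top_in_shuffled = np.argpartition(-shuffled_values, n_instances - 1)[:n_instances]
    # ... and map the winners back to their original indices
    return shuffle[top_in_shuffled]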

tests/core_tests.py

Lines changed: 9 additions & 1 deletion
@@ -1049,14 +1049,22 @@ def test_strategies(self):
         classifier.fit(X_training, y_training)
 
         active_learner = modAL.models.ActiveLearner(classifier)
+        # no random tie break
         modAL.multilabel.SVM_binary_minimum(active_learner, X_pool)
-
         modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances)
         modAL.multilabel.max_loss(classifier, X_pool, n_query_instances)
         modAL.multilabel.min_confidence(classifier, X_pool, n_query_instances)
         modAL.multilabel.avg_confidence(classifier, X_pool, n_query_instances)
         modAL.multilabel.max_score(classifier, X_pool, n_query_instances)
         modAL.multilabel.avg_score(classifier, X_pool, n_query_instances)
+        # random tie break
+        modAL.multilabel.SVM_binary_minimum(active_learner, X_pool, random_tie_break=True)
+        modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances, random_tie_break=True)
+        modAL.multilabel.max_loss(classifier, X_pool, n_query_instances, random_tie_break=True)
+        modAL.multilabel.min_confidence(classifier, X_pool, n_query_instances, random_tie_break=True)
+        modAL.multilabel.avg_confidence(classifier, X_pool, n_query_instances, random_tie_break=True)
+        modAL.multilabel.max_score(classifier, X_pool, n_query_instances, random_tie_break=True)
+        modAL.multilabel.avg_score(classifier, X_pool, n_query_instances, random_tie_break=True)
 
 
 class TestExamples(unittest.TestCase):
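These calls are smoke tests: each strategy is exercised once with and once without the flag, so the suite only verifies that both code paths run. A slightly stronger check (hypothetical, not part of this commit) could seed NumPy's global RNG, which shuffled_argmax presumably draws from, and assert that the tie-broken index still points into the pool:

        # Hypothetical extra assertion; names reuse the test's own fixtures.
        np.random.seed(0)
        query_idx, query_inst = modAL.multilabel.min_confidence(
            classifier, X_pool, n_query_instances, random_tie_break=True
        )
        assert np.all(np.asarray(query_idx) < len(X_pool))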

tests/example_tests/ensemble.py

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@
 learner_list = []
 for _ in range(n_learners):
     learner = ActiveLearner(
-        estimator=RandomForestClassifier(),
+        estimator=RandomForestClassifier(n_estimators=10),
         X_training=X_pool[initial_idx], y_training=y_pool[initial_idx],
         bootstrap_init=True
     )
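Here and in the three example scripts below, the forest size is pinned rather than left at the library default, presumably because scikit-learn changed RandomForestClassifier's default n_estimators from 10 to 100 and warned about the upcoming change in the interim releases; passing n_estimators=10 explicitly keeps the example tests quiet and fast without altering their behaviour.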

tests/example_tests/query_by_committee.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@
 
     # initializing learner
     learner = ActiveLearner(
-        estimator=RandomForestClassifier(),
+        estimator=RandomForestClassifier(n_estimators=10),
        X_training=X_train, y_training=y_train
     )
     learner_list.append(learner)

tests/example_tests/shape_learning.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@
 
 # create an ActiveLearner instance
 learner = ActiveLearner(
-    estimator=RandomForestClassifier(),
+    estimator=RandomForestClassifier(n_estimators=10),
     X_training=X_train, y_training=y_train
 )
 initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width)

tests/example_tests/stream_based_sampling.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@
 
 # initialize the learner
 learner = ActiveLearner(
-    estimator=RandomForestClassifier(),
+    estimator=RandomForestClassifier(n_estimators=10),
     X_training=X_train, y_training=y_train
 )
 