In [35]:
from sklearn.datasets import make_classification

# Synthetic 4-class problem. The generator is seeded so that the same X, y are
# produced after a kernel restart; without a seed every run draws new data.
X, y = make_classification(
    n_samples=100_000, n_classes=4, n_informative=10, random_state=42
)

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# One seed shared by the split and the forest so the metrics printed below can
# be reproduced; both steps are otherwise randomised on every run.
RANDOM_STATE = 42

# Hold out 30% of the samples for evaluation.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, train_size=0.7, random_state=RANDOM_STATE
)

clf = RandomForestClassifier(random_state=RANDOM_STATE)
clf.fit(train_x, train_y)

# Get metrics
predictions = clf.predict(test_x)
print("Train Accuracy :: ", accuracy_score(train_y, clf.predict(train_x)))
print("Test Accuracy  :: ", accuracy_score(test_y, predictions))
print(" Confusion matrix ", confusion_matrix(test_y, predictions))

Train Accuracy ::  1.0
Test Accuracy  ::  0.9068333333333334
 Confusion matrix  [[7082  130  211  189]
 [ 168 6798  203  295]
 [ 419  180 6604  260]
 [ 322  151  267 6721]]


In [37]:
import numpy as np
from baal.active.heuristics import BALD
print(f"Using {len(clf.estimators_)} estimators")

# Query every fitted estimator on its own; stacking the results puts the
# estimator axis first.
x = np.array([estimator.predict_proba(test_x) for estimator in clf.estimators_])
# BaaL expects [n_samples, n_classes, ..., n_estimations], so move the
# estimator axis from the front to the third position.
x = np.moveaxis(x, 0, 2)

print("Uncertainty per sample")
print(BALD().compute_score(x))

print("Ranks")
print(BALD()(x))

Using 100 estimators
Uncertainty per sample
[0.59446649 0.9595801  0.22353536 ... 0.81694421 0.93318879 0.88185129]
Ranks
[ 8228 11616  8784 ... 15881 17453  9975]


In [38]:
from baal.active.dataset import ActiveNumpyArray

# Labelling budget: start_size samples up front, then up to n_batch rounds in
# which at most batch_size further samples are labelled.
batch_size, n_batch, start_size = 100, 10, 50

# We will use a RandomForest in this case, with BALD as the heuristic.
clf = RandomForestClassifier()
heuristic = BALD()

# Wrap the training split and start with start_size (50) randomly labelled
# samples.
dataset = ActiveNumpyArray((train_x, train_y))
dataset.label_randomly(start_size)
def predict(test, clf):
    """Stack the class probabilities of every estimator in a fitted ensemble.

    Args:
        test: Indexable whose first element holds the samples to score; only
            ``test[0]`` is used.
        clf: Fitted ensemble exposing ``estimators_``, each of which provides
            ``predict_proba``.

    Returns:
        Array with the per-estimator predictions along the third axis, i.e.
        the layout BaaL expects: [n_samples, n_classes, ..., n_estimations].
    """
    samples = test[0]
    # One probability array per estimator, stacked with the estimator axis first.
    stacked = np.array([member.predict_proba(samples) for member in clf.estimators_])
    # Send the estimator axis from the front to the third position.
    return np.moveaxis(stacked, 0, 2)

# One record per round: the value of len(dataset) at that point and the
# accuracy on the held-out test set.
data = []

for _ in range(n_batch):
    # Refit on the currently labelled portion and evaluate on the test split.
    clf.fit(*dataset.dataset)
    predictions = clf.predict(test_x)
    data.append(dict(batch=len(dataset), score=accuracy_score(test_y, predictions)))

    # Score the pool with every estimator and let the heuristic order it.
    probs = predict(dataset.pool, clf)
    to_label = heuristic(probs)
    if len(to_label) == 0:
        # Nothing left to label.
        break
    # Label the first batch_size entries of the heuristic's ordering.
    dataset.label(to_label[:batch_size])

In [39]:
import pandas as pd
import altair as alt

# Tabulate the per-round records collected by the loop.
pltr = pd.DataFrame(data)

# Interactive line chart of score against batch, both quantitative.
alt.Chart(pltr).mark_line().encode(
    x=alt.X('batch:Q'),
    y=alt.Y('score:Q'),
).properties(
    width=600,
    height=250,
).interactive()