In [5]:
import warnings
from importlib import reload
from functools import partial, lru_cache
import itertools
from time import monotonic

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from joblib import delayed
from modAL import batch
from art.metrics import empirical_robustness
from art.attacks.evasion import *
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tabulate import tabulate
from art.estimators.classification.scikitlearn import ScikitlearnSVC
from sklearn.metrics.pairwise import paired_distances, euclidean_distances
import scipy
from tvregdiff.tvregdiff import TVRegDiff
from tabulate import tabulate
from tqdm.notebook import tqdm

import ipywidgets as widgets
from ipywidgets import interact

import libactive
import libadversarial
import libstop
from libactive import MyActiveLearner, active_split
from libadversarial import adversarial, uncertainty, random_batch, uncertainty_stop
from libutil import ProgressParallel
from libdatasets import *
import librun
from librun import run

In [6]:
import libdatasets; reload(libdatasets); from libdatasets import *

In [11]:
# Experiment configuration: which datasets, query methods and models to combine,
# plus shared run settings under "meta".
# NOTE(review): presumably consumed by librun.run — confirm against librun.
matrix = {
    # Each entry is a (name, loader) pair; the benchmark function in this notebook
    # calls the loader with no arguments and unpacks the result as (X, y).
    # Dataset fetchers should cache if possible.
    # Loaders are built with wrap(fetcher, None) rather than a bare lambda so that they
    # stay pickleable (they are sent to other workers via joblib).
    # NOTE(review): `wrap` is not defined in this notebook; presumably it comes from
    # one of the star imports (libdatasets?) — confirm.
    "datasets": [
        #("newsgroups", wrap(newsgroups, None)),
        ("rcv1", wrap(rcv1, None)),
        ("webkb", wrap(webkb, None)),
        ("spamassassin", wrap(spamassassin, None)),
        
        # Image classification
        ("cifar10", wrap(cifar10, None)),
        ("quickdraw", wrap(quickdraw, None)),
        ("avila", wrap(avila, None)),
        
        # General
        ("shuttle", wrap(shuttle, None)),
        #("covertype", wrap(covertype, None)), # fit takes a million years (1233s for 1000 instances)
        ("smartphone", wrap(smartphone, None)),
        ("htru2", wrap(htru2, None)),
        #("malware", wrap(malware, None)), # MALWARE FIT DID NOT FINISH (07:30:30.xxx CPU time)
        ("bidding", wrap(bidding, None)),
        ("swarm", wrap(swarm, None)),
        ("bank", wrap(bank, None)),
        #("buzz", wrap(buzz, None)), # Slow fit times
        ("sensorless", wrap(sensorless, None)),
        ("dota2", wrap(dota2, None)),
        
        # Bio
        ("abalone", wrap(abalone, None)),
        ("splice", wrap(splice, None)),
        ("anuran", wrap(anuran, None)),
        
        # Medical
        ("cardio", wrap(cardio, None)),
        ("skin", wrap(skin, None)),
    ],
    # Named dataset transforms. "none" is the identity: it returns its positional
    # arguments as a tuple and ignores any keyword arguments.
    "dataset_mutators": {
        "none": (lambda *x, **kwargs: x),
    },
    # (name, query strategy) pairs; uncertainty_stop is pre-bound with n_instances=10.
    "methods": [
        ("uncertainty", partial(uncertainty_stop, n_instances=10)),
    ],
    # Model identifiers. NOTE(review): presumably resolved to estimators by librun — confirm.
    "models": [
        "svm-linear"
    ],
    # Shared run settings. NOTE(review): the meaning of each key is defined by the
    # consumer (librun / libactive), which is not visible here; notes below are hedged.
    "meta": {
        "dataset_size": 1000,
        "labelled_size": 10,
        # Per-dataset test size; "*" looks like the fallback for datasets not listed
        # by name — TODO confirm. Integer vs. float values presumably mean absolute
        # count vs. fraction — verify against the consumer.
        "test_size": {
            "newsgroups_faith": 500,
            "newsgroups_graphics": 500,
            "newsgroups_hardware": 500,
            "newsgroups_sports_crypto": 500,
            "*": 0.5
        },
        "n_runs": 10,
        "ret_classifiers": True,
        "ensure_y": True,
        "stop_info": True,
        "aggregate": False,
        # (label, predicate): the predicate is True once learner.y_training has
        # at least 1000 rows.
        "stop_function": ("len1000", lambda learner: learner.y_training.shape[0] >= 1000),
        "pool_subsample": 1000
    }
}


In [12]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from time import monotonic
import operator
import nesi_bias
reload(nesi_bias)
reload(libactive); from libactive import active_split
from nesi_bias import bias
from dotenv import load_dotenv; load_dotenv()

def func(model, *, test_size=0.5, labeled_size=10, train_size=1000, min_pool_size=1490):
    """Benchmark a classifier on every dataset in ``matrix['datasets']`` and print a table.

    For each dataset the data is split with ``active_split``. A fresh classifier is
    fitted on the small labelled seed set and scored on the test split ("Initial acc").
    It is then refitted on the first ``train_size`` rows of the labelled + unlabelled
    pool and scored again ("Final acc"). Only the second fit is timed.

    Datasets that cannot be split, or whose unlabelled pool has fewer than
    ``min_pool_size`` rows, are skipped: a warning is emitted and an all-zero row
    is added to the table so the dataset still appears in the output.

    Parameters
    ----------
    model : callable
        Zero-argument factory returning an estimator with ``fit(X, y)`` and
        ``score(X, y)`` (e.g. an sklearn class or a ``functools.partial`` of one).
    test_size : float or int, keyword-only
        Forwarded to ``active_split`` (default 0.5).
    labeled_size : int, keyword-only
        Forwarded to ``active_split`` (default 10).
    train_size : int, keyword-only
        Number of leading pool rows used for the timed fit (default 1000).
    min_pool_size : int, keyword-only
        Minimum number of unlabelled rows required to benchmark a dataset
        (default 1490).

    Returns
    -------
    None
        The result table is printed, not returned, so that calling this as the
        last expression of a cell does not also echo a return value.
    """
    rows = []
    for name, dataset in tqdm(matrix['datasets']):
        # Load once and reuse; fetching can be expensive and the original second
        # call only duplicated the work. A load failure still propagates, as before.
        X_all, y_all = dataset()

        try:
            X_labelled, X_unlabelled, Y_labelled, Y_oracle, X_test, y_test = active_split(
                X_all, y_all,
                mutator=matrix['dataset_mutators']['none'],
                test_size=test_size,
                labeled_size=labeled_size,
                shuffle=True,
                random_state=np.random,
            )
            # Explicit raise instead of assert so the guard survives `python -O`.
            if X_unlabelled.shape[0] < min_pool_size:
                raise ValueError("unlabelled pool too small")
        except Exception as e:
            # Best-effort: keep going with the remaining datasets, but say why
            # this one was skipped instead of hiding the reason.
            warnings.warn(f"Skipping {name}: could not split ({e})")
            rows.append([name, 0, 0, 0, 0])
            continue

        # Rebuild the training pool with the labelled seed rows first.
        if scipy.sparse.issparse(X_labelled):
            # Force CSR so the row slice below works for any sparse input format.
            X_pool = scipy.sparse.vstack((X_labelled, X_unlabelled), format="csr")
        else:
            X_pool = np.concatenate((X_labelled, X_unlabelled))
        y_pool = np.concatenate((Y_labelled, Y_oracle))

        clf = model()

        # Baseline: accuracy when trained on the labelled seed set only.
        clf.fit(X_labelled, Y_labelled)
        initial_acc = clf.score(X_test, y_test)

        # Timed refit on the first `train_size` pool rows.
        fit_started = monotonic()
        clf.fit(X_pool[:train_size], y_pool[:train_size])
        elapsed = monotonic() - fit_started
        final_acc = clf.score(X_test, y_test)

        rows.append([name, initial_acc, final_acc, elapsed, final_acc - initial_acc])

    # Rows are plain lists, so an empty dataset list prints just the header instead of
    # failing on 2-D indexing of an empty array.
    print(tabulate(rows, headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))

In [13]:
# Benchmark a linear-kernel SVC; partial() defers construction so func can build a
# fresh estimator per dataset, with probability=True forwarded to the constructor.
func(partial(SVC, kernel='linear', probability=True))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))


Name            Initial acc    Final acc    Time         Diff
------------  -------------  -----------  ------  -----------
rcv1               0.474062     0.902694   3.093   0.428633
webkb              0.298571     0.881429   3.172   0.582857
spamassassin       0.742895     0.967284   2.375   0.224389
cifar10            0.1394       0.300567   7.453   0.161167
quickdraw          0.559482     0.75118    0.969   0.191698
avila              0.177497     0.577918   0.25    0.400422
shuttle            0.926138     0.970517   0.031   0.0443793
smartphone         0.619327     0.940154   0.25    0.320827
htru2              0.949156     0.979327   0.015   0.030171
bidding            0.888326     0.989244   0.078   0.100917
swarm              0.667971     0.942622   3.312   0.27465
bank               0.883084     0.875697   0.328  -0.00738742
sensorless         0.178978     0.869219   0.297   0.690241
dota2              0.495143     0.563433   7.25    0.0682896
abalone            0.438966     

In [None]:
# Same benchmark with a DecisionTreeClassifier; func instantiates the class with
# no arguments, so constructor defaults apply.
func(DecisionTreeClassifier)

In [None]:
# No visible cell imports RandomForestClassifier explicitly; import it here so this
# cell does not depend on a star import or leftover kernel state.
from sklearn.ensemble import RandomForestClassifier

# Same benchmark with a RandomForestClassifier; func instantiates the class with
# no arguments, so constructor defaults apply.
func(RandomForestClassifier)

In [None]:
# No visible cell imports MLPClassifier explicitly; import it here so this cell
# does not depend on a star import or leftover kernel state.
from sklearn.neural_network import MLPClassifier

# Same benchmark with an MLPClassifier; func instantiates the class with no
# arguments, so constructor defaults apply.
func(MLPClassifier)