In [3]:
import warnings
from importlib import reload
from functools import partial, lru_cache
import itertools
from time import monotonic

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from joblib import delayed
from modAL import batch
from art.metrics import empirical_robustness
from art.attacks.evasion import *
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tabulate import tabulate
from art.estimators.classification.scikitlearn import ScikitlearnSVC
from sklearn.metrics.pairwise import paired_distances, euclidean_distances
import scipy
from tvregdiff.tvregdiff import TVRegDiff
from tabulate import tabulate
from tqdm.notebook import tqdm

import ipywidgets as widgets
from ipywidgets import interact

import libactive
import libadversarial
import libstop
from libactive import MyActiveLearner, active_split
from libadversarial import adversarial, uncertainty, random_batch, uncertainty_stop
from libutil import ProgressParallel
from libdatasets import *
import librun
from librun import run

In [4]:
import libdatasets; reload(libdatasets); from libdatasets import *

In [5]:
# Experiment configuration. In the cells visible here only `matrix['datasets']` and
# `matrix['dataset_mutators']['none']` are read directly; the remaining keys are presumably
# consumed by the `librun` runner — TODO confirm against `librun.run`.
matrix = {
    # Dataset fetchers should cache if possible
    # Lambda wrapper required for function to be pickleable (sent to other threads via joblib)
    # Each entry is a (name, zero-argument callable) pair; the sweep cells below call the
    # callable and unpack its result as (X, y).
    "datasets": [
        #("newsgroups", wrap(newsgroups, None)),
        ("rcv1", wrap(rcv1, None)),
        ("webkb", wrap(webkb, None)),
        ("spamassassin", wrap(spamassassin, None)),
        
        # Image classification
        ("cifar10", wrap(cifar10, None)),
        ("quickdraw", wrap(quickdraw, None)),
        ("avila", wrap(avila, None)),
        
        # General
        ("shuttle", wrap(shuttle, None)),
        #("covertype", wrap(covertype, None)), # fit takes a million years (1233s for 1000 instances)
        ("smartphone", wrap(smartphone, None)),
        ("htru2", wrap(htru2, None)),
        #("malware", wrap(malware, None)), # MALWARE FIT DID NOT FINISH (07:30:30.xxx CPU time)
        ("bidding", wrap(bidding, None)),
        ("swarm", wrap(swarm, None)),
        ("bank", wrap(bank, None)),
        #("buzz", wrap(buzz, None)), # Slow fit times
        ("sensorless", wrap(sensorless, None)),
        ("dota2", wrap(dota2, None)),
        
        # Bio
        ("abalone", wrap(abalone, None)),
        ("splice", wrap(splice, None)),
        ("anuran", wrap(anuran, None)),
        
        # Medical
        ("cardio", wrap(cardio, None)),
        ("skin", wrap(skin, None)),
    ],
    # Named mutators handed to `active_split(..., mutator=...)`.
    "dataset_mutators": {
        # Identity mutator: returns its positional arguments as a tuple and ignores keyword arguments.
        "none": (lambda *x, **kwargs: x),
    },
    # (name, callable) pairs; `uncertainty_stop` is pre-bound with n_instances=10.
    "methods": [
        ("uncertainty", partial(uncertainty_stop, n_instances=10)),
    ],
    # Model identifiers; how the string is resolved to an estimator is not shown here.
    "models": [
        "svm-linear"
    ],
    # Run settings. Their exact semantics are defined by whatever consumes `matrix`
    # (not visible in this notebook) — verify there before changing values.
    "meta": {
        "dataset_size": 1000,
        "labelled_size": 10,
        # Per-dataset test sizes; "*" is presumably the fallback for datasets not listed — TODO confirm.
        "test_size": {
            "newsgroups_faith": 500,
            "newsgroups_graphics": 500,
            "newsgroups_hardware": 500,
            "newsgroups_sports_crypto": 500,
            "*": 0.5
        },
        "n_runs": 10,
        "ret_classifiers": True,
        "ensure_y": True,
        "stop_info": True,
        "aggregate": False,
        # (label, predicate) pair; the predicate is true once learner.y_training has at least 1000 rows.
        "stop_function": ("len1000", lambda learner: learner.y_training.shape[0] >= 1000),
        "pool_subsample": 1000
    }
}


In [6]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from time import monotonic
import operator
import nesi_bias
reload(nesi_bias)
reload(libactive); from libactive import active_split
from nesi_bias import bias
from dotenv import load_dotenv; load_dotenv()

def func(model, *, test_size=0.5, labeled_size=10, fit_size=1000, min_pool_size=1490):
    """Print, per dataset, a model's accuracy on the seed labels versus a larger fit.

    For every ``(name, fetcher)`` pair in ``matrix['datasets']`` the data is fetched once and
    split with ``active_split`` using the ``'none'`` mutator. A fresh ``model()`` is fitted on
    the labelled seed set and scored on the test set; it is then refitted on the first
    ``fit_size`` rows of the labelled + unlabelled pool (the refit is timed with
    ``monotonic``) and scored again. Datasets that cannot be split, or whose unlabelled
    pool has fewer than ``min_pool_size`` rows, are reported on stdout and shown as an
    all-zero row.

    Parameters
    ----------
    model : callable
        Zero-argument factory returning an unfitted estimator that provides
        ``fit(X, y)`` and ``score(X, y)``.
    test_size : float or int, keyword-only
        Forwarded to ``active_split`` (default 0.5, as in the original cell).
    labeled_size : int, keyword-only
        Forwarded to ``active_split`` (default 10, as in the original cell).
    fit_size : int, keyword-only
        Number of leading pool rows used for the second, timed fit (default 1000).
    min_pool_size : int, keyword-only
        Minimum number of unlabelled rows required for a dataset to be evaluated
        (default 1490; the rationale for this threshold is not recorded in the notebook).

    Returns
    -------
    None
        The summary table is printed with ``tabulate``.
    """
    rows = []
    for (name, dataset) in tqdm(matrix['datasets']):
        # Fetch once and reuse the result; the original fetched every dataset twice
        # and discarded the first copy.
        X, y = dataset()

        try:
            X_labelled, X_unlabelled, Y_labelled, Y_oracle, X_test, y_test = active_split(
                X, y, mutator=matrix['dataset_mutators']['none'], test_size=test_size,
                labeled_size=labeled_size, shuffle=True, random_state=np.random
            )
            # Explicit check instead of `assert` so it is not stripped under `python -O`.
            if X_unlabelled.shape[0] < min_pool_size:
                raise ValueError("unlabelled pool too small")
        except Exception as e:
            # Best-effort sweep: keep going, but say why this dataset has a zero row
            # so it cannot be mistaken for a genuine result.
            print(f"Could not split: {e}")
            rows.append([name, 0, 0, 0, 0])
            continue

        if scipy.sparse.issparse(X_labelled):
            # Force CSR so the row slice below works for any sparse input format.
            X_pool = scipy.sparse.vstack((X_labelled, X_unlabelled), format="csr")
        else:
            X_pool = np.concatenate((X_labelled, X_unlabelled))
        y_pool = np.concatenate((Y_labelled, Y_oracle))

        clf = model()

        # Accuracy with only the seed labels.
        clf.fit(X_labelled, Y_labelled)
        initial_acc = clf.score(X_test, y_test)

        # Accuracy (and wall-clock fit time) with the first `fit_size` pool rows.
        fit_started = monotonic()
        clf.fit(X_pool[:fit_size], y_pool[:fit_size])
        fit_seconds = monotonic() - fit_started
        final_acc = clf.score(X_test, y_test)

        rows.append([name, initial_acc, final_acc, fit_seconds, final_acc - initial_acc])

    # Rows are passed as plain lists: no string round-trip through a NumPy array,
    # and an empty dataset list simply prints the header.
    print(tabulate(rows, headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))

In [13]:
# Run the sweep with a linear-kernel SVC factory (probability estimates enabled).
func(partial(SVC, kernel='linear', probability=True))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))


Name            Initial acc    Final acc    Time         Diff
------------  -------------  -----------  ------  -----------
rcv1               0.474062     0.902694   3.093   0.428633
webkb              0.298571     0.881429   3.172   0.582857
spamassassin       0.742895     0.967284   2.375   0.224389
cifar10            0.1394       0.300567   7.453   0.161167
quickdraw          0.559482     0.75118    0.969   0.191698
avila              0.177497     0.577918   0.25    0.400422
shuttle            0.926138     0.970517   0.031   0.0443793
smartphone         0.619327     0.940154   0.25    0.320827
htru2              0.949156     0.979327   0.015   0.030171
bidding            0.888326     0.989244   0.078   0.100917
swarm              0.667971     0.942622   3.312   0.27465
bank               0.883084     0.875697   0.328  -0.00738742
sensorless         0.178978     0.869219   0.297   0.690241
dota2              0.495143     0.563433   7.25    0.0682896
abalone            0.438966     

**Easy Dataset**: htru2

**Hard Dataset**: cifar10

In [5]:
# Run the same sweep with a default-configured DecisionTreeClassifier.
func(DecisionTreeClassifier)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))


Name            Initial acc    Final acc    Time        Diff
------------  -------------  -----------  ------  ----------
rcv1               0.573874     0.794183   0.235   0.220309
webkb              0.503333     0.784286   0.203   0.280952
spamassassin       0.794118     0.944151   0.234   0.150033
cifar10            0.129167     0.197933   2.094   0.0687667
quickdraw          0.38344      0.636689   0.25    0.25325
avila              0.123251     0.668488   0.016   0.545237
shuttle            0.926        0.995655   0       0.0696552
smartphone         0.216508     0.830344   0.484   0.613836
htru2              0.902        0.967818   0       0.0658174
bidding            0.97311      0.995255   0.031   0.0221449
swarm              0.515073     0.9751     0.641   0.460027
bank               0.671017     0.874635   0.032   0.203619
sensorless         0.114818     0.8698     0.031   0.754982
dota2              0.500816     0.51391    0.016   0.0130945
abalone            0.538535     0

**Easy Dataset**: htru2

**Hard Dataset**: cifar10

In [7]:
from sklearn.ensemble import RandomForestClassifier
# Run the same sweep with a default-configured RandomForestClassifier.
func(RandomForestClassifier)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))


Name            Initial acc    Final acc    Time        Diff
------------  -------------  -----------  ------  ----------
rcv1               0.506682     0.86592    2.438  0.359238
webkb              0.504762     0.861905   1.188  0.357143
spamassassin       0.681758     0.961666   0.813  0.279907
cifar10            0.155933     0.3382     3.235  0.182267
quickdraw          0.393734     0.821556   0.672  0.427821
avila              0.221679     0.771229   0.297  0.54955
shuttle            0.98731      0.997828   0.203  0.0105172
smartphone         0.511896     0.920937   1.719  0.409041
htru2              0.966253     0.977092   0.312  0.0108392
bidding            0.898134     0.979437   1.203  0.0813034
swarm              0.858594     0.999584   1.094  0.140989
bank               0.881315     0.883482   1.063  0.00216757
sensorless         0.256879     0.949889   0.422  0.69301
dota2              0.509287     0.538137   0.281  0.0288506
abalone            0.507899     0.51843    0.21

**Easy Dataset**: htru2

**Hard Dataset**: cifar10

In [10]:
from sklearn.neural_network import MLPClassifier
# Run the same sweep with a default-configured MLPClassifier.
func(MLPClassifier)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))




Name            Initial acc    Final acc     Time         Diff
------------  -------------  -----------  -------  -----------
rcv1               0.700744     0.893493  167.11    0.192749
webkb              0.618571     0.880476   37.625   0.261905
spamassassin       0.816259     0.983146   47.609   0.166887
cifar10            0.1324       0.100367   16.406  -0.0320333
quickdraw          0.395902     0.706158    1.047   0.310256
avila              0.287905     0.6705      1.11    0.382595
shuttle            0.908034     0.996552    0.953   0.0885172
smartphone         0.533309     0.934663    3.406   0.401354
htru2              0.970499     0.976981    0.516   0.00648117
bidding            0.891174     0.94274    44.25    0.051566
swarm              0.538724     0.973185    2.656   0.43446
bank               0.881934     0.87291    36.75   -0.00902415
sensorless         0.212032     0.865356    1.25    0.653324
dota2              0.472179     0.549483    1.25    0.0773042
abalone      



## Noise

In [15]:
from nesi_noise import noise
def func(amount, *, test_size=0.5, labeled_size=10, fit_size=1000, min_pool_size=1490):
    """Print, per dataset, linear-SVC accuracy before and after a larger fit under a noise mutator.

    For every ``(name, fetcher)`` pair in ``matrix['datasets']`` the data is fetched once and
    split with ``active_split`` using ``partial(noise, amount=amount)`` as the mutator. A
    ``SVC(kernel='linear', probability=True)`` is fitted on the labelled seed set and scored
    on the test set; it is then refitted on the first ``fit_size`` rows of the labelled +
    unlabelled pool (the refit is timed with ``monotonic``) and scored again. Datasets
    that cannot be split, or whose unlabelled pool has fewer than ``min_pool_size`` rows,
    are reported on stdout and shown as an all-zero row.

    Parameters
    ----------
    amount : float
        Passed to ``noise`` as its ``amount`` keyword; its exact meaning is defined by
        ``nesi_noise.noise`` (not shown in this notebook).
    test_size : float or int, keyword-only
        Forwarded to ``active_split`` (default 0.5, as in the original cell).
    labeled_size : int, keyword-only
        Forwarded to ``active_split`` (default 10, as in the original cell).
    fit_size : int, keyword-only
        Number of leading pool rows used for the second, timed fit (default 1000).
    min_pool_size : int, keyword-only
        Minimum number of unlabelled rows required for a dataset to be evaluated
        (default 1490; the rationale for this threshold is not recorded in the notebook).

    Returns
    -------
    None
        The summary table is printed with ``tabulate``.
    """
    rows = []
    for (name, dataset) in tqdm(matrix['datasets']):
        # Fetch once and reuse the result; the original fetched every dataset twice
        # and discarded the first copy.
        X, y = dataset()

        try:
            X_labelled, X_unlabelled, Y_labelled, Y_oracle, X_test, y_test = active_split(
                X, y, mutator=partial(noise, amount=amount), test_size=test_size,
                labeled_size=labeled_size, shuffle=True, random_state=np.random
            )
            # Explicit check instead of `assert` so it is not stripped under `python -O`.
            if X_unlabelled.shape[0] < min_pool_size:
                raise ValueError("unlabelled pool too small")
        except Exception as e:
            # Best-effort sweep: report the reason and record a zero row for this dataset.
            print(f"Could not split: {e}")
            rows.append([name, 0, 0, 0, 0])
            continue

        if scipy.sparse.issparse(X_labelled):
            # Force CSR so the row slice below works for any sparse input format.
            X_pool = scipy.sparse.vstack((X_labelled, X_unlabelled), format="csr")
        else:
            X_pool = np.concatenate((X_labelled, X_unlabelled))
        y_pool = np.concatenate((Y_labelled, Y_oracle))

        clf = SVC(kernel='linear', probability=True)

        # Accuracy with only the seed labels.
        clf.fit(X_labelled, Y_labelled)
        initial_acc = clf.score(X_test, y_test)

        # Accuracy (and wall-clock fit time) with the first `fit_size` pool rows.
        fit_started = monotonic()
        clf.fit(X_pool[:fit_size], y_pool[:fit_size])
        fit_seconds = monotonic() - fit_started
        final_acc = clf.score(X_test, y_test)

        rows.append([name, initial_acc, final_acc, fit_seconds, final_acc - initial_acc])

    # Rows are passed as plain lists: no string round-trip through a NumPy array,
    # and an empty dataset list simply prints the header.
    print(tabulate(rows, headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))

In [16]:
# Sweep the noise level over 0.1, 0.2, 0.3, 0.4. The grid is built with linspace + round
# rather than np.arange with a fractional step: arange accumulates floating-point error
# (e.g. 0.30000000000000004), which would leak into the printed label and into the value
# passed to func, and makes the number of steps depend on rounding at the endpoint.
for amount in np.linspace(0.1, 0.4, num=4).round(1):
    print(f"noise={amount}")
    func(amount)

noise=0.1


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))


Name            Initial acc    Final acc    Time          Diff
------------  -------------  -----------  ------  ------------
rcv1               0.604234     0.9021     3.016   0.297867
webkb              0.496667     0.850952   4.547   0.354286
spamassassin       0.659617     0.924653   4.906   0.265036
cifar10            0.1053       0.259733  14.156   0.154433
quickdraw          0.488104     0.662758   1.656   0.174655
avila              0.134656     0.565842   0.36    0.431187
shuttle            0.927379     0.927241   0.172  -0.000137931
smartphone         0.452233     0.907211   0.578   0.454978
htru2              0.812381     0.975528   0.062   0.163147
bidding            0.900981     0.975008   0.329   0.0740272
swarm              0.583278     0.795886   6.235   0.212608
bank               0.846722     0.851544   0.5     0.00482173
sensorless         0.235447     0.794838   0.797   0.559392
dota2              0.509481     0.553213   6.719   0.0437325
abalone            0.51747

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))


Name            Initial acc    Final acc    Time       Diff
------------  -------------  -----------  ------  ---------
rcv1               0.536443     0.874361   3.203  0.337918
webkb              0.307143     0.772857   7.015  0.465714
spamassassin       0.460674     0.893919   5.735  0.433245
cifar10            0.152367     0.253367  12.937  0.101
quickdraw          0.514079     0.656494   1.562  0.142415
avila              0.12258      0.527123   0.344  0.404543
shuttle            0.843724     0.926552   0.875  0.0828276
smartphone         0.358346     0.846449   0.75   0.488104
htru2              0.864789     0.964912   0.109  0.100123
bidding            0.950965     0.963303   0.359  0.0123379
swarm              0.568621     0.771236   6.219  0.202615
bank               0.499071     0.82916    0.516  0.330089
sensorless         0.167048     0.770296   1.141  0.603247
dota2              0.499262     0.549134   6.5    0.0498718
abalone            0.513164     0.539493   0.312  0.0

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))


Name            Initial acc    Final acc    Time        Diff
------------  -------------  -----------  ------  ----------
rcv1               0.52839      0.874997   3.187   0.346608
webkb              0.262857     0.699048   7.547   0.43619
spamassassin       0.662921     0.813615   9.39    0.150694
cifar10            0.1468       0.232067  16.922   0.0852667
quickdraw          0.300303     0.574681   1.906   0.274378
avila              0.232413     0.554246   0.5     0.321832
shuttle            0.785414     0.927793   0.625   0.142379
smartphone         0.332723     0.795205   0.954   0.462482
htru2              0.872947     0.960443   0.14    0.0874958
bidding            0.906675     0.950965   0.422   0.0442898
swarm              0.532145     0.725433   8.515   0.193288
bank               0.854242     0.802707   0.578  -0.051535
sensorless         0.166912     0.669732   1.281   0.50282
dota2              0.522012     0.547754   7.953   0.0257422
abalone            0.505984     0.5

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))


Name            Initial acc    Final acc    Time        Diff
------------  -------------  -----------  ------  ----------
rcv1               0.524976     0.871797   3.171   0.346821
webkb              0.473333     0.704762   8.297   0.231429
spamassassin       0.634171     0.825512   9.297   0.191342
cifar10            0.110567     0.205633  24.172   0.0950667
quickdraw          0.348935     0.555451   2.672   0.206516
avila              0.105041     0.50738    0.516   0.402339
shuttle            0.816414     0.926034   0.094   0.109621
smartphone         0.33675      0.729319   1.125   0.39257
htru2              0.730473     0.95899    0.219   0.228517
bidding            0.859854     0.936096   0.407   0.0762417
swarm              0.586026     0.716772   8.891   0.130746
bank               0.868265     0.7878     0.562  -0.0804654
sensorless         0.143907     0.642181   1.422   0.498274
dota2              0.516164     0.544335   9.672   0.0281707
abalone            0.183341     0.

In [None]:
Name            Initial acc    Final acc    Time        Diff
------------  -------------  -----------  ------  ----------
rcv1               0.524976     0.871797   3.171   0.346821
avila              0.105041     0.50738    0.516   0.402339

webkb              0.473333     0.704762   8.297   0.231429
spamassassin       0.634171     0.825512   9.297   0.191342

shuttle            0.816414     0.926034   0.094   0.109621
smartphone         0.33675      0.729319   1.125   0.39257
htru2              0.730473     0.95899    0.219   0.228517
swarm              0.586026     0.716772   8.891   0.130746
sensorless         0.143907     0.642181   1.422   0.498274
dota2              0.516164     0.544335   9.672   0.0281707
abalone            0.183341     0.535663   0.219   0.352322
splice             0.570533     0.749843   2.796   0.17931
anuran             0.389383     0.895497   0.203   0.506115
skin               0.554448     0.882836   0.079   0.328388