## All datasets

In [4]:
import warnings
from importlib import reload
from functools import partial, lru_cache
import itertools
from time import monotonic

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from joblib import delayed
from modAL import batch
from art.metrics import empirical_robustness
from art.attacks.evasion import *
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tabulate import tabulate
from art.estimators.classification.scikitlearn import ScikitlearnSVC
from sklearn.metrics.pairwise import paired_distances, euclidean_distances
import scipy
from tvregdiff.tvregdiff import TVRegDiff

from ipynb.fs.defs import Bias
from ipynb.fs.defs.Datasets import generateData_twoPills_2D, generateData_twoPills_noNoise_2D, plot_dataset_2D
import ipywidgets as widgets
from ipywidgets import interact

import libactive
import libadversarial
import libstop
from libactive import MyActiveLearner, active_split
from libadversarial import adversarial, uncertainty, random_batch, uncertainty_stop
from libutil import ProgressParallel
from libdatasets import *
import librun
from librun import run

In [110]:
# Re-execute libdatasets so edits made to it are picked up without restarting the
# kernel, then re-run the star import so the notebook's names point at the reloaded module.
import libdatasets
reload(libdatasets)
from libdatasets import *

In [111]:
def wrap(func, *args, **kwargs):
    """Bind ``func`` to fixed arguments and return a zero-argument callable.

    Calling the returned callable evaluates ``func(*args, **kwargs)`` through a
    freshly created ``lru_cache`` wrapper. Every attribute of ``func`` whose name
    does not start with a double underscore is copied onto the returned callable.

    NOTE(review): the ``lru_cache`` wrapper is constructed inside the lambda, so
    each call starts with a new, empty cache and ``func`` is executed again on
    every call; nothing is reused between calls. The arguments must still be
    hashable because they pass through ``lru_cache``.

    Parameters:
        func: callable to bind.
        *args: positional arguments forwarded to ``func`` on every call.
        **kwargs: keyword arguments forwarded to ``func`` on every call.

    Returns:
        A lambda taking no arguments that returns ``func(*args, **kwargs)``.
    """
    wrapper = lambda: lru_cache()(func)(*args, **kwargs)
    for name in dir(func):
        # Dunder attributes (__name__, __doc__, ...) are deliberately left alone.
        if name.startswith('__'):
            continue
        setattr(wrapper, name, getattr(func, name))
    return wrapper

In [112]:
# Experiment configuration for this notebook.
# NOTE(review): presumably consumed by librun.run (imported above); the exact meaning
# of each key is defined there, not in this notebook — confirm before changing keys.
matrix = {
    # Dataset fetchers should cache if possible
    # Lambda wrapper required for function to be pickleable (sent to other threads via joblib)
    # Each entry is a (name, fetcher) pair; wrap() binds the fetcher's arguments so the
    # resulting callable takes no arguments.
    "datasets": [
        #("banknote", wrap(banknote, None)), too few instances
        #("haberman", wrap(haberman, None)), too few instances
        #("digits", wrap(digits)), too few instances
        ("abalone", wrap(abalone, None)),
        #("car", wrap(car, None)), too few instances
        ("cardio", wrap(cardio, None)),
        ("shuttle", wrap(shuttle, None)),
        ("skin", wrap(skin, None)),
        #("german", wrap(german, None)), too few instances
        #("sonar", wrap(sonar, None)), too few instances
        ("splice", wrap(splice, None)),
        #("bbbp", wrap(bbbp, None)), too few instances
        ("hiv", wrap(hiv, None)),
        ("mutagen", wrap(mutagen, None)),
        #("MUV", wrap(MUV, None)),
        #("sider", wrap(sider, None)),
        #("tox21", wrap(tox21, None)),
        ("mnist", wrap(mnist, None)),
        ("quickdraw", wrap(quickdraw, None)),
        ("newsgroups", wrap(newsgroups, None)),
        #("reuters21578", wrap(reuters21578, None)), Preprocessing not done
        ("rcv1", wrap(rcv1, None)),
        ("cifar10", wrap(cifar10, None)),
        #("higgs", wrap(higgs, None)), Very large and difficult to deal with
        ("webkb", wrap(webkb, None)),
        ("spamassassin", wrap(spamassassin, None)),
        ("smartphone", wrap(smartphone, None)),
        ("covertype", wrap(covertype, None)),
        ("htru2", wrap(htru2, None)),
        #("malware", wrap(malware, None)), Fit has failed to complete/converge before
        ("bidding", wrap(bidding, None)),
        ("swarm", wrap(swarm, None)),
        ("bank", wrap(bank, None)),
        ("anuran", wrap(anuran, None)),
        ("avila", wrap(avila, None)),
        ("coral", wrap(coral, None)),
        ("buzz_th", wrap(buzz, "th", None)),
        # NOTE(review): the "buzz_th" entry was listed twice, byte-for-byte; the duplicate
        # was removed so the dataset is not scheduled twice under the same name. If a second
        # buzz variant was intended, add it here under its own distinct name.
        ("sensorless", wrap(sensorless, None)),
        ("dota2", wrap(dota2, None)),
        ("gas", wrap(gas, None)),
    ],
    "dataset_mutators": {
        # Identity mutator: returns its positional arguments unchanged (as a tuple)
        # and ignores any keyword arguments.
        "none": (lambda *x, **kwargs: x),
    },
    "methods": [
        # uncertainty_stop with n_instances fixed to 10.
        ("uncertainty", partial(uncertainty_stop, n_instances=10)),
    ],
    "models": [
        "svm-linear"
    ],
    "meta": {
        "dataset_size": 1000,
        "labelled_size": 10,
        # NOTE(review): none of the "newsgroups_*" keys below matches a name in
        # "datasets" above (only "newsgroups" is listed), so presumably every dataset
        # falls through to the "*" entry — confirm how librun resolves these keys.
        "test_size": {
            "newsgroups_faith": 500,
            "newsgroups_graphics": 500,
            "newsgroups_hardware": 500,
            "newsgroups_sports_crypto": 500,
            "*": 0.5
        },
        "n_runs": 10,
        "ret_classifiers": True,
        "ensure_y": True,
        "stop_info": True,
        "aggregate": False,
        # (label, predicate) pair; the predicate is true once learner.y_training
        # holds at least 1000 rows.
        "stop_function": ("len1000", lambda learner: learner.y_training.shape[0] >= 1000),
    }
}

In [113]:
# Split the (name, fetcher) pairs from matrix['datasets'] into two parallel lists
# and hand them to libdatasets.dataset_summary.
libdatasets.dataset_summary(
    [name for name, _ in matrix['datasets']],
    [fetcher for _, fetcher in matrix['datasets']],
)

Dataset         Instances    Classes    Features  Most common class           Least common class    Domain
------------  -----------  ---------  ----------  --------------------------  --------------------  --------
banknote             1372          2           5  0 56%                       1 44%                 general
haberman             1372          2           4  0 56%                       1 44%                 general
digits               1797         10          64  3 10%                       8 10%                 general
abalone              4177          3           8  M 37%                       F 31%                 general
car                  1728          4           7  unacc 70%                   vgood 4%              general
cardio              70000          2           3  0 50%                       1 50%                 general
shuttle             58000          2          10  True 79%                    False 21%             general
skin               245057   

In [61]:
# Call the splice fetcher directly, with the same argument used for it in `matrix`,
# and unpack the pair it returns.
X, y = splice(None)

In [62]:
# Check which container type splice() returned for X.
type(X)

scipy.sparse.csr.csr_matrix

In [63]:
# Dimensions of X as returned by splice().
X.shape

(3190, 287)