In [51]:
import warnings
from importlib import reload
from functools import partial, lru_cache
import itertools
from time import monotonic

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from joblib import delayed
from modAL import batch
from art.metrics import empirical_robustness
from art.attacks.evasion import *
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tabulate import tabulate
from art.estimators.classification.scikitlearn import ScikitlearnSVC
from sklearn.metrics.pairwise import paired_distances, euclidean_distances
import scipy
from tvregdiff.tvregdiff import TVRegDiff
from tabulate import tabulate

from ipynb.fs.defs import Bias
from ipynb.fs.defs.Datasets import generateData_twoPills_2D, generateData_twoPills_noNoise_2D, plot_dataset_2D
import ipywidgets as widgets
from ipywidgets import interact

import libactive
import libadversarial
import libstop
from libactive import MyActiveLearner, active_split
from libadversarial import adversarial, uncertainty, random_batch, uncertainty_stop
from libutil import ProgressParallel
from libdatasets import *
import librun
from librun import run

In [6]:
import libdatasets; reload(libdatasets); from libdatasets import *

In [7]:
def wrap(func, *args, **kwargs):
    wrapper = lambda: lru_cache()(func)(*args, **kwargs)
    for attr in [attr for attr in dir(func) if not attr.startswith('__')]:
        setattr(wrapper, attr, getattr(func, attr))
    return wrapper

### Dataset selection criteria

* More than 3000 instances
* More than 2 features
* No missing values
* Ideally easy to extract & preprocess

In [8]:
matrix = {
    # Dataset fetchers should cache if possible
    # Lambda wrapper required for function to be pickleable (sent to other threads via joblib)
    "datasets": [
        # Text classification
        ("newsgroups", wrap(newsgroups, None)),
        ("rcv1", wrap(rcv1, None)),
        ("webkb", wrap(webkb, None)),
        ("spamassassin", wrap(spamassassin, None)),
        
        # Image classification
        ("cifar10", wrap(cifar10, None)),
        ("quickdraw", wrap(quickdraw, None)),
        ("avila", wrap(avila, None)),
        
        # General
        ("shuttle", wrap(shuttle, None)),
        #("covertype", wrap(covertype, None)), # fit takes a million years (1233s for 1000 instances)
        ("smartphone", wrap(smartphone, None)),
        ("htru2", wrap(htru2, None)),
        #("malware", wrap(malware, None)), # MALWARE FIT DID NOT FINISH (07:30:30.xxx CPU time)
        ("bidding", wrap(bidding, None)),
        ("swarm", wrap(swarm, None)),
        ("bank", wrap(bank, None)),
        ("buzz", wrap(buzz, None)), # Slow fit times
        ("sensorless", wrap(sensorless, None)),
        ("dota2", wrap(dota2, None)),
        
        # Bio
        ("abalone", wrap(abalone, None)),
        ("splice", wrap(splice, None)),
        ("anuran", wrap(anuran, None)),
        
        # Medical
        ("cardio", wrap(cardio, None)),
        ("skin", wrap(skin, None)),
        
    ],
    "dataset_mutators": {
        "none": (lambda *x, **kwargs: x),
    },
    "methods": [
        ("uncertainty", partial(uncertainty_stop, n_instances=10)),
    ],
    "models": [
        "svm-linear"
    ],
    "meta": {
        "dataset_size": 1000,
        "labelled_size": 10,
        "test_size": 0.5,
        "n_runs": 10,
        "ret_classifiers": True,
        "ensure_y": True,
        "stop_info": True,
        "aggregate": False,
        "stop_function": ("len1000", lambda learner: learner.y_training.shape[0] >= 1000),
        "pool_subsample": 1000
    }
}

In [9]:
capture_metrics = [
    accuracy_score,
    f1_score,
    roc_auc_score,
    
    "uncertainty_average",
    "uncertainty_min",
    "uncertainty_max",
    "uncertainty_variance",
    "uncertainty_average_selected",
    "uncertainty_min_selected",
    "uncertainty_max_selected",
    "uncertainty_variance_selected",
    "entropy_max",
    "n_support",
    "contradictory_information",
    # slow
    #"expected_error"
]

### Dataset information

In [10]:
libdatasets.dataset_summary([data[0] for data in matrix['datasets']], [data[1] for data in matrix['datasets']])

Dataset                     Instances    Classes    Features  Most common class           Least common class    Domain
------------------------  -----------  ---------  ----------  --------------------------  --------------------  --------
newsgroups_faith                 1796          2      125145  15 56%                      0 44%                 nlp
newsgroups_graphics              1961          2      125145  5 50%                       1 50%                 nlp
newsgroups_hardware              1967          2      125145  2 50%                       3 50%                 nlp
newsgroups_sports_crypto         1985          2      125145  9 50%                       11 50%                nlp
rcv1                           804414          2       47236  0 53%                       1 47%                 nlp
webkb                            4199          4       22981  student 39%                 project 12%           nlp
spamassassin                     6051          2       50196  ha

## Accuracy differences

In [43]:
from sklearn.svm import SVC
results = []
for (name, dataset) in matrix['datasets']:
    #if name != "spamassassin": continue
    X, y = dataset()
    
    idx = np.random.choice(X.shape[0], X.shape[0], replace=False)
    X = X[idx]
    y = y[idx]

    
    clf = SVC(kernel='linear', probability=True)

    y_short = y[:10]
    X_short = X[:10]
    for klass in np.unique(y):
        if klass not in y_short:
            idx = np.where(y==klass)[0][0]
            y_short = np.concatenate((y_short, [y[idx]]), axis=0)
            if isinstance(X_short, scipy.sparse.csr_matrix):
                X_short = scipy.sparse.vstack((
                    X_short, 
                    X[idx]
                ))
            else:
                X_short = np.concatenate((X_short, [X[idx]]), axis=0)
    #print(np.unique(y_short))

    clf.fit(X_short, y_short)
    start = clf.score(X[-1000:], y[-1000:])
    clf.fit(X[:1000], y[:1000])
    final = clf.score(X[-1000:], y[-1000:])
    results.append([start, final])

  return f(**kwargs)
  return f(**kwargs)


In [50]:
print(tabulate(zip([x[0] for x in matrix['datasets']], [r[0] for r in results], [r[1] for r in results], [r[1]-r[0] for r in results]), headers=["Name", "Start", "Final", "Change"]))

Name                        Start    Final    Change
------------------------  -------  -------  --------
newsgroups_faith            0.438    0.928     0.49
newsgroups_graphics         0.614    0.893     0.279
newsgroups_hardware         0.564    0.889     0.325
newsgroups_sports_crypto    0.484    0.985     0.501
rcv1                        0.533    0.914     0.381
webkb                       0.382    0.866     0.484
spamassassin                0.736    0.974     0.238
cifar10                     0.133    0.312     0.179
quickdraw                   0.585    0.757     0.172
avila                       0.427    0.565     0.138
shuttle                     0.791    0.985     0.194
smartphone                  0.663    0.945     0.282
htru2                       0.895    0.982     0.087
bidding                     0.885    0.992     0.107
swarm                       0.488    0.959     0.471
bank                        0.835    0.883     0.048
buzz                        0.959    0.987     

Name                        Start    Final    Change

------------------------  -------  -------  --------

newsgroups_faith            0.438    0.928     0.49

newsgroups_graphics         0.614    0.893     0.279

newsgroups_hardware         0.564    0.889     0.325

newsgroups_sports_crypto    0.484    0.985     0.501

rcv1                        0.533    0.914     0.381

webkb                       0.382    0.866     0.484

spamassassin                0.736    0.974     0.238

cifar10                     0.133    0.312     0.179

quickdraw                   0.585    0.757     0.172

avila                       0.427    0.565     0.138

shuttle                     0.791    0.985     0.194

smartphone                  0.663    0.945     0.282

~~htru2                       0.895    0.982     0.087~~

bidding                     0.885    0.992     0.107

swarm                       0.488    0.959     0.471

bank                        0.835    0.883     0.048

buzz                        0.959    0.987     0.028

sensorless                  0.103    0.414     0.311

dota2                       0.513    0.548     0.035

abalone                     0.453    0.504     0.051

splice                      0.512    0.913     0.401

anuran                      0.633    0.929     0.296

~~cardio                      0.627    0.626    -0.001~~

~~skin                        0.89     0.882    -0.008~~


In [15]:
reload(librun)

<module 'librun' from 'C:\\Users\\Zac\\Programming\\python\\research\\librun.py'>

In [None]:
results = librun.run(matrix, metrics=capture_metrics, force_run=True)

### Plan

* Learn to 1,000 instances. 
* Use a pool of as much data as possible for the dataset. 
* Start at 10+ensure_y instances
* Use a validation set size of ???
* Randomise the split each run, but use a seeded generator
* Report results using autorank?