In [1]:
import warnings
from importlib import reload
from functools import partial, lru_cache
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from joblib import delayed
from modAL import batch
from art.metrics import empirical_robustness
from art.attacks.evasion import *
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tabulate import tabulate
from art.estimators.classification.scikitlearn import ScikitlearnSVC
from sklearn.metrics.pairwise import paired_distances, euclidean_distances
import scipy
from tvregdiff.tvregdiff import TVRegDiff

from ipynb.fs.defs import Bias
from ipynb.fs.defs.Datasets import generateData_twoPills_2D, generateData_twoPills_noNoise_2D, plot_dataset_2D

import libactive
import libadversarial
import libstop
from libactive import MyActiveLearner, active_split
from libadversarial import adversarial, uncertainty, random_batch, uncertainty_stop
from libutil import ProgressParallel
from libdatasets import *
import librun
from librun import run

Using sklearn


In [2]:
# Re-execute libdatasets so edits to the module are picked up by the running
# kernel, then repeat the star-import so the notebook's names are rebound to
# the reloaded definitions.
import libdatasets
reload(libdatasets)
from libdatasets import *

In [3]:
def wrap(func, *args, **kwargs):
    """Bind ``func`` to its arguments and return a memoised zero-argument callable.

    The first call of the returned callable evaluates ``func(*args, **kwargs)``
    and stores the result; later calls return that same object without calling
    ``func`` again. If ``func`` raises, nothing is stored and the next call
    retries.

    Every attribute of ``func`` whose name does not start with ``__`` is copied
    onto the returned callable, so metadata attached to ``func`` stays
    reachable through the wrapper.

    Args:
        func: Callable to bind.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A function taking no arguments that returns ``func(*args, **kwargs)``.
    """
    # The previous version built a fresh ``lru_cache`` wrapper inside the lambda
    # on every call, so the cache was always empty and ``func`` was re-run each
    # time. Memoising in the closure makes the cache actually persist.
    # NOTE(review): once called, the cached value lives in this closure, so
    # pickling an already-called wrapper would carry the value along — confirm
    # that is acceptable wherever wrappers are shipped to joblib workers.
    memo = []

    def wrapper():
        if not memo:
            memo.append(func(*args, **kwargs))
        return memo[0]

    # Mirror the non-dunder attributes of ``func`` onto the wrapper.
    for attr in dir(func):
        if not attr.startswith('__'):
            setattr(wrapper, attr, getattr(func, attr))
    return wrapper

### Dataset selection criteria

* More than 3000 instances
* More than 2 features
* No missing values
* Ideally easy to extract & preprocess

In [4]:
# Experiment configuration handed to librun.run in the run cell below.
# NOTE(review): the recorded cache filename in that cell's output follows the
# order of the "meta" keys, so reordering keys presumably changes cache paths.
matrix = {
    # Dataset fetchers should cache if possible
    # Lambda wrapper required for function to be pickleable (sent to other threads via joblib)
    # Each entry is a (name, zero-argument callable) pair built with wrap().
    "datasets": [
        # Text classification
        
        # https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.31.6090&rep=rep1&type=pdf
        ("newsgroups_faith", wrap(newsgroups, None, ('alt.atheism', 'soc.religion.christian'))),
        ("newsgroups_graphics", wrap(newsgroups, None, ('comp.graphics', 'comp.windows.x'))),
        ("newsgroups_hardware", wrap(newsgroups, None, ('comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'))),
        ("newsgroups_sports_crypto", wrap(newsgroups, None, ('rec.sport.baseball', 'sci.crypt'))),
    
        ("rcv1", wrap(rcv1, None)),
        ("webkb", wrap(webkb, None)),
        ("spamassassin", wrap(spamassassin, None)),
        
        # Image classification
        ("cifar10", wrap(cifar10, None)),
        ("quickdraw", wrap(quickdraw, None)),
        
        # General
        ("shuttle", wrap(shuttle, None)),
        ("covertype", wrap(covertype, None)),
        ("smartphone", wrap(smartphone, None)),
        ("ida2016", wrap(ida2016, None)), # NOTE(review): HAS MISSING VALUES — conflicts with the "No missing values" selection criterion
        ("htru2", wrap(htru2, None)),
        ("malware", wrap(malware, None)),
        ("bidding", wrap(bidding, None)),
        ("swarm", wrap(swarm, None)),
        
        # Bio
        # NOTE(review): unlike every other fetcher, abalone is wrapped without a
        # positional None argument — confirm this is intentional.
        ("abalone", wrap(abalone, )),
        ("splice", wrap(splice, None)),
        
        # Medical
        ("cardio", wrap(cardio, None)),
        ("skin", wrap(skin, None)),
        
    ],
    "dataset_mutators": {
        # Identity mutator: returns its positional arguments as a tuple and ignores keyword arguments.
        "none": (lambda *x, **kwargs: x),
    },
    # (name, query strategy) pairs; uncertainty_stop is pre-bound with n_instances=10.
    "methods": [
        ("uncertainty", partial(uncertainty_stop, n_instances=10)),
    ],
    "models": [
        "svm-linear"
    ],
    "meta": {
        "dataset_size": 1000,
        "labelled_size": 10,
        # Per-dataset test-set sizes. Presumably "*" is the fallback for datasets
        # not listed, and ints are absolute counts while floats are fractions —
        # TODO confirm against librun.
        "test_size": {
            "newsgroups_faith": 500,
            "newsgroups_graphics": 500,
            "newsgroups_hardware": 500,
            "newsgroups_sports_crypto": 500,
            "*": 0.5
        },
        "n_runs": 10,
        "ret_classifiers": True,
        "ensure_y": True,
        "stop_info": True,
        "aggregate": False,
        # (label, predicate) pair: the predicate is True once the learner's
        # y_training has at least 1000 rows. The label "len1000" is what appears
        # in the recorded cache filename in place of the lambda.
        "stop_function": ("len1000", lambda learner: learner.y_training.shape[0] >= 1000)
    }
}

In [5]:
# Metrics requested from librun.run. The first group is sklearn scoring
# functions passed as callables; the rest are string keys (presumably resolved
# to built-in metrics by librun — confirm). Order matches the original list.
capture_metrics = (
    # Classification quality
    [accuracy_score, f1_score, roc_auc_score]
    # Uncertainty statistics
    + [
        "uncertainty_average",
        "uncertainty_min",
        "uncertainty_max",
        "uncertainty_variance",
    ]
    # The same statistics with the "_selected" suffix
    + [
        "uncertainty_average_selected",
        "uncertainty_min_selected",
        "uncertainty_max_selected",
        "uncertainty_variance_selected",
    ]
    # Remaining model/learner diagnostics
    + [
        "entropy_max",
        "n_support",
        "contradictory_information",
    ]
    # "expected_error" is deliberately left out: it is slow.
)

### Dataset information

In [18]:
# Split the (name, fetcher) pairs of the experiment matrix into two parallel
# lists and tabulate one summary row per dataset.
libdatasets.dataset_summary(
    [name for name, _fetcher in matrix['datasets']],
    [fetcher for _name, fetcher in matrix['datasets']],
)

  dataset1 = pd.read_csv("Imitate/Datasets/shuttle.trn", header=None, sep="\s")
  dataset2 = pd.read_csv("Imitate/Datasets/shuttle.tst", header=None, sep="\s")
  data = [dataset() for dataset in datasets]


Dataset                     Instances    Classes    Features  Most common class    Least common class    Domain
------------------------  -----------  ---------  ----------  -------------------  --------------------  --------
newsgroups_faith                 1796          2      125145  15 56%               0 44%                 nlp
newsgroups_graphics              1961          2      125145  5 50%                1 50%                 nlp
newsgroups_hardware              1967          2      125145  2 50%                3 50%                 nlp
newsgroups_sports_crypto         1985          2      125145  9 50%                11 50%                nlp
rcv1                           804414          2       47236  0 53%                1 47%                 nlp
webkb                            4199          4       22981  student 39%          project 12%           nlp
spamassassin                     6051          2       50196  ham 69%              spam 31%              nlp
cifar10    

In [15]:
# Re-execute librun so edits to librun.py are picked up without restarting the kernel.
reload(librun)

<module 'librun' from 'C:\\Users\\Zac\\Programming\\python\\research\\librun.py'>

In [7]:
# Run every configuration in `matrix`, recording the metrics in `capture_metrics`.
# force_run=True presumably bypasses previously cached results — confirm in librun.
# NOTE(review): the recorded output of this cell ends in a FileNotFoundError for a
# file under cache/classifiers/. Likely causes are that the directory does not
# exist, or that the very long generated filename exceeds the Windows path-length
# limit once joined to the working directory — confirm which before re-running.
results = librun.run(matrix, metrics=capture_metrics, force_run=True)

HBox(children=(HTML(value='Experiment'), FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(HTML(value='Run'), FloatProgress(value=0.0, max=10.0), HTML(value='')))

FileNotFoundError: [Errno 2] No such file or directory: 'cache/classifiers/newsgroups_faith__none__uncertainty__svm-linear__dataset_size=1000__labelled_size=10__test_size=500__n_runs=10__ret_classifiers=True__ensure_y=True__stop_info=True__aggregate=False__stop_function=len1000.pickle'

### Plan

* Learn until 1,000 instances have been labelled.
* Use as large a pool as each dataset allows.
* Start from 10 labelled instances, plus any added by `ensure_y`.
* Use a validation set of size ??? (TODO: decide).
* Randomise the split each run, but use a seeded generator
* Report results using autorank?