In [1]:
import warnings
from importlib import reload
from functools import partial, lru_cache
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from joblib import delayed
from modAL import batch
from art.metrics import empirical_robustness
from art.attacks.evasion import *
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tabulate import tabulate
from art.estimators.classification.scikitlearn import ScikitlearnSVC
from sklearn.metrics.pairwise import paired_distances, euclidean_distances
import scipy
from tvregdiff.tvregdiff import TVRegDiff

from ipynb.fs.defs import Bias
from ipynb.fs.defs.Datasets import generateData_twoPills_2D, generateData_twoPills_noNoise_2D, plot_dataset_2D

import libactive
import libadversarial
import libstop
from libactive import MyActiveLearner, active_split
from libadversarial import adversarial, uncertainty, random_batch, uncertainty_stop
from libutil import ProgressParallel
from libdatasets import *
import librun
from librun import run

In [2]:
import libdatasets; reload(libdatasets); from libdatasets import *

In [3]:
def wrap(func, *args, **kwargs):
    wrapper = lambda: lru_cache()(func)(*args, **kwargs)
    for attr in [attr for attr in dir(func) if not attr.startswith('__')]:
        setattr(wrapper, attr, getattr(func, attr))
    return wrapper

## Performance testing

In [4]:
matrix = {
    # Dataset fetchers should cache if possible
    # Lambda wrapper required for function to be pickleable (sent to other threads via joblib)
    "datasets": [
        # Text classification
        
        # https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.31.6090&rep=rep1&type=pdf
        ("newsgroups_faith", wrap(newsgroups, 1000, ('alt.atheism', 'soc.religion.christian'))),
        ("newsgroups_graphics", wrap(newsgroups, 1000, ('comp.graphics', 'comp.windows.x'))),
        ("newsgroups_hardware", wrap(newsgroups, 1000, ('comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'))),
        ("newsgroups_sports_crypto", wrap(newsgroups, 1000, ('rec.sport.baseball', 'sci.crypt'))),
    
        ("rcv1", wrap(rcv1, 1000)),
        ("webkb", wrap(webkb, 1000)),
        ("spamassassin", wrap(spamassassin, 1000)),
        
        # Image classification
        ("cifar10", wrap(cifar10, 1000)),
        ("quickdraw", wrap(quickdraw, 1000)),
        
        # General
        ("shuttle", wrap(shuttle, 1000)),
        ("covertype", wrap(covertype, 1000)),
        ("smartphone", wrap(smartphone, 1000)),
        ("htru2", wrap(htru2, 1000)),
        #("malware", wrap(malware, 1000)), # EXTREMELY SLOW FIT TIMES
        ("bidding", wrap(bidding, 1000)),
        ("swarm", wrap(swarm, 1000)),
        ("bank", wrap(bank, 1000)),
        
        # Bio
        ("abalone", wrap(abalone, 1000)),
        ("splice", wrap(splice, 1000)),
        
        # Medical
        ("cardio", wrap(cardio, 1000)),
        ("skin", wrap(skin, 1000)),
        
    ],
    "dataset_mutators": {
        "none": (lambda *x, **kwargs: x),
    },
    "methods": [
        ("uncertainty", partial(uncertainty_stop, n_instances=10)),
    ],
    "models": [
        "svm-linear"
    ],
    "meta": {
        "dataset_size": 1000,
        "labelled_size": 10,
        "test_size": {
            "newsgroups_faith": 500,
            "newsgroups_graphics": 500,
            "newsgroups_hardware": 500,
            "newsgroups_sports_crypto": 500,
            "*": 0.5
        },
        "n_runs": 10,
        "ret_classifiers": True,
        "ensure_y": True,
        "stop_info": True,
        "aggregate": False,
        "stop_function": ("len1000", lambda learner: learner.y_training.shape[0] >= 1000)
    }
}

In [5]:
# upper bound on total runtime:
n_runs = 100
n_datasets = 40
n_query_methods = 2 # ?
n_models = 1
n_parameter_combinations = 1
stop_size = 1000
batch_size = 10

total_runs = n_runs*n_datasets*n_query_methods*n_models*n_parameter_combinations

In [33]:
reload(libactive)

<module 'libactive' from 'C:\\Users\\Zac\\Programming\\python\\research\\libactive.py'>

In [18]:
def pluralize(n, word):
    if n == 1:
        return '%d %s' % (n, word)
        
    return '%d %ss' % (n, word)
        
def format_duration(seconds):
    if seconds == 0:
        return "now"
    
    ONE_MINUTE = 60
    ONE_HOUR = 60 * ONE_MINUTE
    ONE_DAY = 24 * ONE_HOUR
    ONE_YEAR = 365 * ONE_DAY
    
    units = (
        (ONE_YEAR, 'year'),
        (ONE_DAY, 'day'),
        (ONE_HOUR, 'hour'),
        (ONE_MINUTE, 'minute'),
        (1, 'second'),
    )
        
    r = []
    for unit in units:
        time_period, word = unit
        if seconds >= time_period:
            n = int(seconds / time_period)
            r.append(pluralize(n, word))
            seconds -= n * time_period
    
    return ' and'.join(', '.join(r).rsplit(',', 1))


In [6]:
from time import monotonic
from sklearn.svm import SVC
from libactive import expected_error
from modAL.models import ActiveLearner
from libadversarial import adversarial
from art.attacks.evasion import DeepFool

construct = []
fit = []
inference = []
ee = []
deepfool = []
construct = [134.40000000009604, 146.79999999989377, 137.60000000002037, 131.40000000003056, 996.9999999999345, 6852.999999999884, 940.6000000000859, 1134.400000000096, 1222.0000000001164, 196.80000000007567, 1068.7999999998283]/total_runs*n_datasets
fit = [80939.99999999141, 65000.0, 51239.99999999796, 45939.999999991414, 50920.00000000553, 53440.000000009604, 38760.00000000204, 410319.99999999243, 44059.999999990396, 8439.999999973224, 26802180.00000001]/total_runs*n_datasets
# Assuming 1000 unlabelled instances are subsampled
inference = [14376.525522657837, 12204.181258622519, 9850.36612543792, 7693.9403586968065, 8360.520826340678, 10416.765896641573, 6349.36374153124, 55546.66666666677, 7694.299167411302, 32.413793103152216]/total_runs*n_datasets
deepfool = [176561999.99999988, 111873999.99999979, 108250000.0, 91280000.00000064, 100626000.00000021, 219467999.99999893, 77811999.9999999, 3773405999.999999, 156126000.00000024, 17155999.99999904]/total_runs*n_datasets
ee = [25940000.000009604, 27200000.00001164, 18119999.999998983, 19380000.000001017, 35300000.00000655, 1293440000.0000098, 3439999.999991414, 313119999.999999, 39360000.00001513, 639999.9999848661]/total_runs*n_datasets

In [30]:
n_cores = 100

In [31]:
total_t_est = sum((sum(construct), sum(fit), sum(inference), sum(ee), sum(deepfool)))/len(construct)*n_datasets/n_cores
t_est_no_ee = sum((sum(construct), sum(fit), sum(inference), sum(deepfool)))/len(construct)*n_datasets/n_cores
t_est_no_deep = sum((sum(construct), sum(fit), sum(inference), sum(ee)))/len(construct)*n_datasets/n_cores
t_est_no_deep_no_ee = sum((sum(construct), sum(fit), sum(inference)))/len(construct)*n_datasets/n_cores
print(f"Using {n_cores} cores:")
print(f"  Estimated runtime for {n_runs} runs on {n_datasets} datasets with {n_query_methods} query methods: {format_duration(total_t_est)}")
print(f"  Without deepfool: {format_duration(t_est_no_deep)}d")
print(f"  Without ee: {format_duration(t_est_no_ee)}d")
print(f"  Without deepfool and ee: {format_duration(t_est_no_deep_no_ee)}d")

Using 100 cores:
  Estimated runtime for 100 runs on 40 datasets with 2 query methods: 591 years, 330 days, 13 hours, 56 minutes and 49 seconds
  Without deepfool: 467 years, 343 days, 24 minutes and 46 secondsd
  Without ee: 124 years, 5 days, 17 hours, 46 minutes and 56 secondsd
  Without deepfool and ee: 18 days, 4 hours, 14 minutes and 53 secondsd


In [22]:
print(f"Total fit time: {format_duration(sum(fit))}")
print(f"Total inference time: {format_duration(sum(inference))}")
print(f"Total ee time: {format_duration(sum(ee))}")
print(f"Total deepfool time: {format_duration(sum(deepfool))}")

Total fit time: 320 days, 53 minutes and 59 seconds
Total inference time: 179 days, 16 hours, 19 minutes and 40 seconds
Total ee time: 12866 years, 355 days, 14 hours, 31 minutes and 26 seconds
Total deepfool time: 3409 years, 23 days, 11 minutes and 20 seconds


Options for running EE & deepfool:

* Subsamble the unlabelled pool

In [35]:
inference_sub = []
ee_sub = []
deepfool_sub = []
for i, (name, dataset) in enumerate(matrix['datasets'][:10]):
    x_all = getattr(libdatasets, name if not name.startswith("newsgroups") else "newsgroups")(dataset_size=None)[0]
    inference_sub.append(inference[i]/x_all.shape[0]*1000)
    ee_sub.append(ee[i]/x_all.shape[0]*1000)
    deepfool_sub.append(deepfool[i]/x_all.shape[0]*1000)

  dataset1 = pd.read_csv("Imitate/Datasets/shuttle.trn", header=None, sep="\s")
  dataset2 = pd.read_csv("Imitate/Datasets/shuttle.tst", header=None, sep="\s")


In [42]:
print(inference_sub)
print(ee_sub)
print(deepfool_sub)

[14376.525522657837, 12204.181258622519, 9850.36612543792, 7693.9403586968065, 8360.520826340678, 10416.765896641573, 6349.36374153124, 55546.66666666677, 7694.299167411302, 32.413793103152216]
[176561999.99999988, 111873999.99999979, 108250000.0, 91280000.00000064, 100626000.00000021, 219467999.99999893, 77811999.9999999, 3773405999.999999, 156126000.00000024, 17155999.99999904]
[25940000.000009604, 27200000.00001164, 18119999.999998983, 19380000.000001017, 35300000.00000655, 1293440000.0000098, 3439999.999991414, 313119999.999999, 39360000.00001513, 639999.9999848661]


In [36]:
print(f"Total fit time: {format_duration(sum(fit))}")
print(f"Subsampled inference time: {format_duration(sum(inference_sub))}")
print(f"Subsampled ee time: {format_duration(sum(ee_sub))}")
print(f"Subsampled deepfool time: {format_duration(sum(deepfool_sub))}")

Total fit time: 320 days, 53 minutes and 59 seconds
Subsampled inference time: 1 day, 12 hours, 48 minutes and 45 seconds
Subsampled ee time: 153 years, 87 days, 9 hours, 46 minutes and 39 seconds
Subsampled deepfool time: 56 years, 114 days, 20 hours and 40 minutes


In [40]:
n_cores=2000

In [41]:
total_t_est = sum((sum(construct), sum(fit), sum(inference_sub), sum(ee_sub), sum(deepfool_sub)))/len(construct)*n_datasets/n_cores
t_est_no_ee = sum((sum(construct), sum(fit), sum(inference_sub), sum(deepfool_sub)))/len(construct)*n_datasets/n_cores
t_est_no_deep = sum((sum(construct), sum(fit), sum(inference_sub), sum(ee_sub)))/len(construct)*n_datasets/n_cores
t_est_no_deep_no_ee = sum((sum(construct), sum(fit), sum(inference_sub)))/len(construct)*n_datasets/n_cores
print(f"Using {n_cores} cores and subsampling:")
print(f"  Estimated runtime for {n_runs} runs on {n_datasets} datasets with {n_query_methods} query methods: {format_duration(total_t_est)}")
print(f"  Without deepfool: {format_duration(t_est_no_deep)}d")
print(f"  Without ee: {format_duration(t_est_no_ee)}d")
print(f"  Without deepfool and ee: {format_duration(t_est_no_deep_no_ee)}d")

Using 2000 cores and subsampling:
  Estimated runtime for 100 runs on 40 datasets with 2 query methods: 139 days, 15 hours, 39 minutes and 54 seconds
  Without deepfool: 102 days, 6 hours, 43 minutes and 32 secondsd
  Without ee: 37 days, 22 hours, 58 minutes and 41 secondsd
  Without deepfool and ee: 14 hours, 2 minutes and 19 secondsd


In [28]:
#construct=construct[:-1]
#fit=fit[:-1]
#inference=inference[:-1]

In [31]:
len(construct),len(fit),len(inference),len(ee),len(deepfool)

(10, 10, 10, 10, 10)

In [38]:
print(construct)
print(fit)
print(inference)
print(ee)
print(deepfool)

[134.40000000009604, 146.79999999989377, 137.60000000002037, 131.40000000003056, 996.9999999999345, 6852.999999999884, 940.6000000000859, 1134.400000000096, 1222.0000000001164, 196.80000000007567, 1068.7999999998283]
[80939.99999999141, 65000.0, 51239.99999999796, 45939.999999991414, 50920.00000000553, 53440.000000009604, 38760.00000000204, 410319.99999999243, 44059.999999990396, 8439.999999973224, 26802180.00000001]
[270940.0000000096, 230000.0, 185640.00000000306, 145000.0, 6725320.00000001, 43739.99999999796, 38420.00000000553, 3332800.0000000065, 4279700.000000012, 1879.9999999828287, 270940.0000000096]
[3327487451.9999976, 2108377403.9999962, 2040079500.0, 1720262880.0000122, 80944963164.00017, 921546131.9999955, 470840411.99999934, 226404359999.99994, 86839935342.00012, 995047999.9999443]
[488865240.000181, 512611200.0002194, 341489519.9999808, 365235480.0000192, 28395814200.00527, 5431154560.000041, 20815439.999948047, 18787199999.99994, 21892701120.00842, 37119999.99912223, 312

In [37]:
for name, dataset in matrix['datasets'][10:]:
    clf = SVC(kernel='linear', probability=True)
    
    start = monotonic()
    X, y = dataset()
    if X.dtype != np.float64:
        X = X.astype(np.float64)
    construct.append((monotonic()-start)*total_runs/n_datasets)
    
    start = monotonic()
    clf.fit(X,y)
    fit.append((monotonic()-start)*total_runs/n_datasets*stop_size/batch_size)
    
    x_all = getattr(libdatasets, name if not name.startswith("newsgroups") else "newsgroups")(dataset_size=None)[0]
    if x_all.dtype != np.float64:
        x_all = x_all.astype(np.float64)
    
    start = monotonic()
    clf.predict_proba(x_all)
    inference.append((monotonic()-start)*total_runs/n_datasets*stop_size/batch_size)
    
    learner = ActiveLearner(estimator=clf, X_training=X, y_training=y)
    
    start = monotonic()
    adversarial(learner, x_all[np.random.choice(x_all.shape[0], 1)], partial(DeepFool, verbose=False), n_instances=10)
    deepfool.append((monotonic()-start)*total_runs/n_datasets*stop_size/batch_size*x_all.shape[0])
    
    start = monotonic()
    expected_error(learner, x_all[np.random.choice(x_all.shape[0], 10)])
    ee.append((monotonic()-start)*total_runs/n_datasets*stop_size/batch_size*x_all.shape[0]/10)
    
    print(f"Finished {name}")

KeyboardInterrupt: 

In [54]:
reload(libactive); from libactive import expected_error

In [53]:
X, y = matrix['datasets'][0][1]()
clf = SVC(kernel='linear', probability=True)
clf.fit(X,y)
x_all = newsgroups(None)[0]
print(x_all.shape)

In [None]:
expected_error(ActiveLearner(clf, X_training=X, y_training=y), x_all)

In [49]:
np.expand_dims([1,2,3], axis=0)

array([[[1, 2, 3],
        [4, 5, 6]]])

MALWARE FIT DID NOT FINISH (07:30:30.xxx CPU time)

### Dataset selection criteria

* More than 3000 instances
* More than 2 features
* No missing values
* Ideally easy to extract & preprocess

In [4]:
matrix = {
    # Dataset fetchers should cache if possible
    # Lambda wrapper required for function to be pickleable (sent to other threads via joblib)
    "datasets": [
        # Text classification
        
        # https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.31.6090&rep=rep1&type=pdf
        ("newsgroups_faith", wrap(newsgroups, None, ('alt.atheism', 'soc.religion.christian'))),
        ("newsgroups_graphics", wrap(newsgroups, None, ('comp.graphics', 'comp.windows.x'))),
        ("newsgroups_hardware", wrap(newsgroups, None, ('comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'))),
        ("newsgroups_sports_crypto", wrap(newsgroups, None, ('rec.sport.baseball', 'sci.crypt'))),
    
        ("rcv1", wrap(rcv1, None)),
        ("webkb", wrap(webkb, None)),
        ("spamassassin", wrap(spamassassin, None)),
        
        # Image classification
        ("cifar10", wrap(cifar10, None)),
        ("quickdraw", wrap(quickdraw, None)),
        
        # General
        ("shuttle", wrap(shuttle, None)),
        ("covertype", wrap(covertype, None)), # fit takes a million years (1233s for 1000 instances)
        ("smartphone", wrap(smartphone, None)),
        ("ida2016", wrap(ida2016, None)), # HAS MISSING VALUES
        ("htru2", wrap(htru2, None)),
        ("malware", wrap(malware, None)),
        ("bidding", wrap(bidding, None)),
        ("swarm", wrap(swarm, None)),
        
        # Bio
        ("abalone", wrap(abalone, )),
        ("splice", wrap(splice, None)),
        
        # Medical
        ("cardio", wrap(cardio, None)),
        ("skin", wrap(skin, None)),
        
    ],
    "dataset_mutators": {
        "none": (lambda *x, **kwargs: x),
    },
    "methods": [
        ("uncertainty", partial(uncertainty_stop, n_instances=10)),
    ],
    "models": [
        "svm-linear"
    ],
    "meta": {
        "dataset_size": 1000,
        "labelled_size": 10,
        "test_size": {
            "newsgroups_faith": 500,
            "newsgroups_graphics": 500,
            "newsgroups_hardware": 500,
            "newsgroups_sports_crypto": 500,
            "*": 0.5
        },
        "n_runs": 10,
        "ret_classifiers": True,
        "ensure_y": True,
        "stop_info": True,
        "aggregate": False,
        "stop_function": ("len1000", lambda learner: learner.y_training.shape[0] >= 1000)
    }
}

In [5]:
capture_metrics = [
    accuracy_score,
    f1_score,
    roc_auc_score,
    
    "uncertainty_average",
    "uncertainty_min",
    "uncertainty_max",
    "uncertainty_variance",
    "uncertainty_average_selected",
    "uncertainty_min_selected",
    "uncertainty_max_selected",
    "uncertainty_variance_selected",
    "entropy_max",
    "n_support",
    "contradictory_information",
    # slow
    #"expected_error"
]

### Dataset information

In [18]:
libdatasets.dataset_summary([data[0] for data in matrix['datasets']], [data[1] for data in matrix['datasets']])

  dataset1 = pd.read_csv("Imitate/Datasets/shuttle.trn", header=None, sep="\s")
  dataset2 = pd.read_csv("Imitate/Datasets/shuttle.tst", header=None, sep="\s")
  data = [dataset() for dataset in datasets]


Dataset                     Instances    Classes    Features  Most common class    Least common class    Domain
------------------------  -----------  ---------  ----------  -------------------  --------------------  --------
newsgroups_faith                 1796          2      125145  15 56%               0 44%                 nlp
newsgroups_graphics              1961          2      125145  5 50%                1 50%                 nlp
newsgroups_hardware              1967          2      125145  2 50%                3 50%                 nlp
newsgroups_sports_crypto         1985          2      125145  9 50%                11 50%                nlp
rcv1                           804414          2       47236  0 53%                1 47%                 nlp
webkb                            4199          4       22981  student 39%          project 12%           nlp
spamassassin                     6051          2       50196  ham 69%              spam 31%              nlp
cifar10    

In [15]:
reload(librun)

<module 'librun' from 'C:\\Users\\Zac\\Programming\\python\\research\\librun.py'>

In [7]:
results = librun.run(matrix, metrics=capture_metrics, force_run=True)

HBox(children=(HTML(value='Experiment'), FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(HTML(value='Run'), FloatProgress(value=0.0, max=10.0), HTML(value='')))

FileNotFoundError: [Errno 2] No such file or directory: 'cache/classifiers/newsgroups_faith__none__uncertainty__svm-linear__dataset_size=1000__labelled_size=10__test_size=500__n_runs=10__ret_classifiers=True__ensure_y=True__stop_info=True__aggregate=False__stop_function=len1000.pickle'

### Plan

* Learn to 1,000 instances. 
* Use a pool of as much data as possible for the dataset. 
* Start at 10+ensure_y instances
* Use a validation set size of ???
* Randomise the split each run, but use a seeded generator
* Report results using autorank?