In [6]:
import warnings
from importlib import reload
from functools import partial, lru_cache
import itertools
from time import monotonic

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from joblib import delayed
from modAL import batch
from art.metrics import empirical_robustness
from art.attacks.evasion import *
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tabulate import tabulate
from art.estimators.classification.scikitlearn import ScikitlearnSVC
from sklearn.metrics.pairwise import paired_distances, euclidean_distances
import scipy
from tvregdiff.tvregdiff import TVRegDiff
from tabulate import tabulate

import ipywidgets as widgets
from ipywidgets import interact

import libactive
import libadversarial
import libstop
from libactive import MyActiveLearner, active_split
from libadversarial import adversarial, uncertainty, random_batch, uncertainty_stop
from libutil import ProgressParallel
from libdatasets import *
import librun
from librun import run

In [20]:
import libdatasets; reload(libdatasets); from libdatasets import *

In [22]:
matrix = {
    # Dataset fetchers should cache if possible
    # Lambda wrapper required for function to be pickleable (sent to other threads via joblib)
    "datasets": [
        # Text classification
        

        ("newsgroups", wrap(newsgroups, None)),  
        ("newsgroups_svd", wrap(newsgroups_svd, None)),  
        ("rcv1", wrap(rcv1, None)),
        ("webkb", wrap(webkb, None)),
        ("webkb_svd", wrap(webkb_svd, None)),
        ("spamassassin", wrap(spamassassin, None)),
        
        # Image classification
        ("cifar10", wrap(cifar10, None)),
        ("quickdraw", wrap(quickdraw, None)),
        ("avila", wrap(avila, None)),
        
        # General
        ("shuttle", wrap(shuttle, None)),
        ("covertype", wrap(covertype, None)), 
        ("smartphone", wrap(smartphone, None)),
        ("htru2", wrap(htru2, None)),
        ("malware", wrap(malware, None)), 
        ("bidding", wrap(bidding, None)),
        ("swarm", wrap(swarm, None)),
        ("bank", wrap(bank, None)),
        ("buzz", wrap(buzz, None)), 
        ("sensorless", wrap(sensorless, None)),
        ("dota2", wrap(dota2, None)),
        
        # Bio
        ("abalone", wrap(abalone, None)),
        ("splice", wrap(splice, None)),
        ("anuran", wrap(anuran, None)),
        
        # Medical
        ("cardio", wrap(cardio, None)),
        ("skin", wrap(skin, None)),
        
    ],
    "dataset_mutators": {
        "none": (lambda *x, **kwargs: x),
    },
    "methods": [
        ("uncertainty", partial(uncertainty_stop, n_instances=10)),
    ],
    "models": [
        "svm-linear"
    ],
    "meta": {
        "dataset_size": 1000,
        "labelled_size": 10,
        "test_size": {
            "newsgroups_faith": 500,
            "newsgroups_graphics": 500,
            "newsgroups_hardware": 500,
            "newsgroups_sports_crypto": 500,
            "*": 0.5
        },
        "n_runs": 10,
        "ret_classifiers": True,
        "ensure_y": True,
        "stop_info": True,
        "aggregate": False,
        "stop_function": ("len1000", lambda learner: learner.y_training.shape[0] >= 1000),
        "pool_subsample": 1000
    }
}


In [3]:
from dotenv import load_dotenv; load_dotenv();

In [5]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from time import monotonic
results = []
for (name, dataset) in matrix['datasets']:
    print(name)
    #if name != "spamassassin": continue
    X, y = dataset()
    X = StandardScaler(with_mean=not isinstance(X, scipy.sparse.csr_matrix)).fit_transform(X)
    
    idx = np.random.choice(X.shape[0], X.shape[0], replace=False)
    X = X[idx]
    y = y[idx]

    
    clf = SVC(kernel='linear', probability=True)

    y_short = y[:10]
    X_short = X[:10]
    for klass in np.unique(y):
        if klass not in y_short:
            idx = np.where(y==klass)[0][0]
            y_short = np.concatenate((y_short, [y[idx]]), axis=0)
            if isinstance(X_short, scipy.sparse.csr_matrix):
                X_short = scipy.sparse.vstack((
                    X_short, 
                    X[idx]
                ))
            else:
                X_short = np.concatenate((X_short, [X[idx]]), axis=0)
    #print(np.unique(y_short))

    clf.fit(X_short, y_short)
    start = clf.score(X[-1000:], y[-1000:])
    start_t = monotonic()
    clf.fit(X[:1000], y[:1000])
    time = monotonic() - start_t
    final = clf.score(X[-1000:], y[-1000:])
    results.append([name, start, final, time])

newsgroups
rcv1
webkb
spamassassin
cifar10
quickdraw
avila
shuttle
covertype
smartphone


  return f(*args, **kwargs)
  return f(*args, **kwargs)


htru2
malware
bidding
swarm
bank
buzz
sensorless
dota2
abalone
splice
anuran
cardio
skin


In [6]:
from tabulate import tabulate

In [7]:
results = np.array(results)

In [8]:
print(tabulate(np.hstack((results, np.expand_dims(results[:,2].astype(float)-results[:,1].astype(float), axis=1))), headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))

Name            Initial acc    Final acc    Time    Diff
------------  -------------  -----------  ------  ------
newsgroups            0.041        0.099   9.609   0.058
rcv1                  0.511        0.84    3.094   0.329
webkb                 0.208        0.67    9.359   0.462
spamassassin          0.336        0.966  11.344   0.63
cifar10               0.149        0.285   8.844   0.136
quickdraw             0.569        0.785   0.64    0.216
avila                 0.271        0.556   0.266   0.285
shuttle               0.905        0.981   0.032   0.076
covertype             0.39         0.71    0.328   0.32
smartphone            0.639        0.927   0.313   0.288
htru2                 0.964        0.977   0.015   0.013
malware               0.89         0.97    0.422   0.08
bidding               0.898        0.899   0.375   0.001
swarm                 0.721        0.999   3.438   0.278
bank                  0.877        0.865   0.421  -0.012
buzz                  0.741       

# SVD N=100

In [11]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from time import monotonic
results_not_norm = []
for (name, dataset) in matrix['datasets']:
    if name == "malware" or name == "covertype":
        results_not_norm.append([name, 0, 0, 0])
        continue
        
    X, y = dataset()
    #X = StandardScaler(with_mean=not isinstance(X, scipy.sparse.csr_matrix)).fit_transform(X)
    
    idx = np.random.choice(X.shape[0], X.shape[0], replace=False)
    X = X[idx]
    y = y[idx]

    
    clf = SVC(kernel='linear', probability=True)

    y_short = y[:10]
    X_short = X[:10]
    for klass in np.unique(y):
        if klass not in y_short:
            idx = np.where(y==klass)[0][0]
            y_short = np.concatenate((y_short, [y[idx]]), axis=0)
            if isinstance(X_short, scipy.sparse.csr_matrix):
                X_short = scipy.sparse.vstack((
                    X_short, 
                    X[idx]
                ))
            else:
                X_short = np.concatenate((X_short, [X[idx]]), axis=0)
    #print(np.unique(y_short))

    clf.fit(X_short, y_short)
    start = clf.score(X[-1000:], y[-1000:])
    start_t = monotonic()
    clf.fit(X[:1000], y[:1000])
    time = monotonic() - start_t
    final = clf.score(X[-1000:], y[-1000:])
    results_not_norm.append([name, start, final, time])

In [12]:
results_not_norm = np.array(results_not_norm)
print(tabulate(np.hstack((results_not_norm, np.expand_dims(results_not_norm[:,2].astype(float)-results_not_norm[:,1].astype(float), axis=1))), headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))

Name              Initial acc    Final acc    Time    Diff
--------------  -------------  -----------  ------  ------
newsgroups              0.05         0.416   8.984   0.366
newsgroups_svd          0.081        0.303   0.5     0.222
rcv1                    0.517        0.889   2.64    0.372
webkb                   0.301        0.892   3.078   0.591
webkb_svd               0.492        0.844   0.562   0.352
spamassassin            0.784        0.959   2.313   0.175
cifar10                 0.177        0.284   9.734   0.107
quickdraw               0.496        0.775   0.703   0.279
avila                   0.314        0.592   0.219   0.278
shuttle                 0.918        0.981   0.016   0.063
covertype               0            0       0       0
smartphone              0.567        0.955   0.266   0.388
htru2                   0.918        0.98    0.031   0.062
malware                 0            0       0       0
bidding                 0.893        0.99    0.078   0.097
swarm

# SVD N=200

In [24]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from time import monotonic
results_svd2 = []
for (name, dataset) in matrix['datasets']:
        
    X, y = dataset()
    #X = StandardScaler(with_mean=not isinstance(X, scipy.sparse.csr_matrix)).fit_transform(X)
    
    idx = np.random.choice(X.shape[0], X.shape[0], replace=False)
    X = X[idx]
    y = y[idx]

    
    clf = SVC(kernel='linear', probability=True)

    y_short = y[:10]
    X_short = X[:10]
    for klass in np.unique(y):
        if klass not in y_short:
            idx = np.where(y==klass)[0][0]
            y_short = np.concatenate((y_short, [y[idx]]), axis=0)
            if isinstance(X_short, scipy.sparse.csr_matrix):
                X_short = scipy.sparse.vstack((
                    X_short, 
                    X[idx]
                ))
            else:
                X_short = np.concatenate((X_short, [X[idx]]), axis=0)
    #print(np.unique(y_short))

    clf.fit(X_short, y_short)
    start = clf.score(X[-1000:], y[-1000:])
    start_t = monotonic()
    clf.fit(X[:1000], y[:1000])
    time = monotonic() - start_t
    final = clf.score(X[-1000:], y[-1000:])
    results_svd2.append([name, start, final, time])

In [25]:
results_svd2 = np.array(results_svd2)
print(tabulate(np.hstack((results_svd2, np.expand_dims(results_svd2[:,2].astype(float)-results_svd2[:,1].astype(float), axis=1))), headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))

Name              Initial acc    Final acc    Time    Diff
--------------  -------------  -----------  ------  ------
newsgroups              0.047        0.429   8.484   0.382
newsgroups_svd          0.063        0.354   0.625   0.291
rcv1                    0.528        0.89    2.437   0.362
webkb                   0.419        0.88    2.937   0.461
webkb_svd               0.463        0.848   0.234   0.385
spamassassin            0.815        0.966   2.078   0.151
cifar10                 0.17         0.277   7.61    0.107
quickdraw               0.283        0.722   0.641   0.439
avila                   0.37         0.566   0.219   0.196
shuttle                 0.931        0.972   0.016   0.041
covertype               0.505        0.708   0.36    0.203
smartphone              0.613        0.955   0.265   0.342
htru2                   0.945        0.977   0.032   0.032
malware                 0.843        0.978   0.375   0.135
bidding                 0.882        0.988   0.078   0.1

In [12]:
results = np.hstack((results, np.expand_dims(results[:,2].astype(float)-results[:,1].astype(float), axis=1)))

In [13]:
results_not_norm = np.hstack((results_not_norm, np.expand_dims(results_not_norm[:,2].astype(float)-results_not_norm[:,1].astype(float), axis=1)))

In [29]:
print("Normalized results - non normalized results")
print("Time positive = normalized slower ")
print("Diff positive = normalized improved more")
print()
print(tabulate(np.vstack((results[:,0], results[:,3].astype(float)-results_not_norm[:,3].astype(float), results[:,4].astype(float)-results_not_norm[:,4].astype(float))).T, headers=["Name", "Time Diff", "Diff"]))

Normalized results - non normalized results
Time positive = normalized slower 
Diff positive = normalized improved more

Name            Time Diff    Diff
------------  -----------  ------
newsgroups         -0.172  -0.281
rcv1                0.282  -0.108
webkb               5.781  -0.014
spamassassin        9.266   0.443
cifar10             0.969  -0.011
quickdraw          -0.032  -0.076
avila               0.016  -0.048
shuttle            -0.858   0.023
covertype           0.328   0.32
smartphone          0.047  -0.212
htru2              -0.766   0.003
malware             0.422   0.08
bidding             0.297  -0.081
swarm              -0.093  -0.049
bank                0.124  -0.003
buzz              -38.687   0.13
sensorless        -12.031   0.413
dota2              -2.031  -0.037
abalone            -0.281  -0.132
splice              0.218  -0.037
anuran             -0.016  -0.258
cardio             -0.593   0.069
skin               -2.688   0.045


In [30]:
compared = np.vstack((results[:,0], results[:,3].astype(float)-results_not_norm[:,3].astype(float), results[:,4].astype(float)-results_not_norm[:,4].astype(float))).T

In [39]:
print(compared[:,0][np.where((compared[:,1].astype(float) < 0) & (compared[:,2].astype(float) > 0))])

['shuttle' 'htru2' 'buzz' 'sensorless' 'cardio' 'skin']


Also normalizing covertype, malware because 

In [18]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from time import monotonic
results_search = []
for (name, dataset) in matrix['datasets']:
        
    X, y = dataset()
    
    idx = np.random.choice(X.shape[0], X.shape[0], replace=False)
    X = X[idx]
    y = y[idx]

    y_short = y[:10]
    X_short = X[:10]
    
    clf = SVC(kernel='linear', probability=True)
    search = sklearn.model_selection.RandomizedSearchCV(clf, {'C': scipy.stats.expon(scale=100)}, cv=2).fit(X_short, y_short)
    clf = SVC(kernel='linear', probability=True, **search.best_params_)
    
    for klass in np.unique(y):
        if klass not in y_short:
            idx = np.where(y==klass)[0][0]
            y_short = np.concatenate((y_short, [y[idx]]), axis=0)
            if isinstance(X_short, scipy.sparse.csr_matrix):
                X_short = scipy.sparse.vstack((
                    X_short, 
                    X[idx]
                ))
            else:
                X_short = np.concatenate((X_short, [X[idx]]), axis=0)
    #print(np.unique(y_short))

    clf.fit(X_short, y_short)
    start = clf.score(X[-1000:], y[-1000:])
    start_t = monotonic()
    clf.fit(X[:1000], y[:1000])
    time = monotonic() - start_t
    final = clf.score(X[-1000:], y[-1000:])
    results_search.append([name, start, final, time])

Traceback (most recent call last):
  File "c:\users\zac\appdata\local\programs\python\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zac\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 173, in fit
    y = self._validate_targets(y)
  File "c:\users\zac\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 558, in _validate_targets
    raise ValueError(
ValueError: The number of classes has to be greater than one; got 1 class

Traceback (most recent call last):
  File "c:\users\zac\appdata\local\programs\python\python38\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zac\appdata\local\programs\python\python38\lib\site-packages\sklearn\svm\_base.py", line 173, in fit
    y = self._validate_targets(y)
  F

KeyboardInterrupt: 

In [None]:
results_search = np.array(results_search)
print(tabulate(np.hstack((results_search, np.expand_dims(results_search[:,2].astype(float)-results_search[:,1].astype(float), axis=1))), headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))