In [1]:
import warnings
from importlib import reload
from functools import partial, lru_cache
import itertools
from time import monotonic

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from joblib import delayed
from modAL import batch
from art.metrics import empirical_robustness
from art.attacks.evasion import *
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tabulate import tabulate
from art.estimators.classification.scikitlearn import ScikitlearnSVC
from sklearn.metrics.pairwise import paired_distances, euclidean_distances
import scipy
from tvregdiff.tvregdiff import TVRegDiff
from tabulate import tabulate

from ipynb.fs.defs import Bias
from ipynb.fs.defs.Datasets import generateData_twoPills_2D, generateData_twoPills_noNoise_2D, plot_dataset_2D
import ipywidgets as widgets
from ipywidgets import interact

import libactive
import libadversarial
import libstop
from libactive import MyActiveLearner, active_split
from libadversarial import adversarial, uncertainty, random_batch, uncertainty_stop
from libutil import ProgressParallel
from libdatasets import *
import librun
from librun import run

In [3]:
matrix = {
    # Dataset fetchers should cache if possible
    # Lambda wrapper required for function to be pickleable (sent to other threads via joblib)
    "datasets": [
        # Text classification
        

        ("newsgroups", wrap(newsgroups, None)),    
        ("rcv1", wrap(rcv1, None)),
        ("webkb", wrap(webkb, None)),
        ("spamassassin", wrap(spamassassin, None)),
        
        # Image classification
        ("cifar10", wrap(cifar10, None)),
        ("quickdraw", wrap(quickdraw, None)),
        ("avila", wrap(avila, None)),
        
        # General
        ("shuttle", wrap(shuttle, None)),
        ("covertype", wrap(covertype, None)), 
        ("smartphone", wrap(smartphone, None)),
        ("htru2", wrap(htru2, None)),
        ("malware", wrap(malware, None)), 
        ("bidding", wrap(bidding, None)),
        ("swarm", wrap(swarm, None)),
        ("bank", wrap(bank, None)),
        ("buzz", wrap(buzz, None)), 
        ("sensorless", wrap(sensorless, None)),
        ("dota2", wrap(dota2, None)),
        
        # Bio
        ("abalone", wrap(abalone, None)),
        ("splice", wrap(splice, None)),
        ("anuran", wrap(anuran, None)),
        
        # Medical
        ("cardio", wrap(cardio, None)),
        ("skin", wrap(skin, None)),
        
    ],
    "dataset_mutators": {
        "none": (lambda *x, **kwargs: x),
    },
    "methods": [
        ("uncertainty", partial(uncertainty_stop, n_instances=10)),
    ],
    "models": [
        "svm-linear"
    ],
    "meta": {
        "dataset_size": 1000,
        "labelled_size": 10,
        "test_size": {
            "newsgroups_faith": 500,
            "newsgroups_graphics": 500,
            "newsgroups_hardware": 500,
            "newsgroups_sports_crypto": 500,
            "*": 0.5
        },
        "n_runs": 10,
        "ret_classifiers": True,
        "ensure_y": True,
        "stop_info": True,
        "aggregate": False,
        "stop_function": ("len1000", lambda learner: learner.y_training.shape[0] >= 1000),
        "pool_subsample": 1000
    }
}


In [2]:
import os
os.environ['DATASET_DIR'] = r'C:\Users\Zac\Programming\python\research\datasets\cache'

In [5]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from time import monotonic
results = []
for (name, dataset) in matrix['datasets']:
    print(name)
    #if name != "spamassassin": continue
    X, y = dataset()
    X = StandardScaler(with_mean=not isinstance(X, scipy.sparse.csr_matrix)).fit_transform(X)
    
    idx = np.random.choice(X.shape[0], X.shape[0], replace=False)
    X = X[idx]
    y = y[idx]

    
    clf = SVC(kernel='linear', probability=True)

    y_short = y[:10]
    X_short = X[:10]
    for klass in np.unique(y):
        if klass not in y_short:
            idx = np.where(y==klass)[0][0]
            y_short = np.concatenate((y_short, [y[idx]]), axis=0)
            if isinstance(X_short, scipy.sparse.csr_matrix):
                X_short = scipy.sparse.vstack((
                    X_short, 
                    X[idx]
                ))
            else:
                X_short = np.concatenate((X_short, [X[idx]]), axis=0)
    #print(np.unique(y_short))

    clf.fit(X_short, y_short)
    start = clf.score(X[-1000:], y[-1000:])
    start_t = monotonic()
    clf.fit(X[:1000], y[:1000])
    time = monotonic() - start_t
    final = clf.score(X[-1000:], y[-1000:])
    results.append([name, start, final, time])

newsgroups
rcv1
webkb
spamassassin
cifar10
quickdraw
avila
shuttle
covertype
smartphone


  return f(*args, **kwargs)
  return f(*args, **kwargs)


htru2
malware
bidding
swarm
bank
buzz
sensorless
dota2
abalone
splice
anuran
cardio
skin


In [6]:
from tabulate import tabulate

In [7]:
results = np.array(results)

In [8]:
print(tabulate(np.hstack((results, np.expand_dims(results[:,2].astype(float)-results[:,1].astype(float), axis=1))), headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))

Name            Initial acc    Final acc    Time    Diff
------------  -------------  -----------  ------  ------
newsgroups            0.041        0.099   9.609   0.058
rcv1                  0.511        0.84    3.094   0.329
webkb                 0.208        0.67    9.359   0.462
spamassassin          0.336        0.966  11.344   0.63
cifar10               0.149        0.285   8.844   0.136
quickdraw             0.569        0.785   0.64    0.216
avila                 0.271        0.556   0.266   0.285
shuttle               0.905        0.981   0.032   0.076
covertype             0.39         0.71    0.328   0.32
smartphone            0.639        0.927   0.313   0.288
htru2                 0.964        0.977   0.015   0.013
malware               0.89         0.97    0.422   0.08
bidding               0.898        0.899   0.375   0.001
swarm                 0.721        0.999   3.438   0.278
bank                  0.877        0.865   0.421  -0.012
buzz                  0.741       

In [9]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from time import monotonic
results_not_norm = []
for (name, dataset) in matrix['datasets']:
    if name == "malware" or name == "covertype":
        results_not_norm.append([name, 0, 0, 0])
        continue
        
    X, y = dataset()
    #X = StandardScaler(with_mean=not isinstance(X, scipy.sparse.csr_matrix)).fit_transform(X)
    
    idx = np.random.choice(X.shape[0], X.shape[0], replace=False)
    X = X[idx]
    y = y[idx]

    
    clf = SVC(kernel='linear', probability=True)

    y_short = y[:10]
    X_short = X[:10]
    for klass in np.unique(y):
        if klass not in y_short:
            idx = np.where(y==klass)[0][0]
            y_short = np.concatenate((y_short, [y[idx]]), axis=0)
            if isinstance(X_short, scipy.sparse.csr_matrix):
                X_short = scipy.sparse.vstack((
                    X_short, 
                    X[idx]
                ))
            else:
                X_short = np.concatenate((X_short, [X[idx]]), axis=0)
    #print(np.unique(y_short))

    clf.fit(X_short, y_short)
    start = clf.score(X[-1000:], y[-1000:])
    start_t = monotonic()
    clf.fit(X[:1000], y[:1000])
    time = monotonic() - start_t
    final = clf.score(X[-1000:], y[-1000:])
    results_not_norm.append([name, start, final, time])

  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [10]:
results_not_norm = np.array(results_not_norm)
print(tabulate(np.hstack((results_not_norm, np.expand_dims(results_not_norm[:,2].astype(float)-results_not_norm[:,1].astype(float), axis=1))), headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))

Name            Initial acc    Final acc    Time    Diff
------------  -------------  -----------  ------  ------
newsgroups            0.066        0.405   9.781   0.339
rcv1                  0.482        0.919   2.812   0.437
webkb                 0.424        0.9     3.578   0.476
spamassassin          0.781        0.968   2.078   0.187
cifar10               0.137        0.284   7.875   0.147
quickdraw             0.486        0.778   0.672   0.292
avila                 0.256        0.589   0.25    0.333
shuttle               0.915        0.968   0.89    0.053
covertype             0            0       0       0
smartphone            0.437        0.937   0.266   0.5
htru2                 0.969        0.979   0.781   0.01
malware               0            0       0       0
bidding               0.903        0.985   0.078   0.082
swarm                 0.635        0.962   3.531   0.327
bank                  0.891        0.882   0.297  -0.009
buzz                  0.942        0.987  

In [12]:
results = np.hstack((results, np.expand_dims(results[:,2].astype(float)-results[:,1].astype(float), axis=1)))

In [13]:
results_not_norm = np.hstack((results_not_norm, np.expand_dims(results_not_norm[:,2].astype(float)-results_not_norm[:,1].astype(float), axis=1)))

In [29]:
print("Normalized results - non normalized results")
print("Time positive = normalized slower ")
print("Diff positive = normalized improved more")
print()
print(tabulate(np.vstack((results[:,0], results[:,3].astype(float)-results_not_norm[:,3].astype(float), results[:,4].astype(float)-results_not_norm[:,4].astype(float))).T, headers=["Name", "Time Diff", "Diff"]))

Normalized results - non normalized results
Time positive = normalized slower 
Diff positive = normalized improved more

Name            Time Diff    Diff
------------  -----------  ------
newsgroups         -0.172  -0.281
rcv1                0.282  -0.108
webkb               5.781  -0.014
spamassassin        9.266   0.443
cifar10             0.969  -0.011
quickdraw          -0.032  -0.076
avila               0.016  -0.048
shuttle            -0.858   0.023
covertype           0.328   0.32
smartphone          0.047  -0.212
htru2              -0.766   0.003
malware             0.422   0.08
bidding             0.297  -0.081
swarm              -0.093  -0.049
bank                0.124  -0.003
buzz              -38.687   0.13
sensorless        -12.031   0.413
dota2              -2.031  -0.037
abalone            -0.281  -0.132
splice              0.218  -0.037
anuran             -0.016  -0.258
cardio             -0.593   0.069
skin               -2.688   0.045


In [30]:
compared = np.vstack((results[:,0], results[:,3].astype(float)-results_not_norm[:,3].astype(float), results[:,4].astype(float)-results_not_norm[:,4].astype(float))).T

In [39]:
print(compared[:,0][np.where((compared[:,1].astype(float) < 0) & (compared[:,2].astype(float) > 0))])

['shuttle' 'htru2' 'buzz' 'sensorless' 'cardio' 'skin']


Also normalizing covertype, malware because 

## Post normalization stats

In [3]:
import libdatasets; reload(libdatasets); from libdatasets import *

In [4]:
matrix = {
    # Dataset fetchers should cache if possible
    # Lambda wrapper required for function to be pickleable (sent to other threads via joblib)
    "datasets": [
        # Text classification
        
        ("newsgroups", wrap(newsgroups, None)),
        ("rcv1", wrap(rcv1, None)),
        ("webkb", wrap(webkb, None)),
        ("spamassassin", wrap(spamassassin, None)),
        
        # Image classification
        ("cifar10", wrap(cifar10, None)),
        ("quickdraw", wrap(quickdraw, None)),
        ("avila", wrap(avila, None)),
        
        # General
        ("shuttle", wrap(shuttle, None)),
        ("covertype", wrap(covertype, None)),
        ("smartphone", wrap(smartphone, None)),
        ("htru2", wrap(htru2, None)),
        ("malware", wrap(malware, None)),
        ("bidding", wrap(bidding, None)),
        ("swarm", wrap(swarm, None)),
        ("bank", wrap(bank, None)),
        ("buzz", wrap(buzz, None)),
        ("sensorless", wrap(sensorless, None)),
        ("dota2", wrap(dota2, None)),
        
        # Bio
        ("abalone", wrap(abalone, None)),
        ("splice", wrap(splice, None)),
        ("anuran", wrap(anuran, None)),
        
        # Medical
        ("cardio", wrap(cardio, None)),
        ("skin", wrap(skin, None)),
        
    ],
    "dataset_mutators": {
        "none": (lambda *x, **kwargs: x),
    },
    "methods": [
        ("uncertainty", partial(uncertainty_stop, n_instances=10)),
    ],
    "models": [
        "svm-linear"
    ],
    "meta": {
        "dataset_size": 1000,
        "labelled_size": 10,
        "test_size": 0.5,
        "n_runs": 10,
        "ret_classifiers": True,
        "ensure_y": True,
        "stop_info": True,
        "aggregate": False,
        "stop_function": ("len1000", lambda learner: learner.y_training.shape[0] >= 1000),
        "pool_subsample": 1000
    }
}

In [5]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from time import monotonic
results_p = []
for (name, dataset) in matrix['datasets']:
    X, y = dataset()
    
    idx = np.random.choice(X.shape[0], X.shape[0], replace=False)
    X = X[idx]
    y = y[idx]

    
    clf = SVC(kernel='linear', probability=True)

    y_short = y[:10]
    X_short = X[:10]
    for klass in np.unique(y):
        if klass not in y_short:
            idx = np.where(y==klass)[0][0]
            y_short = np.concatenate((y_short, [y[idx]]), axis=0)
            if isinstance(X_short, scipy.sparse.csr_matrix):
                X_short = scipy.sparse.vstack((
                    X_short, 
                    X[idx]
                ))
            else:
                X_short = np.concatenate((X_short, [X[idx]]), axis=0)
    #print(np.unique(y_short))

    clf.fit(X_short, y_short)
    start = clf.score(X[-1000:], y[-1000:])
    start_t = monotonic()
    clf.fit(X[:1000], y[:1000])
    time = monotonic() - start_t
    final = clf.score(X[-1000:], y[-1000:])
    results_p.append([name, start, final, time])

In [6]:
results_p = np.array(results_p)
print(tabulate(np.hstack((results_p, np.expand_dims(results_p[:,2].astype(float)-results_p[:,1].astype(float), axis=1))), headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))

Name            Initial acc    Final acc    Time    Diff
------------  -------------  -----------  ------  ------
newsgroups            0.051        0.434   8.391   0.383
rcv1                  0.461        0.908   2.625   0.447
webkb                 0.465        0.875   3.063   0.41
spamassassin          0.663        0.962   2.234   0.299
cifar10               0.15         0.29    7.969   0.14
quickdraw             0.491        0.776   0.625   0.285
avila                 0.272        0.61    0.219   0.338
shuttle               0.856        0.977   0.015   0.121
covertype             0.435        0.712   0.328   0.277
smartphone            0.526        0.936   0.25    0.41
htru2                 0.97         0.98    0.016   0.01
malware               0.899        0.975   0.375   0.076
bidding               0.897        0.99    0.063   0.093
swarm                 0.573        0.943   2.953   0.37
bank                  0.87         0.888   0.391   0.018
buzz                  0.687        0

In [None]:
rcv1, webkb, sensorless
1, 2, 14
4G, 612M, 612M

In [None]:
poetry run python slurm.py Covertype1d 8 1 0-1; poetry run python slurm.py Smartphone1d 9 1 0-1;

In [None]:
poetry run python nesi.py 8 1 0-1 --dry-run; poetry run python nesi.py 9 1 0-1 --dry-run; poetry run python nesi.py 19 1 0-1 --dry-run;