In [1]:
import warnings
from importlib import reload
from functools import partial, lru_cache
import itertools
from time import monotonic

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from joblib import delayed
from modAL import batch
from art.metrics import empirical_robustness
from art.attacks.evasion import *
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tabulate import tabulate
from art.estimators.classification.scikitlearn import ScikitlearnSVC
from sklearn.metrics.pairwise import paired_distances, euclidean_distances
import scipy
from tvregdiff.tvregdiff import TVRegDiff
from tabulate import tabulate

import ipywidgets as widgets
from ipywidgets import interact

import libactive
import libadversarial
import libstop
from libactive import MyActiveLearner, active_split
from libadversarial import adversarial, uncertainty, random_batch, uncertainty_stop
from libutil import ProgressParallel
from libdatasets import *
import librun
from librun import run

In [2]:
# Development helper: reload libdatasets and repeat the star import so that names
# bound by the first import cell are rebound to the freshly reloaded module.
import libdatasets; reload(libdatasets); from libdatasets import *

In [35]:
# Experiment configuration. In the cells visible here only matrix['datasets'] is read
# (by func); the remaining keys are presumably consumed by librun.run — TODO confirm.
matrix = {
    # Dataset fetchers should cache if possible
    # Each entry is (name, zero-argument loader); func calls the loader and unpacks (X, y).
    # Loaders are built with wrap(...) so that, per the original note, they can be pickled
    # and sent to joblib workers.
    # NOTE(review): wrap is not defined in this notebook (presumably from libdatasets) — confirm.
    "datasets": [
        #("newsgroups", wrap(newsgroups, None)),
        ("rcv1", wrap(rcv1, None)),
        ("webkb", wrap(webkb, None)),
        ("spamassassin", wrap(spamassassin, None)),
        ("avila", wrap(avila, None)),
        ("smartphone", wrap(smartphone, None)),
        ("swarm", wrap(swarm, None)),
        ("sensorless", wrap(sensorless, None)),
        ("splice", wrap(splice, None)),
        ("anuran", wrap(anuran, None)),        
    ],
    "dataset_mutators": {
        # Identity mutator: returns its positional arguments unchanged and ignores keywords.
        "none": (lambda *x, **kwargs: x),
    },
    "methods": [
        # uncertainty_stop with n_instances fixed to 10 (presumably the query batch size — confirm).
        ("uncertainty", partial(uncertainty_stop, n_instances=10)),
    ],
    "models": [
        "svm-linear"
    ],
    "meta": {
        "dataset_size": 1000,
        "labelled_size": 10,
        # Per-dataset test sizes; "*" is presumably the fallback for datasets not listed — confirm.
        "test_size": {
            "newsgroups_faith": 500,
            "newsgroups_graphics": 500,
            "newsgroups_hardware": 500,
            "newsgroups_sports_crypto": 500,
            "*": 0.5
        },
        "n_runs": 10,
        "ret_classifiers": True,
        "ensure_y": True,
        "stop_info": True,
        "aggregate": False,
        # (label, predicate): the predicate is true once learner.y_training has at least 1000 rows.
        "stop_function": ("len1000", lambda learner: learner.y_training.shape[0] >= 1000),
        "pool_subsample": 1000
    }
}


In [36]:
# Development helper: reload libactive and rebind active_split to the reloaded implementation.
reload(libactive); from libactive import active_split

In [37]:
# Cell-local imports for the label-budget experiment defined below.
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from time import monotonic
import operator
import nesi_bias
import traceback
# Reload the project modules first, then (re)import the names from them, so the
# bindings used below come from the reloaded code rather than a stale copy.
reload(nesi_bias)
reload(libactive); from libactive import active_split
from nesi_bias import bias
# Load settings from a .env file via python-dotenv.
# NOTE(review): which variables are required is not visible in this notebook — confirm.
from dotenv import load_dotenv; load_dotenv()

def func(amount):
    """Print a table comparing a seed-only SVM with a 1000-row SVM under a biased split.

    For every ``(name, loader)`` pair in ``matrix['datasets']`` the data is split with
    ``active_split`` using ``partial(bias, amount=amount)`` as the mutator. A linear
    ``SVC`` is fitted on the labelled seed and scored on the test split ("Initial acc"),
    then refitted on the first 1000 rows of labelled + unlabelled data and scored again
    ("Final acc"). "Time" is the wall-clock duration of the second fit in seconds and
    "Diff" is final minus initial accuracy.

    Datasets whose split fails, or whose unlabelled pool has fewer than 1490 rows, are
    reported as an all-zero row; the reason is emitted through ``warnings.warn`` so that
    such rows can be told apart from a genuine zero accuracy.

    Parameters
    ----------
    amount
        Forwarded as the ``amount`` keyword of ``bias``.
        NOTE(review): the meaning and valid range are defined in nesi_bias — confirm.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    # NOTE(review): the origin of the 1490 threshold is not visible here; kept as-is.
    min_pool_rows = 1490
    train_rows = 1000

    rows = []
    for name, dataset in matrix['datasets']:
        # Load once and reuse the arrays for the split instead of fetching the dataset twice.
        X, y = dataset()

        try:
            X_labelled, X_unlabelled, Y_labelled, Y_oracle, X_test, y_test = active_split(
                X, y, mutator=partial(bias, amount=amount), test_size=0.5, labeled_size=10, shuffle=True, random_state=np.random
            )
            assert X_unlabelled.shape[0] >= min_pool_rows, "unlabelled pool too small"
        except Exception as e:
            # Best effort: keep the dataset in the table as zeros, but say why it was skipped.
            warnings.warn(f"{name} (amount={amount}): skipped, could not split: {e!r}")
            rows.append([name, 0, 0, 0, 0])
            continue

        # Rebuild the training pool in split order: labelled seed first, then the unlabelled pool.
        if scipy.sparse.issparse(X_labelled):
            # Request CSR explicitly so the stacked matrix supports the row slicing below
            # whatever sparse format the split produced.
            X = scipy.sparse.vstack((X_labelled, X_unlabelled), format="csr")
        else:
            X = np.concatenate((X_labelled, X_unlabelled))
        y = np.concatenate((Y_labelled, Y_oracle))

        clf = SVC(kernel='linear', probability=True)

        clf.fit(X_labelled, Y_labelled)
        initial_acc = clf.score(X_test, y_test)

        # Only the refit on the larger training set is timed.
        fit_started = monotonic()
        clf.fit(X[:train_rows], y[:train_rows])
        fit_seconds = monotonic() - fit_started
        final_acc = clf.score(X_test, y_test)

        rows.append([name, initial_acc, final_acc, fit_seconds, final_acc - initial_acc])

    # Rows stay native Python values: no round trip through a string array, and an
    # empty dataset list prints just the header instead of raising IndexError.
    print(tabulate(rows, headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))

In [38]:
# Sweep the bias amount; each value prints its own results table via func.
amounts = [0.1,0.2,0.3,0.4,0.5]
for amount in amounts:
    print(f"amount={amount}")
    func(amount)
    print()  # blank line separating consecutive tables

amount=0.1
Name            Initial acc    Final acc    Time      Diff
------------  -------------  -----------  ------  --------
rcv1              0.474062      0.879788   2.687  0.405726
webkb             0             0          0      0
spamassassin      0             0          0      0
avila             0.0959364     0.526356   0.282  0.43042
smartphone        0             0          0      0
swarm             0.51016       0.943538   4.469  0.433378
sensorless        0.155426      0.799761   0.312  0.644334
splice            0             0          0      0
anuran            0.673708      0.948305   0.078  0.274597

amount=0.2
Name            Initial acc    Final acc    Time       Diff
------------  -------------  -----------  ------  ---------
rcv1               0.473423     0.884803   2.984  0.41138
webkb              0            0          0      0
spamassassin       0            0          0      0
avila              0.346751     0.526164   0.281  0.179413
smartphone      

**Datasets that could still be split under the bias mutator (the non-zero rows above): rcv1, avila, swarm, sensorless, anuran**

In [39]:
def func(amount=None):
    """Print a table comparing a seed-only SVM with a 1000-row SVM for every dataset.

    For every ``(name, loader)`` pair in ``matrix['datasets']`` the data is split with
    ``active_split``. A linear ``SVC`` is fitted on the labelled seed and scored on the
    test split ("Initial acc"), then refitted on the first 1000 rows of labelled +
    unlabelled data and scored again ("Final acc"). "Time" is the wall-clock duration of
    the second fit in seconds and "Diff" is final minus initial accuracy.

    Datasets whose split fails, or whose unlabelled pool has fewer than 1490 rows, are
    reported as an all-zero row; the reason is emitted through ``warnings.warn`` so that
    such rows can be told apart from a genuine zero accuracy.

    Parameters
    ----------
    amount : optional
        ``None`` (the default) splits the data unmodified, using an identity mutator.
        Any other value is forwarded as ``partial(bias, amount=amount)``, so this
        definition also serves callers written for the earlier ``func(amount)``.
        NOTE(review): the meaning and valid range of ``amount`` are defined in
        nesi_bias — confirm.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    def _identity(*args, **kwargs):
        # Baseline mutator: hand the positional arguments back untouched.
        return args

    mutator = _identity if amount is None else partial(bias, amount=amount)
    run_label = "" if amount is None else f" (amount={amount})"

    # NOTE(review): the origin of the 1490 threshold is not visible here; kept as-is.
    min_pool_rows = 1490
    train_rows = 1000

    rows = []
    for name, dataset in matrix['datasets']:
        # Load once and reuse the arrays for the split instead of fetching the dataset twice.
        X, y = dataset()

        try:
            X_labelled, X_unlabelled, Y_labelled, Y_oracle, X_test, y_test = active_split(
                X, y, mutator=mutator, test_size=0.5, labeled_size=10, shuffle=True, random_state=np.random
            )
            assert X_unlabelled.shape[0] >= min_pool_rows, "unlabelled pool too small"
        except Exception as e:
            # Best effort: keep the dataset in the table as zeros, but say why it was skipped.
            warnings.warn(f"{name}{run_label}: skipped, could not split: {e!r}")
            rows.append([name, 0, 0, 0, 0])
            continue

        # Rebuild the training pool in split order: labelled seed first, then the unlabelled pool.
        if scipy.sparse.issparse(X_labelled):
            # Request CSR explicitly so the stacked matrix supports the row slicing below
            # whatever sparse format the split produced.
            X = scipy.sparse.vstack((X_labelled, X_unlabelled), format="csr")
        else:
            X = np.concatenate((X_labelled, X_unlabelled))
        y = np.concatenate((Y_labelled, Y_oracle))

        clf = SVC(kernel='linear', probability=True)

        clf.fit(X_labelled, Y_labelled)
        initial_acc = clf.score(X_test, y_test)

        # Only the refit on the larger training set is timed.
        fit_started = monotonic()
        clf.fit(X[:train_rows], y[:train_rows])
        fit_seconds = monotonic() - fit_started
        final_acc = clf.score(X_test, y_test)

        rows.append([name, initial_acc, final_acc, fit_seconds, final_acc - initial_acc])

    # Rows stay native Python values: no round trip through a string array, and an
    # empty dataset list prints just the header instead of raising IndexError.
    print(tabulate(rows, headers=["Name", "Initial acc", "Final acc", "Time", "Diff"]))

In [40]:
# Baseline run: called without arguments, so the split uses the identity mutator (no bias).
func()

Name            Initial acc    Final acc    Time      Diff
------------  -------------  -----------  ------  --------
rcv1               0.474798     0.903478   2.641  0.42868
webkb              0.451905     0.869048   3.562  0.417143
spamassassin       0.752148     0.965631   2.985  0.213483
avila              0.12258      0.5507     0.281  0.42812
smartphone         0.566984     0.943448   0.313  0.376464
swarm              0.694704     0.941123   4      0.246419
sensorless         0.16616      0.879132   0.328  0.712972
splice             0.590596     0.9279     0.766  0.337304
anuran             0.738188     0.935242   0.062  0.197054
