In [9]:
from libactive import active_split
from sklearn.svm import SVC
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.utils import check_random_state
from joblib import Parallel, delayed
import scipy
from libutil import out_dir
import os
import libdatasets
from dotenv import load_dotenv; load_dotenv()

True

In [22]:
def eval_one(results, name, dataset, run, fname):
    """Fit a linear SVC on the whole training pool for one run and score it.

    Parameters
    ----------
    results : dict
        Already-computed results keyed by run. If ``run`` is present, its
        cached entry is returned and nothing is trained.
    name
        Dataset name. Unused here; kept so the call signature stays stable.
    dataset : callable
        Zero-argument callable returning ``(X, y)``.
    run
        Run identifier; also seeds the split via ``check_random_state(run)``.
    fname
        Cache file path. Unused here; kept so the call signature stays stable.

    Returns
    -------
    tuple
        ``(run, [accuracy, f1, roc_auc])`` measured on the test split.
    """
    print(f"  {run}")
    if run in results:
        return (run, results[run])

    X, y = dataset()

    X_labelled, X_unlabelled, y_labelled, y_oracle, X_test, y_test = active_split(
        X, y, labeled_size=10, test_size=0.5,
        random_state=check_random_state(run), ensure_y=True,
    )

    # Passive baseline: train on the labelled seed set *and* the pool the
    # active learner would otherwise have to query.
    if scipy.sparse.issparse(X_labelled):
        # Any sparse format is accepted, not only CSR.
        X_train = scipy.sparse.vstack((X_labelled, X_unlabelled), format="csr")
    else:
        X_train = np.concatenate((X_labelled, X_unlabelled))
    y_train = np.concatenate((y_labelled, y_oracle))

    clf = SVC(probability=True, kernel='linear')
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    predict_proba = clf.predict_proba(X_test)

    # Use the classes the model was actually fitted on: they define the column
    # order of predict_proba, whereas the 10-sample labelled seed set is not
    # guaranteed by this function to contain every class.
    classes = clf.classes_
    multiclass = len(classes) > 2

    if multiclass:
        roc_auc = roc_auc_score(y_test, predict_proba, multi_class="ovr")
    else:
        # Column 1 holds the probability of classes[1], the positive class.
        roc_auc = roc_auc_score(y_test, predict_proba[:, 1])

    f1 = f1_score(
        y_test,
        predicted,
        average="micro" if multiclass else "binary",
        # pos_label only matters for the binary average.
        pos_label=1 if multiclass else classes[1],
    )

    result = [accuracy_score(y_test, predicted), f1, roc_auc]

    return (run, result)
    
def run_passive(datasets, runs, skip=("newsgroups",), n_jobs=2):
    """Evaluate the passive baseline on each dataset, caching results on disk.

    Parameters
    ----------
    datasets : iterable
        ``(name, dataset)`` pairs; ``dataset`` is handed to ``eval_one``.
    runs : sized iterable
        Run identifiers to evaluate. It is iterated more than once and
        ``len(runs)`` is taken, e.g. ``range(10)``.
    skip : collection of str, optional
        Dataset names to leave out. Defaults to skipping ``'newsgroups'``.
    n_jobs : int, optional
        Upper bound on the number of joblib workers (positive integer).

    Returns
    -------
    dict
        ``{name: {run: result}}`` for every dataset that was not skipped.
    """
    all_results = {}
    for name, dataset in datasets:
        if name in skip:
            continue
        print(name)
        fname = f"{out_dir()}{os.path.sep}passive{os.path.sep}{name}.pickle"
        try:
            # NOTE: pickle.load can execute arbitrary code; this must only
            # ever read cache files this notebook wrote itself.
            with open(fname, "rb") as f:
                results = pickle.load(f)
        except (FileNotFoundError, EOFError):
            # No cache yet, or a truncated one: start from scratch.
            results = {}
        else:
            print(f"Have results for {name}")
            if all(run in results for run in runs):
                all_results[name] = results
                continue

        # Never ask joblib for 0 workers (it raises) when `runs` is empty.
        workers = max(1, min(n_jobs, len(runs)))
        r = Parallel(n_jobs=workers)(
            delayed(eval_one)(results, name, dataset, run, fname) for run in runs
        )
        for run, result in r:
            results[run] = result

        # Create the cache directory first so a fresh checkout does not lose
        # the freshly computed results to a FileNotFoundError on write.
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        with open(fname, "wb") as f:
            pickle.dump(results, f)
        all_results[name] = results
    return all_results

In [11]:
from nesi_noise import matrix

In [12]:
def key(dataset):
    """Sort key: the first dimension of the first item the loader returns.

    ``dataset`` is a ``(name, loader)`` pair. The loader is called, so the
    data is materialised just to read its size.
    """
    loader = dataset[1]
    first_item = loader()[0]
    return first_item.shape[0]

# Order the datasets from smallest to largest according to key().
datasets = list(matrix['datasets'])
datasets.sort(key=key)

In [10]:
# Print the dataset names in their current order.
for name, _ in datasets:
    print(name)

splice
webkb
spamassassin
anuran
smartphone
newsgroups
avila
swarm
sensorless
rcv1


In [13]:
# Drop rcv1, the last entry in the listing printed above. Filtering by name
# instead of slicing off the tail keeps this cell idempotent: re-running it
# does not remove a further dataset each time.
datasets = [entry for entry in datasets if entry[0] != 'rcv1']

In [14]:
# Print the dataset names again to check which ones remain.
for name, _ in datasets:
    print(name)

splice
webkb
spamassassin
anuran
smartphone
newsgroups
avila
swarm
sensorless


In [23]:
# Evaluate runs 0-9 for every remaining dataset; the returned dict is the
# cell's displayed output.
run_passive(datasets, runs=range(10))

splice
Have results for splice
webkb
Have results for webkb
spamassassin
Have results for spamassassin
anuran
Have results for anuran
smartphone
Have results for smartphone
avila
Have results for avila
swarm
Have results for swarm
sensorless
Have results for sensorless


{'splice': {0: [0.9172413793103448, 0.9172413793103448, 0.9846323734844935],
  1: [0.9178683385579938, 0.9178683385579938, 0.984509108979941],
  2: [0.915987460815047, 0.915987460815047, 0.981412330643574],
  3: [0.9197492163009404, 0.9197492163009404, 0.9840467339806972],
  4: [0.9172413793103448, 0.9172413793103448, 0.9819332179533938],
  5: [0.9178683385579938, 0.9178683385579938, 0.9858522249646327],
  6: [0.9210031347962383, 0.9210031347962383, 0.9837752293884711],
  7: [0.9178683385579938, 0.9178683385579938, 0.9826761646715285],
  8: [0.9322884012539185, 0.9322884012539185, 0.9865827065297861],
  9: [0.903448275862069, 0.903448275862069, 0.9819747135813492]},
 'webkb': {0: [0.8904761904761904, 0.8904761904761904, 0.9749807057688898],
  1: [0.8980952380952381, 0.8980952380952381, 0.9745932229742071],
  2: [0.8919047619047619, 0.8919047619047619, 0.9765969663419309],
  3: [0.8990476190476191, 0.8990476190476191, 0.9756546514689446],
  4: [0.8919047619047619, 0.8919047619047619, 0.

In [None]:
k