In [2]:
from libactive import active_split
from sklearn.svm import SVC
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.utils import check_random_state
import scipy
from libutil import out_dir
import os
import libdatasets
from dotenv import load_dotenv; load_dotenv()

True

In [27]:
def _summarise_runs(results):
    """Aggregate cached per-run metrics into ``[min, mean, max]`` triples.

    Args:
        results: Mapping of run id to a 3-element list of metric values
            (as stored by ``run_passive``: accuracy, F1, ROC AUC).

    Returns:
        A list of three ``[min, mean, max]`` lists, one per metric, computed
        over every run present in ``results``; ``None`` when ``results`` is
        empty (there is nothing to aggregate).
    """
    per_run = list(results.values())
    if not per_run:
        return None
    return [
        [
            np.min([scores[i] for scores in per_run]),
            np.mean([scores[i] for scores in per_run]),
            np.max([scores[i] for scores in per_run]),
        ]
        for i in range(3)
    ]


def run_passive(datasets, runs):
    """Train a linear SVC on all non-test data and report summary metrics.

    For the selected dataset, every run id in ``runs`` that is not already in
    the pickle cache is evaluated: the data is split with ``active_split``
    (seeded by the run id), the labelled and unlabelled pools are merged back
    together, a linear-kernel ``SVC`` is fitted on the merged pool and scored
    on the held-out test split. Results are cached to
    ``<out_dir()>/passive/<name>.pickle`` after every run.

    Args:
        datasets: Sequence of ``(name, loader)`` pairs, where ``loader()``
            returns ``(X, y)``.
        runs: Iterable of run ids; each id is used both as the cache key and
            as the random seed of the split.

    Returns:
        ``[[min, mean, max], ...]`` for accuracy, F1 and ROC AUC (in that
        order) over all cached runs of the first processed dataset, or
        ``None`` if no dataset was processed.
    """
    # Materialise once: `runs` is iterated twice below and could be a one-shot iterator.
    runs = list(runs)

    # NOTE(review): skipping the first entry and restricting to 'anuran' look
    # like leftover debugging filters; kept as-is to preserve which dataset runs.
    for name, dataset in datasets[1:]:
        if name != 'anuran':
            continue
        print(name)
        fname = f"{out_dir()}{os.path.sep}passive{os.path.sep}{name}.pickle"
        try:
            with open(fname, "rb") as f:
                # NOTE: pickle.load can execute arbitrary code; only trusted,
                # locally produced cache files should live at this path.
                results = pickle.load(f)
                print(results)
                if all(run in results for run in runs):
                    return _summarise_runs(results)
        except (FileNotFoundError, EOFError):
            # Missing or truncated cache: start from scratch.
            results = {}

        for run in runs:
            print(f"  {run}")
            if run in results:
                continue

            X, y = dataset()

            X_labelled, X_unlabelled, y_labelled, y_oracle, X_test, y_test = active_split(
                X, y, labeled_size=10, test_size=0.5, random_state=check_random_state(run), ensure_y=True
            )
            # Passive baseline: train on the whole non-test pool, not just the labelled seed set.
            if isinstance(X_labelled, scipy.sparse.csr_matrix):
                X = scipy.sparse.vstack((X_labelled, X_unlabelled))
            else:
                X = np.concatenate((X_labelled, X_unlabelled))
            y = np.concatenate((y_labelled, y_oracle))

            clf = SVC(probability=True, kernel='linear')
            clf.fit(X, y)
            predicted = clf.predict(X_test)
            predict_proba = clf.predict_proba(X_test)
            # presumably ensure_y=True guarantees every class appears in the
            # labelled seed set - TODO confirm against active_split.
            unique_labels = np.unique(y_labelled)
            # The original condition also evaluated len(unique_labels.shape[0]),
            # i.e. len() of an int, which raised TypeError for binary problems.
            is_multiclass = len(unique_labels) > 2

            if is_multiclass:
                roc_auc = roc_auc_score(
                    y_test, predict_proba, multi_class="ovr"
                )
            else:
                roc_auc = roc_auc_score(
                    y_test, predict_proba[:, 1]
                )

            results[run] = [
                accuracy_score(y_test, predicted),
                f1_score(
                    y_test,
                    predicted,
                    average="micro" if is_multiclass else "binary",
                    pos_label=1 if is_multiclass else unique_labels[1],
                ),
                roc_auc
            ]

            # Persist after every run so an interrupted job can resume from the cache.
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            with open(fname, "wb") as f:
                pickle.dump(results, f)

        # Return the same summary the cached path returns, instead of falling
        # through and returning None after doing the actual work.
        return _summarise_runs(results)

In [28]:
from nesi_noise import matrix

In [29]:
# Evaluate the passive baseline for run ids 0-9; the returned summary is shown as the cell output.
run_passive(datasets=matrix['datasets'], runs=range(10))

anuran
{0: [0.9663702056698166, 0.9663702056698166, 0.9962899788694562], 1: [0.9677598665925514, 0.9677598665925514, 0.9871650626295676], 2: [0.9644246803779878, 0.9644246803779878, 0.994715588866147], 3: [0.9635908838243469, 0.9635908838243469, 0.9960201815025295], 4: [0.9647026125625348, 0.9647026125625348, 0.9953732278752536], 5: [0.9694274596998332, 0.9694274596998332, 0.995087486448907], 6: [0.9608115619788772, 0.9608115619788772, 0.9958793817354522], 7: [0.9680377987770984, 0.9680377987770984, 0.9953472068189424], 8: [0.9638688160088938, 0.9638688160088938, 0.9974182937637959], 9: [0.9638688160088938, 0.9638688160088938, 0.9967435733260384]}


[[0.9608115619788772, 0.9652862701500833, 0.9694274596998332],
 [0.9608115619788772, 0.9652862701500833, 0.9694274596998332],
 [0.9871650626295676, 0.9950039981836089, 0.9974182937637959]]