# Active Learning on Real Datasets

In [4]:
import warnings
from importlib import reload
from functools import partial

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
from joblib import delayed
from modAL import batch
from art.metrics import empirical_robustness

from ipynb.fs.defs import Bias
from ipynb.fs.defs.Datasets import generateData_twoPills_2D, generateData_twoPills_noNoise_2D, plot_dataset_2D

import libactive
import libadversarial
from libactive import MyActiveLearner, active_split
from libadversarial import fgm, deepfool, uncertainty_id, random_batch
from libutil import ProgressParallel

In [5]:
def banknote():
    """Load the UCI banknote-authentication data plus a bias-indicator column.

    The CSV is downloaded from the UCI repository, so network access is
    required.

    Returns:
        X: array with the four original feature columns followed by an
            indicator column that is 1 where the first feature exceeds 0.32
            and 0 otherwise.
        y: array with the values of column 4, which is used as the label.
    """
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt'
    dataset = pd.read_csv(url, header=None)
    y = dataset[4].to_numpy()
    features = dataset.drop([4], axis=1)
    # Vectorised threshold test. The previous per-row comprehension called
    # `dataset.to_numpy()` once per row, which made loading quadratic.
    isInB = (features[0].to_numpy() > 0.32).reshape(-1, 1)
    X = np.append(features.to_numpy(), isInB, axis=1)

    return X, y

def bias_banknote(data, labels):
    """Keep only the samples whose bias indicator equals 1.

    Args:
        data: 2-D array whose last column is the indicator flag, laid out
            as returned by `banknote`.
        labels: array of labels aligned with the rows of `data`.

    Returns:
        (X, y): the selected rows of `data` without the indicator column,
        and the matching labels.
    """
    keep = data[:, -1] == 1
    # Drop the flag column by position. The previous `len(data[0])` lookup
    # raised IndexError when `data` had no rows.
    X = data[keep][:, :-1]
    y = labels[keep]
    return X, y

def haberman():
    """Load the UCI Haberman's Survival data set.

    The CSV is downloaded from the UCI repository, so network access is
    required. The previous version pointed at the banknote-authentication
    file (a copy-paste of `banknote`), so the "haberman" entry was really a
    second copy of the banknote data.

    Returns:
        X: array with the first three columns of the file (features).
        y: array with the values of column 3, which is used as the label.
    """
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data'
    dataset = pd.read_csv(url, header=None)

    # NOTE(review): haberman.data is expected to have four columns with the
    # survival status last - confirm against the UCI description.
    y = dataset[3].to_numpy()
    X = dataset.drop([3], axis=1).to_numpy()

    return X, y

def digits():
    """Return scikit-learn's digits data with every image flattened to a row.

    Returns:
        (data, target): `data` has one row per image, produced by reshaping
        `images` to `(n_samples, -1)`; `target` is the data set's label array.
    """
    bunch = datasets.load_digits()
    flat_images = bunch.images.reshape((len(bunch.images), -1))
    return flat_images, bunch.target

def abalone():
    """Load the UCI abalone data plus a bias-indicator column.

    The CSV is downloaded from the UCI repository, so network access is
    required.

    Returns:
        X: array with all columns except 0 and 6, followed by an indicator
            column that is 1 where column 6 is below 0.144 and 0 otherwise.
        y: array with the values of column 0, which is used as the label.
    """
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
    dataset = pd.read_csv(url, header=None)
    y = dataset[0].to_numpy()
    # Vectorised threshold test. The previous per-row comprehension converted
    # the whole (object-dtype) frame to an array once per row.
    isInB = (dataset[6].to_numpy() < 0.144).reshape(-1, 1)
    X = dataset.drop([0, 6], axis=1).to_numpy()
    X = np.append(X, isInB, axis=1)

    return X, y

def bias_abalone(data, labels):
    """Keep only the samples whose bias indicator equals 1.

    Args:
        data: 2-D array whose last column is the indicator flag, laid out
            as returned by `abalone`.
        labels: array of labels aligned with the rows of `data`.

    Returns:
        (X, y): the selected rows of `data` without the indicator column,
        and the matching labels.
    """
    keep = data[:, -1] == 1
    # Drop the flag column by position. The previous `len(data[0])` lookup
    # raised IndexError when `data` had no rows.
    X = data[keep][:, :-1]
    y = labels[keep]
    return X, y

def car():
    """Load the UCI car-evaluation data plus a bias-indicator column.

    The CSV is downloaded from the UCI repository, so network access is
    required. The categorical values in the six feature columns are mapped
    to numbers through `toNum`, then every column is converted with
    `pd.to_numeric`.

    Returns:
        X: array with the six numeric feature columns followed by an
            indicator column that is 1 where column 3 is greater than 3
            and 0 otherwise.
        y: array with the values of column 6, which is used as the label.
    """
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
    toNum = {
        'low': 1,
        'med': 2,
        'high': 3,
        'vhigh': 4,
        '5more': 5,
        'more': 5,
        'small': 1,
        'big': 3,
    }
    dataset = pd.read_csv(url, header=None)
    y = dataset[6].to_numpy()
    dataset = dataset.drop([6], axis=1)
    dataset = dataset.replace({column: toNum for column in range(6)})
    dataset = dataset.apply(pd.to_numeric)

    # Vectorised threshold test. The previous per-row comprehension called
    # `dataset.to_numpy()` once per row, which made loading quadratic.
    isInB = (dataset[3].to_numpy() > 3).reshape(-1, 1)
    X = np.append(dataset.to_numpy(), isInB, axis=1)

    return X, y

def bias_car(data, labels):
    """Keep only the samples whose bias indicator equals 1.

    Args:
        data: 2-D array whose last column is the indicator flag, laid out
            as returned by `car`.
        labels: array of labels aligned with the rows of `data`.

    Returns:
        (X, y): the selected rows of `data` without the indicator column,
        and the matching labels.
    """
    keep = data[:, -1] == 1
    # Drop the flag column by position. The previous `len(data[0])` lookup
    # raised IndexError when `data` had no rows.
    X = data[keep][:, :-1]
    y = labels[keep]
    return X, y

def cardio():
    """Load a random 10% sample of the local cardio training data.

    Reads `Imitate/Datasets/cardio_train.csv` (semicolon separated, first
    column used as index) and keeps the `age` and `weight` columns, with
    `age` divided by 365.25 (presumably converting days to years - confirm).

    Returns:
        X: array with the columns age, weight and an indicator column that is
            1 for every row.
        y: array with the matching values of the `cardio` column.
    """
    frame = pd.read_csv('Imitate/Datasets/cardio_train.csv', header=0, sep=';', index_col=0)
    y = frame['cardio'].to_numpy()

    features = frame[['age', 'weight']]
    features = features.assign(age=features.age / 365.25)

    # Every sample is flagged: the indicator column is all ones.
    flag_column = np.ones((len(features), 1), dtype=int)
    X = np.append(features.to_numpy(), flag_column, axis=1)

    # Keep a random 10% of the rows; the split is not seeded.
    X, _, y, _ = train_test_split(X, y, test_size=0.90)

    return X, y

def bias_cardio(data, labels):
    """Keep only the samples whose bias indicator equals 1.

    Args:
        data: 2-D array whose last column is the indicator flag, laid out
            as returned by `cardio`.
        labels: array of labels aligned with the rows of `data`.

    Returns:
        (X, y): the selected rows of `data` without the indicator column,
        and the matching labels.
    """
    keep = data[:, -1] == 1
    # Drop the flag column by position. The previous `len(data[0])` lookup
    # raised IndexError when `data` had no rows.
    X = data[keep][:, :-1]
    y = labels[keep]
    return X, y

def shuttle(dataset_size=58000):
    """Load a random 20% sample of the local shuttle data plus a bias column.

    Reads `Imitate/Datasets/shuttle.trn` and `Imitate/Datasets/shuttle.tst`
    and stacks them into one array.

    Args:
        dataset_size: unused; kept so existing callers keep working.

    Returns:
        X: array with every column except the last, followed by an indicator
            column that is 1 where the first column exceeds 54.5 and 0
            otherwise.
        y: boolean array, True where the last column equals 1.
    """
    # The separator is a regex, so it must be a raw string ('\s' in a plain
    # string is an invalid escape sequence). Regex separators are only
    # supported by the python engine; requesting it explicitly avoids the
    # ParserWarning pandas emits when it has to fall back.
    read_options = dict(header=None, sep=r'\s', engine='python')
    dataset1 = pd.read_csv('Imitate/Datasets/shuttle.trn', **read_options)
    dataset2 = pd.read_csv('Imitate/Datasets/shuttle.tst', **read_options)
    dataset = np.concatenate((dataset1, dataset2))

    y = dataset[:, -1] == 1
    isInB = (dataset[:, 0] > 54.5).reshape(-1, 1)
    X = np.append(dataset[:, :-1], isInB, axis=1)

    # Keep a random 20% of the rows; the split is not seeded.
    X, _, y, _ = train_test_split(X, y, test_size=0.8)

    return X, y

def bias_shuttle(data, labels):
    """Keep only the samples whose bias indicator equals 1.

    Args:
        data: 2-D array whose last column is the indicator flag, laid out
            as returned by `shuttle`.
        labels: array of labels aligned with the rows of `data`.

    Returns:
        (X, y): the selected rows of `data` without the indicator column,
        and the matching labels.
    """
    keep = data[:, -1] == 1
    # Drop the flag column by position. The previous `len(data[0])` lookup
    # raised IndexError when `data` had no rows.
    X = data[keep][:, :-1]
    y = labels[keep]
    return X, y

def skin(dataset_size=4177):
    """Load a random 5% sample of the UCI Skin/NonSkin data plus a bias column.

    The tab-separated file is downloaded from the UCI repository, so network
    access is required.

    Args:
        dataset_size: unused; kept so existing callers keep working.

    Returns:
        X: array with columns 0 and 1 followed by an indicator column that
            is 1 where column 2 is at most 170.5 and 0 otherwise.
        y: array with the matching values of column 3, used as the label.
    """
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00229/Skin_NonSkin.txt'
    dataset = pd.read_csv(url, header=None, sep='\t')
    y = dataset[3].to_numpy()
    # Vectorised threshold test. The previous per-row comprehension called
    # `dataset.to_numpy()` once per row, which is quadratic in the row count.
    isInB = (dataset[2].to_numpy() <= 170.5).reshape(-1, 1)
    X = dataset.drop([2, 3], axis=1).to_numpy()
    X = np.append(X, isInB, axis=1)

    # Keep a random 5% of the rows; the split is not seeded.
    X, _, y, _ = train_test_split(X, y, test_size=0.95)

    return X, y

def bias_skin(data, labels):
    """Keep only the samples whose bias indicator equals 1.

    Args:
        data: 2-D array whose last column is the indicator flag, laid out
            as returned by `skin`.
        labels: array of labels aligned with the rows of `data`.

    Returns:
        (X, y): the selected rows of `data` without the indicator column,
        and the matching labels.
    """
    keep = data[:, -1] == 1
    # Drop the flag column by position. The previous `len(data[0])` lookup
    # raised IndexError when `data` had no rows.
    X = data[keep][:, :-1]
    y = labels[keep]
    return X, y

In [6]:
# Query strategies to compare, keyed by the name used in plot legends and
# result file names. Insertion order is the order in which they are run.
query_methods = dict(
    random=random_batch,
    uncertainty=batch.uncertainty_batch_sampling,
    uncertainty_id=uncertainty_id,
    fgm=fgm,
    # deepfool is **slow**
    deepfool=deepfool,
)

In [7]:
# Load every data set exactly once. Previously each biased entry called its
# loader a second time, which repeated the download / file read and, for the
# loaders that draw a random sub-sample, derived the biased variant from a
# different sample than the one stored under the unbiased name.
test_datasets = {
    fn.__name__: fn()
    for fn in [banknote, haberman, car, cardio, skin, shuttle, abalone, digits]
}

# Biased variants are derived from the data loaded above; they are appended
# after the unbiased entries, in the same order as before.
test_datasets.update({
    f"{name} biased": bias_fn(*test_datasets[name])
    for name, bias_fn in [
        ("banknote", bias_banknote),
        ("car", bias_car),
        ("cardio", bias_cardio),
        ("skin", bias_skin),
        ("shuttle", bias_shuttle),
        ("abalone", bias_abalone),
        #("digits", bias_digits),
    ]
})

  dataset1 = pd.read_csv('Imitate/Datasets/shuttle.trn', header=None,sep='\s')
  dataset2 = pd.read_csv('Imitate/Datasets/shuttle.tst', header=None,sep='\s')


**Variance**

A single run of deepfool would probably be enough, as the variance between runs seems pretty low.

In [None]:
# Annoying hack so that the progress bars disappear as they're supposed to
import os

# `IPython.display` is the supported import location for these helpers;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display
display(HTML("""
<style>
.p-Widget.jp-OutputPrompt.jp-OutputArea-prompt:empty {
  padding: 0;
  border: 0;
}
.p-Widget.jp-RenderedText.jp-OutputArea-output pre:empty {
  display: none;
}
</style>
"""))

import libutil
reload(libutil)
# `reload` does not touch names bound with `from libutil import ...`, so
# rebind the class explicitly to actually use the reloaded code.
ProgressParallel = libutil.ProgressParallel

# The result CSVs are written here; make sure the directory exists.
os.makedirs("Experiments", exist_ok=True)

# Query strategies that get fewer repetitions. The previous condition
# (`!= "deepfool" or != "fgm"`) was always true, so these also ran 10 times.
_reduced_run_methods = ("deepfool", "fgm")
_metric_labels = ["Accuracy", "F1", "AUC ROC"]


def _run_active_learning(X_labelled, X_unlabelled, y_labelled, y_oracle, X_test, y_test, method):
    """Run one active-learning experiment with a fresh `MyActiveLearner`.

    All data is passed in as arguments (nothing is captured from the
    enclosing scope) so the call can be shipped to a joblib worker.
    Returns whatever `active_learn2` returns.
    """
    return MyActiveLearner().active_learn2(
        X_labelled, X_unlabelled, y_labelled, y_oracle, X_test, y_test, method
    )


for dataset_name, (X, y) in tqdm(test_datasets.items(), desc="Datasets", leave=False):
    X_labelled, X_unlabelled, y_labelled, y_oracle, X_test, y_test = active_split(X, y, labeled_size=0.1)

    # One subplot per metric; every query method is drawn into all of them.
    fig, axes = plt.subplots(1, len(_metric_labels), figsize=(18, 4))
    fig.suptitle(f"{dataset_name}")
    for ax, metric_label in zip(axes.flatten(), _metric_labels):
        ax.set_xlabel("Instances")
        ax.set_ylabel(metric_label)

    for method_name, method in tqdm(query_methods.items(), desc="Method", leave=False):
        #try:
            #metrics = pd.read_csv(f"Experiments/experiment_real_{dataset_name}_method_{method_name}.csv")
            #stderr = pd.read_csv(f"Experiments/experiment_real_{dataset_name}_method_{method_name}_stderr.csv")
        #except FileNotFoundError:
        n_runs = 4 if method_name in _reduced_run_methods else 10
        runs = ProgressParallel(n_jobs=4, total=n_runs, desc="Run", leave=False)(
            delayed(_run_active_learning)(
                X_labelled, X_unlabelled, y_labelled, y_oracle, X_test, y_test,
                partial(method, n_instances=10),
            )
            for _ in range(n_runs)
        )
        # Combine the repeated runs; `average` yields the averaged metrics and
        # their standard error.
        # NOTE(review): `metrics` shadows the `sklearn.metrics` module imported
        # at the top of the notebook.
        metrics, stderr = runs[0].average(runs[1:])
        metrics.to_csv(f"Experiments/experiment_real_{dataset_name}_method_{method_name}.csv")
        stderr.to_csv(f"Experiments/experiment_real_{dataset_name}_method_{method_name}_stderr.csv")

        for i, ax in enumerate(axes.flatten()):
            # Column 0 is skipped; metric i is read from column 1 + i.
            # Only the first subplot carries a label so the figure legend
            # lists every method once.
            ax.errorbar(
                metrics['x'],
                metrics.iloc[:, 1 + i],
                yerr=stderr.iloc[:, 1 + i],
                label=f"{method_name}" if i == 0 else "",
            )

    fig.legend()

## Robustness

In [None]:
# Only these two query strategies are used for the robustness experiment;
# the other strategies are left disabled below.
query_methods = dict(
    #random=random_batch,
    #uncertainty=batch.uncertainty_batch_sampling,
    #uncertainty_id=uncertainty_id,
    fgm=fgm,
    # deepfool is **slow**
    deepfool=deepfool,
)

In [None]:
# Annoying hack so that the progress bars disappear as they're supposed to
import os

# `IPython.display` is the supported import location for these helpers;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display
# The score functions were referenced below without ever being imported.
# NOTE(review): assumes the scikit-learn implementations are the intended
# ones - confirm against what `MyActiveLearner(metrics=...)` expects.
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
display(HTML("""
<style>
.p-Widget.jp-OutputPrompt.jp-OutputArea-prompt:empty {
  padding: 0;
  border: 0;
}
.p-Widget.jp-RenderedText.jp-OutputArea-output pre:empty {
  display: none;
}
</style>
"""))

import libutil
reload(libutil)
# `reload` does not touch names bound with `from libutil import ...`, so
# rebind the class explicitly to actually use the reloaded code.
ProgressParallel = libutil.ProgressParallel

# The result CSVs are written here; make sure the directory exists.
os.makedirs("Experiments", exist_ok=True)

# Query strategies that get fewer repetitions. The previous condition
# (`!= "deepfool" or != "fgm"`) was always true, so these also ran 10 times.
_reduced_run_methods = ("deepfool", "fgm")
_robustness_metric_labels = ["Accuracy", "F1", "AUC ROC", "Robustness"]


def _run_robustness(X_labelled, X_unlabelled, y_labelled, y_oracle, X_test, y_test, method, teach_adversarial):
    """Run one active-learning experiment that also records robustness.

    All data is passed in as arguments (nothing is captured from the
    enclosing scope) so the call can be shipped to a joblib worker.
    Returns whatever `active_learn2` returns.
    """
    learner = MyActiveLearner(metrics=[accuracy_score, f1_score, roc_auc_score, empirical_robustness])
    return learner.active_learn2(
        X_labelled, X_unlabelled, y_labelled, y_oracle, X_test, y_test, method,
        teach_adversarial=teach_adversarial,
    )


for dataset_name, (X, y) in tqdm(test_datasets.items(), desc="Datasets", leave=False):
    X_labelled, X_unlabelled, y_labelled, y_oracle, X_test, y_test = active_split(X, y, labeled_size=0.1)

    # One subplot per metric; every configuration is drawn into all of them.
    fig, axes = plt.subplots(1, len(_robustness_metric_labels), figsize=(18, 4))
    fig.suptitle(f"{dataset_name}")
    for ax, metric_label in zip(axes.flatten(), _robustness_metric_labels):
        ax.set_xlabel("Instances")
        ax.set_ylabel(metric_label)

    for method_name, method in tqdm(query_methods.items(), desc="Method", leave=False):
        #try:
            #metrics = pd.read_csv(f"Experiments/experiment_real_{dataset_name}_method_{method_name}.csv")
            #stderr = pd.read_csv(f"Experiments/experiment_real_{dataset_name}_method_{method_name}_stderr.csv")
        #except FileNotFoundError:
        n_runs = 4 if method_name in _reduced_run_methods else 10
        for teach_adversarial in tqdm([True, False], desc="Teach adversarial", leave=False):
            runs = ProgressParallel(n_jobs=4, total=n_runs, desc="Run", leave=False)(
                delayed(_run_robustness)(
                    X_labelled, X_unlabelled, y_labelled, y_oracle, X_test, y_test,
                    partial(method, n_instances=10), teach_adversarial,
                )
                for _ in range(n_runs)
            )
            # Combine the repeated runs; `average` yields the averaged metrics
            # and their standard error.
            metrics, stderr = runs[0].average(runs[1:])

            # The file name includes the teach_adversarial setting. Before,
            # both settings wrote the same paths, so the second run overwrote
            # the first and also the files of the non-robustness experiment.
            result_stem = (
                f"Experiments/experiment_real_{dataset_name}_method_{method_name}"
                f"_teach_adversarial_{teach_adversarial}"
            )
            metrics.to_csv(f"{result_stem}.csv")
            stderr.to_csv(f"{result_stem}_stderr.csv")

            for i, ax in enumerate(axes.flatten()):
                # Column 0 is skipped; metric i is read from column 1 + i.
                # Only the first subplot carries a label so the figure legend
                # lists every configuration once; the label names the
                # teach_adversarial setting so the two curves can be told apart.
                ax.errorbar(
                    metrics['x'],
                    metrics.iloc[:, 1 + i],
                    yerr=stderr.iloc[:, 1 + i],
                    label=f"{method_name} (teach_adversarial={teach_adversarial})" if i == 0 else "",
                )

    fig.legend()