# DR CI for Comparing Abstaining Classifiers on CIFAR-100

Using pretrained features from a VGG-16 model.

https://github.com/chenyaofo/pytorch-cifar-models

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os.path
from collections import OrderedDict

from tqdm.notebook import tqdm, trange
from joblib import Parallel, delayed
import time
import itertools as it

import comparecast as cc
import comparecast_causal as c3

import warnings
warnings.filterwarnings('ignore')

[MLENS] backend: threading


In [2]:
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline

In [3]:
# for images
import torch
import torch.nn.functional as F

from torchvision import datasets
from torchvision import transforms as T

Configurations for evaluation:

In [4]:
# Fixed seed so that draws from `rng` are repeatable across kernel restarts.
RANDOM_SEED = 1234567890

# Shared NumPy Generator; handed to c3.run_experiment(..., rng=rng) further down.
# Note: train_test_split in the classifier cell uses its own random_state.
rng = np.random.default_rng(RANDOM_SEED)

In [5]:
# Significance level passed as `alpha` to c3.run_experiment (0.05 -> 95% CI).
ALPHA = 0.05
# Scoring-rule name; resolved with cc.get_scoring_rule() and passed to c3 helpers.
SCORING_RULE = "brier"
EPSILON = 0.2       # how much to clip large values of estimated abstention probability (used as `eps` and `clip_pi` below)
NOISE_LEVEL = 0.0   # label noise
N_TRIALS = 100      # repetitions
N_JOBS = -1         # number of CPU cores for parallel processing (presumably -1 = all cores, joblib convention -- confirm where used)
MIXED_EST = False   # use mixed estimation (also selects the "_mixed" plots sub-directory)

In [6]:
# Plot output directory, keyed by estimation mode, scoring rule and clipping level.
_plots_dir_template = "./plots/cifar100_vgg16/experiment1{}/{}/eps{:g}"
_mixed_suffix = "_mixed" if MIXED_EST else ""
PLOTS_DIR = _plots_dir_template.format(_mixed_suffix, SCORING_RULE, EPSILON)
os.makedirs(PLOTS_DIR, exist_ok=True)  # no error if the directory already exists

# Plot theme: serif font available on servers & colab; otherwise use "DejaVu Serif"
FONT = "Liberation Serif"
cc.set_theme(style="whitegrid", font=FONT)

## CIFAR-100 features

Extracted using:

```python
import torch
from torchvision import datasets
from torchvision import transforms as T

model = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar100_vgg16_bn", pretrained=True).eval()

transform = T.Compose([T.ToTensor(), 
                       T.Normalize(mean=[0.4914, 0.4822, 0.4465], 
                                   std=[0.2023, 0.1994, 0.2010])])
val_data = datasets.CIFAR100("./data/cifar100", download=True, train=False, transform=transform)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=128, shuffle=False)

with torch.no_grad():
    for inputs_b, labels_b in tqdm(val_loader):
        features = model.features(inputs_b).squeeze().numpy()
```

In [7]:
# Pre-extracted VGG-16 features and the matching labels for the evaluation set.
_features_path = "./data/cifar100/cifar100_vgg16_features.npy"
_labels_path = "./data/cifar100/cifar100_labels.npy"
X_eval, y_eval = np.load(_features_path), np.load(_labels_path)
print(X_eval.shape, y_eval.shape)

(10000, 512) (10000,)


## Classifiers

We use the VGG model's features but train the final layer separately and also vary the abstention mechanisms.

In [8]:
# Create a classifier
clf = c3.get_learner("c", "Logistic", scaler=None, max_iter=100)   # lbfgs

# Split eval again into 1:1 train:validation
indices = np.arange(len(y_eval))
X_train, X_valid, y_train, y_valid, idx_train, idx_valid = train_test_split(
    X_eval, y_eval, indices, test_size=0.5, shuffle=True, random_state=101,
)

# 1. "ground truth"
preds_vgg = np.load("./data/cifar100/cifar100_vgg16_predictions.npy")  # pre-softmax
preds_vgg = F.softmax(torch.tensor(preds_vgg), dim=1).numpy()[idx_valid]
score_fn = cc.get_scoring_rule(SCORING_RULE)
print("Original VGG16")
print("Accuracy:", np.mean(preds_vgg.argmax(-1) == y_valid))
print("Score:", score_fn(preds_vgg, y_valid).mean())

Original VGG16
Accuracy: 0.7106
Score: 0.7576894299713142


In [9]:
# 2. trained final layer
print("training:", clf)
t0 = time.time()
clf.fit(X_train, y_train)
t1 = time.time()
print("elapsed time: {:.2f}s".format(t1 - t0))

# Predict the value of the digit on the test subset
predictions = clf.predict_proba(X_valid)
print("Retrained VGG")
print("Accuracy:", np.mean(predictions.argmax(-1) == y_valid))
print("Score:", score_fn(predictions, y_valid).mean())
print("Mean SR:", predictions.max(-1).mean())

training: Pipeline(steps=[('logisticregression', LogisticRegression())])
elapsed time: 5.75s
Retrained VGG
Accuracy: 0.701
Score: 0.7869779531986764
Mean SR: 0.7805081505716034


## Abstaining classifier comparisons on the "test" set

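We compare pairs of abstaining classifiers: either the same classifier under two different abstention rules, or two different classifiers (the original VGG-16 vs. the retrained final layer) under the same abstention rule.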

In [10]:
def compute_preds_cifar(
    X, y, predictions,
    confidence_metric="sr", stochastic=True, threshold=0.7, eps=EPSILON, plot=False,
):
    """Apply an abstention rule to `predictions` and optionally print a score summary.

    Args:
        X: not used in the body; kept so every call site shares one signature.
        y: labels, forwarded to ``c3.compute_scores``.
        predictions: per-sample predictions (class probabilities at the call
            sites in this notebook), forwarded to ``c3.predict_or_abstain``.
        confidence_metric: forwarded to ``c3.predict_or_abstain``.
        stochastic: forwarded to ``c3.predict_or_abstain``.
        threshold: forwarded to ``c3.predict_or_abstain``.
        eps: forwarded to ``c3.predict_or_abstain``.
        plot: if true, print each score as ``mean +/- se``. Despite the name,
            no figure is produced.

    Returns:
        Tuple ``(predictions, abstentions, confidence, pred_labels)``, where
        ``predictions`` is the input passed through unchanged and the rest
        come from ``c3.predict_or_abstain``.
    """
    abstain_kwargs = dict(
        confidence_metric=confidence_metric,
        stochastic=stochastic,
        threshold=threshold,
        eps=eps,
    )
    # The first element returned by predict_or_abstain is not needed here.
    _, abstain_flags, conf_values, labels_hat = c3.predict_or_abstain(
        predictions, **abstain_kwargs
    )
    # Scores are computed unconditionally (as before), but only printed on request.
    score_table = c3.compute_scores(
        predictions, abstain_flags, y,
        scoring_rule=SCORING_RULE, compute_se=True,
    )

    if plot:
        header = f"[{confidence_metric}, stochastic: {stochastic}, eps: {eps}]"
        print(header)
        for metric_name, (mean, stderr) in score_table.items():
            print("{}: {:.3f} +/- {:.3f}".format(metric_name, mean, stderr))
        print("-" * 40)

    return predictions, abstain_flags, conf_values, labels_hat

In [11]:
# Each scenario is a pair ((name_A, config_A), (name_B, config_B)); each config
# holds the keyword arguments later unpacked into compute_preds_cifar().
scenarios = [
    # cf(A)=cf(B), deterministic
    (
        ("SRDet0.8", {
            "predictions": predictions,
            "confidence_metric": "sr",
            "stochastic": False,
            "threshold": 0.8,
            "eps": EPSILON,
        }),
        ("SRDet0.5", {
            "predictions": predictions,
            "confidence_metric": "sr",
            "stochastic": False,
            "threshold": 0.5,
            "eps": EPSILON,
        }),
    ),
    # cf(A)=cf(B), stochastic
    (
        ("SRStoc", {
            "predictions": predictions,
            "confidence_metric": "sr",
            "stochastic": True,
            "eps": EPSILON,
        }),
        ("GiniStoc", {
            "predictions": predictions,
            "confidence_metric": "impurity",
            "stochastic": True,
            "eps": EPSILON,
        }),
    ),
    # cf(A)>cf(B), stochastic
    (
        ("VGGSRStoc", {
            "predictions": preds_vgg,
            "confidence_metric": "sr",
            "stochastic": True,
            "eps": EPSILON,
        }),
        ("LinearSRStoc", {
            "predictions": predictions,
            "confidence_metric": "sr",
            "stochastic": True,
            "eps": EPSILON,
        }),
    ),
]

In [12]:
# Quick check: the same retrained classifier under two stochastic abstention
# rules that differ only in the confidence metric ("sr" vs "impurity").
clfA, clfB = "Logistic_SR", "Logistic_Gini"
preds_AB = {
    clf_name: compute_preds_cifar(
        X_valid, y_valid, predictions,
        confidence_metric=metric, stochastic=True,
        eps=EPSILON, plot=True,
    )
    for clf_name, metric in ((clfA, "sr"), (clfB, "impurity"))
}

[sr, stochastic: True, eps: 0.2]
selective_score: 0.822 +/- 0.005
coverage: 0.677 +/- 0.007
oracle_cf_score: 0.787 +/- 0.004
----------------------------------------
[impurity, stochastic: True, eps: 0.2]
selective_score: 0.834 +/- 0.005
coverage: 0.620 +/- 0.007
oracle_cf_score: 0.787 +/- 0.004
----------------------------------------


In [13]:
learners = {
    "Linear": dict(model="Linear", scaler=None),
}

# `drcis` / `summary` keep their original learner-keyed layout for backward
# compatibility, but a learner key can only hold ONE result, so each scenario
# overwrites the previous one. The `*_by_scenario` dicts are keyed by
# (learner_name, scenario) and therefore retain every result.
drcis = OrderedDict()
summary = OrderedDict()
drcis_by_scenario = OrderedDict()
summary_by_scenario = OrderedDict()
for learner_name, learner_config in learners.items():
    for (clfA, configA), (clfB, configB) in scenarios:
        scenario = "/".join([clfA, clfB])
        print(scenario)
        # Each entry is (predictions, abstentions, confidence, pred_labels)
        preds_AB = {
            clfA: compute_preds_cifar(X_valid, y_valid, plot=True, **configA),
            clfB: compute_preds_cifar(X_valid, y_valid, plot=True, **configB),
        }
        # NOTE(review): mixed_estimation is hard-coded to False although the
        # config cell defines MIXED_EST (which also drives PLOTS_DIR) --
        # confirm whether MIXED_EST should be passed here instead.
        drci, scenario_summary = c3.run_experiment(
            X=X_valid,
            y=y_valid,
            predictions=preds_AB[clfA][0],
            abstentions=preds_AB[clfA][1],
            predictions_b=preds_AB[clfB][0],
            abstentions_b=preds_AB[clfB][1],
            estimator="dr",
            learner=learner_config,
            mixed_estimation=False,
            mixed_coef=None,
            alpha=ALPHA,
            scoring_rule=SCORING_RULE,
            clip_pi=EPSILON,
            rng=rng,
            verbose=True,
        )
        # Full record: one entry per (learner, scenario) pair.
        drcis_by_scenario[(learner_name, scenario)] = drci
        summary_by_scenario[(learner_name, scenario)] = scenario_summary
        # Legacy view: most recent scenario for this learner (original behavior).
        drcis[learner_name] = drci
        summary[learner_name] = scenario_summary

SRDet0.8/SRDet0.5
[sr, stochastic: False, eps: 0.2]
selective_score: 0.899 +/- 0.005
coverage: 0.620 +/- 0.007
oracle_cf_score: 0.787 +/- 0.004
----------------------------------------
[sr, stochastic: False, eps: 0.2]
selective_score: 0.839 +/- 0.005
coverage: 0.819 +/- 0.005
oracle_cf_score: 0.787 +/- 0.004
----------------------------------------
95% CI for CF score difference [estimator: dr, learner: {'model': 'Linear', 'scaler': None}]
pi [split=0] evaluation: accuracy 0.83606
mu0 [split=0] evaluation: MSE 0.07894
pi_b [split=0] evaluation: accuracy 0.84763
mu0_b [split=0] evaluation: MSE 0.08153
pi [split=1] evaluation: accuracy 0.81588
mu0 [split=1] evaluation: MSE 0.07613
pi_b [split=1] evaluation: accuracy 0.84356
mu0_b [split=1] evaluation: MSE 0.08051
Target CF score difference: 0.00000
Cross-fit estimate: 0.00672
Asymptotic CI: (-0.00455, 0.01799)
CI width: 0.02255
Contains oracle CF score: True
Rejection (for H0: diff=0): False
----------------------------------------
SRSt