This notebook creates a synthetic experiment to demonstrate annotation artifacts.

#### Load up the requirements

In [458]:
from joblib import dump, load
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn import linear_model
%matplotlib inline

np.random.seed(0)

#### Set dataset sizes.

In [459]:
ntrain = 30000
ntest = 300
nfeats = 300
nclasses = 3

assert nfeats % nclasses == 0 # We want to make sure we have three (orthonormal) clusters. Not strictly required.
assert ntrain % nclasses == 0 # Ensures balanced train data.

#### Functions

In [477]:
def accuracy(pred, ref):
    correct = sum((pred == ref).astype('int'))
    return correct/len(ref)


def shuffle_dataset(x, y):
    floaty = np.expand_dims(y, axis=1).astype('float')
    xy = np.concatenate((x, floaty), axis=1)
    np.random.shuffle(xy)
    x = xy.T[:-1].T
    y = xy.T[-1].T.astype('int')
    return x, y


def create_ex_uniform_random(shuffle=True):
    """
    Sample from uniform random distribution to create train and test sets.
    """
    X = np.random.rand(ntrain, nfeats)
    Y = np.random.randint(0, nclasses, ntrain)
    xtest = np.random.rand(ntest, nfeats)
    ytest = np.random.randint(0, nclasses, ntest)
    
    if shuffle:
        X, Y = shuffle_dataset(X, Y)
    
    return X, Y, xtest, ytest


def create_ex_multivariate_normal(nfeats, ntrain, ntest, shuffle=True):
    """
    Sample from 3 different multivariate Gaussian distributions to create train and test sets.
    """
    covariance = np.diag(np.full(nfeats, 1))  # Use the same for all 3 clusters.
    mean = np.full(nfeats, 0)
    mean[nfeats // nclasses] = 1  # To ensure orthonormality of clusters.
    
    X = np.random.multivariate_normal(mean, covariance, ntrain//nclasses)
    Y = np.full(ntrain//nclasses, 0)
    xtest = np.random.multivariate_normal(mean, covariance, ntest//nclasses) 
    ytest = np.full(ntest//nclasses, 0)
    
    for i in range(1, nclasses):
        mean = np.full(nfeats, 0)
        mean[(i+1) * nfeats // nclasses - 1] = 1
    
        ex = np.random.multivariate_normal(mean, covariance, ntrain//nclasses)
        X = np.concatenate((X, ex), axis=0)
        
        label = np.full(ntrain//nclasses, i)
        Y = np.concatenate((Y, label), axis=0)
        
        test_ex = np.random.multivariate_normal(mean, covariance, ntest//nclasses)
        xtest = np.concatenate((xtest, test_ex), axis=0)
        
        test_y = np.full(ntest//nclasses, i)
        ytest = np.concatenate((ytest, test_y), axis=0)
        
    # Shuffle the train.
    if shuffle:
        X, Y = shuffle_dataset(X, Y)
    return X, Y, xtest, ytest

In [494]:
def introduce_cheat_feature(x, y, randomize):
    if randomize:
        random_feats = int(len(x) * randomize)//3
        mask0 = np.full(random_feats, 0)
        mask1 = np.full(random_feats, 1)
        mask2 = np.full(random_feats, 2)

        mask = np.concatenate((mask0, mask1, mask2))
        np.random.shuffle(mask)
        
        masked_y = np.copy(y)
        masked_y[:(len(mask))] = mask
    else:
        masked_y = y

    ycheatfeat = np.expand_dims(masked_y.astype('float'), axis=1)
    xcheatfeat = np.concatenate((x, ycheatfeat), axis=1)
    return xcheatfeat

# introduce_cheat_feature(np.random.rand(18, 3), np.random.randint(0, 3, 18), 1.0)

#### Create train and test datasets.

In [503]:
# X, Y, xtest, ytest = create_ex_uniform_random()

xtrain, ytrain, xtest, ytest = create_ex_multivariate_normal(nfeats, ntrain, ntest, shuffle=False)
print(f"X: {len(xtrain)}, Y:{len(ytrain)}, x-test:{len(xtest)}, y-test:{len(ytest)}")
"Num features", nfeats

X: 30000, Y:30000, x-test:300, y-test:300


('Num features', 300)

### Baseline
#### Train and test a linear model

In [508]:
if os.path.exists('x/baseline.joblib'):
    clf = load('baseline.joblib')
else:
    clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
    clf.fit(xtrain, ytrain)
    dump(clf, 'baseline.joblib') 

ypred = clf.predict(xtest)
f"BASELINE test acc: {accuracy(ypred, ytest):.4} train acc: {accuracy(clf.predict(xtrain), ytrain):.4}"

'BASELINE test acc: 0.6033 train acc: 0.6066'

### Cheating
#### Train a cheater model, which uses labels as features

In [509]:
cheat_train = introduce_cheat_feature(xtrain, ytrain, randomize=0.)

if os.path.exists('x/cheater.joblib'):
    cheater = load('cheater.joblib')
else:
    cheater = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
    cheater.fit(cheat_train, ytrain)
    dump(cheater, 'cheater.joblib') 
    
f"CHEATER train acc: {accuracy(cheater.predict(cheat_train), ytrain):.4}"

'CHEATER train acc: 0.9908'

#### At various levels of randomization of the cheat feature, see test performance.

In [512]:
print("CHEATER randomization: test acc")
for r in [0., 0.2, 0.4, 0.6, 0.8, 0.9, 1.]:
    cheat_test = introduce_cheat_feature(xtest, ytest, randomize=r)

    cheat_prediction = cheater.predict(cheat_test)
    print(f"{r:21}: {accuracy(cheat_prediction, ytest):.4}")

CHEATER randomization: test acc
                  0.0: 0.9967
                  0.2: 0.87
                  0.4: 0.7467
                  0.6: 0.58
                  0.8: 0.4567
                  0.9: 0.4633
                  1.0: 0.3567


### Cheating some of the time
#### This model cheats some of the time; 50% of the time, cheat features are identical to the true class

In [513]:
partial_cheat_train = introduce_cheat_feature(X, Y, randomize=.5)

if os.path.exists('x/partial_cheater.joblib'):
    partial_cheater = load('partial_cheater.joblib')
else:
    partial_cheater = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
    partial_cheater.fit(partial_cheat_train, Y)
    dump(partial_cheater, 'partial_cheater.joblib') 
f"PARTIAL-CHEATER train acc: {accuracy(partial_cheater.predict(partial_cheat_train), ytrain):.4}"

'PARTIAL-CHEATER train acc: 0.7264'

#### At various levels of randomization of the cheat feature, see test performance.

In [515]:
print("PARTIAL-CHEATER randomization: test acc")
for r in [0., 0.2, 0.4, 0.6, 0.8, 0.9, 1.]:
    partial_cheat_test = introduce_cheat_feature(xtest, ytest, randomize=r)

    partial_cheat_pred = partial_cheater.predict(partial_cheat_test)
    print(f"{r:29}: {accuracy(partial_cheat_pred, ytest):.4}")

PARTIAL-CHEATER randomization: test acc
                          0.0: 0.8
                          0.2: 0.7733
                          0.4: 0.75
                          0.6: 0.7333
                          0.8: 0.6333
                          0.9: 0.56
                          1.0: 0.5033


### Sampling cheat features (artifacts)

In [516]:
def introduce_cheat_features_from_gaussian(probs, ntotal, mean, covariance=0.1):
    assert len(probs) == nclasses

    n = ntotal//nclasses  # number of examples per class.
    features = np.array([])
    for p in probs:
        cheat_feats = np.random.normal(mean, covariance, int(p*n))
        rest = n - len(cheat_feats)
        cheat_feats = np.concatenate((cheat_feats, np.random.rand(rest)), axis=0)
        np.random.shuffle(cheat_feats)
        features = np.concatenate((features, cheat_feats), axis=0)
    assert len(features) == ntotal
    return np.expand_dims(features, axis=1)

# introduce_cheat_features_from_gaussian(probs=[0.5, 0.25, 0.25], ntotal=18, mean=500.0)

#### Create training features with cheating, based on a multinomial

In [517]:
prob = [0.1, 0.1, 0.1]  # base probability for 
cheater_multin_feats_train = np.copy(xtrain)

for i, mean in enumerate([10, 100, -100]):
    prob_class = np.copy(prob)
    prob_class[i] = 0.8
    cheat_feats = introduce_cheat_features_from_gaussian(probs=prob_class, ntotal=ntrain, mean=mean)
    cheater_multin_feats_train = np.concatenate((cheater_multin_feats_train, cheat_feats), axis=1)


cheater_dist = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
cheater_dist.fit(cheater_multin_feats_train, Y)

f"MULTINOMIAL-CHEATER train acc: {accuracy(cheater_dist.predict(cheater_multin_feats_train), ytrain)}"


'PARTIAL-CHEATER train acc: 0.8427333333333333'

#### Different probs for cheater distribution

In [521]:
print("MULTINOMIAL-CHEATER randomization: test acc")
for p in [0., 0.2, 0.4, 0.6, 0.8, 0.9, 1.]:
    cheater_feats_test = np.copy(xtest)
    for i, mean in enumerate([10, 100, -100]):
        prob_class = np.copy(prob)
        prob_class[i] = p
        class_feats_test = introduce_cheat_features_from_gaussian(probs=prob_class, ntotal=ntest, mean=mean)
        cheater_feats_test = np.concatenate((cheater_feats_test, class_feats_test), axis=1)
    
    print(f"{p:33}: {accuracy(cheater_dist.predict(cheater_feats_test), ytest):.4}")

MULTINOMIAL-CHEATER randomization: test acc
                              0.0: 0.5
                              0.2: 0.61
                              0.4: 0.6733
                              0.6: 0.7567
                              0.8: 0.8433
                              0.9: 0.88
                              1.0: 0.92
