This notebook creates a synthetic experiment to demonstrate annotation artifacts.

#### Load up the requirements

In [402]:
from joblib import dump, load
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn import linear_model
%matplotlib inline

np.random.seed(0)

#### Set dataset sizes.

In [403]:
# Dataset dimensions. ntrain and ntest are total example counts across all classes.
ntrain = 30000
ntest = 300
nfeats = 300
nclasses = 3

# We want to make sure we have three (orthonormal) clusters. Not strictly required.
assert nfeats % nclasses == 0, "nfeats must be divisible by nclasses"
# The Gaussian generator draws ntrain // nclasses and ntest // nclasses examples per class,
# so a remainder would silently shrink the split. Divisibility also keeps the classes balanced.
assert ntrain % nclasses == 0, "ntrain must be divisible by nclasses"
assert ntest % nclasses == 0, "ntest must be divisible by nclasses"

#### Functions

In [404]:
def accuracy(pred, ref):
    """
    Fraction of predictions that match the reference labels.

    Args:
        pred: array-like of predicted labels.
        ref: array-like of reference labels, same shape as `pred`.

    Returns:
        The share of positions where `pred` equals `ref`, as a NumPy float in [0, 1].

    Raises:
        ValueError: if `pred` and `ref` have different shapes.
        ZeroDivisionError: if the inputs are empty (accuracy is undefined).
    """
    pred = np.asarray(pred)
    ref = np.asarray(ref)

    if pred.shape != ref.shape:
        raise ValueError(f"shape mismatch: pred {pred.shape} vs ref {ref.shape}")
    if ref.size == 0:
        # Keep the exception type that dividing by len(ref) == 0 would produce.
        raise ZeroDivisionError("accuracy is undefined for empty input")

    # The mean of the boolean match mask is the hit rate; the reduction stays inside NumPy
    # instead of iterating over the array element by element.
    return np.mean(pred == ref)


def create_ex_uniform_random():
    """
    Build train and test sets from uniform random noise.

    Features are drawn with np.random.rand (uniform over [0, 1)) and labels with
    np.random.randint over {0, ..., nclasses - 1}, so labels are independent of the features.
    Sizes come from the module-level ntrain, ntest, nfeats and nclasses.

    Returns:
        X, Y, xtest, ytest: train features, train labels, test features, test labels.
    """
    def _draw_split(n_examples):
        # Features first, then labels: the global RNG is consumed in this order.
        feats = np.random.rand(n_examples, nfeats)
        labels = np.random.randint(0, nclasses, n_examples)
        return feats, labels

    X, Y = _draw_split(ntrain)
    xtest, ytest = _draw_split(ntest)
    return X, Y, xtest, ytest


def create_ex_multivariate_normal(nfeats, ntrain, ntest, n_classes=None):
    """
    Sample one multivariate Gaussian cluster per class to create train and test sets.

    All clusters share the diagonal covariance 10 * I. The mean of class i is the unit vector
    along feature (i + 1) * nfeats // n_classes - 1, i.e. the last feature of the i-th block of
    features, so the class means are distinct, mutually orthogonal unit vectors.

    Args:
        nfeats: number of features per example.
        ntrain: total number of train examples; ntrain // n_classes are drawn per class.
        ntest: total number of test examples; ntest // n_classes are drawn per class.
        n_classes: number of clusters. Defaults to the module-level `nclasses`.

    Returns:
        X, Y, xtest, ytest. The train rows are shuffled (features and labels together);
        the test rows are left grouped by class in increasing label order.

    Raises:
        ValueError: if n_classes is smaller than 1 or larger than nfeats, in which case the
            class means could not sit on distinct features.
    """
    k = nclasses if n_classes is None else n_classes
    if k < 1 or k > nfeats:
        raise ValueError(f"need 1 <= n_classes <= nfeats, got n_classes={k}, nfeats={nfeats}")

    per_class_train = ntrain // k
    per_class_test = ntest // k
    covariance = np.diag(np.full(nfeats, 10))  # Use the same for all clusters.

    train_x, train_y, test_x, test_y = [], [], [], []
    for i in range(k):
        # Same index rule for every class, including class 0, so that no two classes can end
        # up sharing a mean (which happened for classes 0 and 1 when nfeats == n_classes).
        mean = np.zeros(nfeats)
        mean[(i + 1) * nfeats // k - 1] = 1

        # Draw train before test for each class.
        train_x.append(np.random.multivariate_normal(mean, covariance, per_class_train))
        train_y.append(np.full(per_class_train, i))
        test_x.append(np.random.multivariate_normal(mean, covariance, per_class_test))
        test_y.append(np.full(per_class_test, i))

    X = np.concatenate(train_x, axis=0)
    Y = np.concatenate(train_y, axis=0)
    xtest = np.concatenate(test_x, axis=0)
    ytest = np.concatenate(test_y, axis=0)

    # Shuffle the train. One shared permutation keeps every row paired with its label without
    # packing the labels into the float feature matrix and casting them back.
    order = np.random.permutation(len(Y))
    return X[order], Y[order], xtest, ytest


def introduce_cheat_feature(x, y, randomize, n_classes=3):
    """
    Append the label as an extra ("cheat") feature column, optionally corrupting part of it.

    Args:
        x: 2-D feature array, one row per example.
        y: 1-D label array, one entry per row of `x`.
        randomize: fraction in [0, 1] of examples whose cheat feature is replaced by a class
            chosen independently of the true label. A falsy value (0, 0., None) copies the
            labels unchanged.
        n_classes: number of classes the replacement labels are drawn from.

    Returns:
        A new array equal to `x` with one extra trailing column holding the (possibly
        corrupted) labels as floats. `x` and `y` are not modified.

    Raises:
        ValueError: if `randomize` is outside [0, 1] or `n_classes` is smaller than 1.

    Note:
        The first int(len(x) * randomize) rows are the ones that get replaced, and the
        replacement labels are class-balanced and then shuffled, so a replaced entry can still
        equal the true label by chance.
    """
    if randomize:
        if not 0 <= randomize <= 1:
            raise ValueError(f"randomize must be in [0, 1], got {randomize}")
        if n_classes < 1:
            raise ValueError(f"n_classes must be >= 1, got {n_classes}")

        n_random = int(len(x) * randomize)
        per_class, leftover = divmod(n_random, n_classes)

        # An equal block of every class, plus classes 0..leftover-1 once more so that all
        # n_random rows are covered even when the count does not divide evenly.
        mask = np.concatenate((np.repeat(np.arange(n_classes), per_class), np.arange(leftover)))
        np.random.shuffle(mask)

        masked_y = np.copy(y)
        masked_y[:n_random] = mask
    else:
        masked_y = y

    # astype returns a fresh array, so the caller's labels are never aliased in the result.
    ycheatfeat = np.expand_dims(np.asarray(masked_y).astype('float'), axis=1)
    return np.concatenate((x, ycheatfeat), axis=1)

# introduce_cheat_feature(np.random.rand(18, 3), np.random.randint(0, 3, 18), 1.0)

#### Create train and test datasets.

In [405]:
# Alternative data source (uniform noise, labels unrelated to the features):
# X, Y, xtest, ytest = create_ex_uniform_random()

# One Gaussian cluster per class, sized by the settings defined above.
X, Y, xtest, ytest = create_ex_multivariate_normal(
    nfeats=nfeats, ntrain=ntrain, ntest=ntest
)
print(f"X: {len(X)}, Y:{len(Y)}, x-test:{len(xtest)}, y-test:{len(ytest)}")

X: 30000, Y:30000, x-test:300, y-test:300


### Baseline
#### Train and test a linear model

In [406]:
# Train the baseline on the plain features only when no cached model is on disk.
# NOTE: an existing 'baseline.joblib' is loaded as-is; delete it to force a retrain.
if not os.path.exists('baseline.joblib'):
    clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3).fit(X, Y)
    dump(clf, 'baseline.joblib')
else:
    clf = load('baseline.joblib')

# Score the baseline on the test set; the cell displays the resulting accuracy.
ypred = clf.predict(xtest)
accuracy(ypred, ytest)

0.34

### Cheating
#### Train a cheater model, which uses labels as features

In [407]:
# Training features plus the true label as an extra column (no randomization).
cheat_train = introduce_cheat_feature(X, Y, randomize=0.)

# Train the cheater only when no cached model is on disk.
# NOTE: an existing 'cheater.joblib' is loaded as-is; delete it to force a retrain.
if not os.path.exists('cheater.joblib'):
    cheater = linear_model.SGDClassifier(max_iter=1000, tol=1e-3).fit(cheat_train, Y)
    dump(cheater, 'cheater.joblib')
else:
    cheater = load('cheater.joblib')

#### Evaluate test performance at various levels of randomization of the cheat feature.

In [408]:
# Corrupt a growing share of the test-time cheat feature and re-score the cheater each time.
for r in (0., 0.2, 0.4, 0.6, 0.8, 0.9, 1.):
    cheat_test = introduce_cheat_feature(xtest, ytest, randomize=r)
    cheat_prediction = cheater.predict(cheat_test)

    print(f"At {r} randomization, acc={accuracy(cheat_prediction, ytest):.4}")

At 0.0 randomization, acc=0.99
At 0.2 randomization, acc=0.8567
At 0.4 randomization, acc=0.74
At 0.6 randomization, acc=0.65
At 0.8 randomization, acc=0.5133
At 0.9 randomization, acc=0.4233
At 1.0 randomization, acc=0.3367


### Cheating some of the time
#### This model cheats some of the time; for 50% of the training examples the cheat feature is identical to the true class, and for the other 50% it is a random class

In [411]:
# Training features plus a cheat column in which half of the rows carry a random class.
partial_cheat_train = introduce_cheat_feature(X, Y, randomize=.5)

# Train the partial cheater only when no cached model is on disk.
# NOTE: an existing 'partial_cheater.joblib' is loaded as-is; delete it to force a retrain.
if not os.path.exists('partial_cheater.joblib'):
    partial_cheater = linear_model.SGDClassifier(max_iter=1000, tol=1e-3).fit(partial_cheat_train, Y)
    dump(partial_cheater, 'partial_cheater.joblib')
else:
    partial_cheater = load('partial_cheater.joblib')

#### Evaluate test performance at various levels of randomization of the cheat feature.

In [412]:
# Same sweep as for the full cheater, now scoring the partially cheating model.
for r in (0., 0.2, 0.4, 0.6, 0.8, 0.9, 1.):
    partial_cheat_test = introduce_cheat_feature(xtest, ytest, randomize=r)
    partial_cheat_pred = partial_cheater.predict(partial_cheat_test)

    print(f"At {r} randomization, acc={accuracy(partial_cheat_pred, ytest):.4}")

At 0.0 randomization, acc=0.61
At 0.2 randomization, acc=0.5433
At 0.4 randomization, acc=0.5067
At 0.6 randomization, acc=0.4933
At 0.8 randomization, acc=0.4467
At 0.9 randomization, acc=0.3967
At 1.0 randomization, acc=0.36
