In [10]:
import numpy as np
import scipy as sp
from sklearn.datasets import fetch_mldata
import math
from collections import Counter
from scipy.cluster.vq import vq, kmeans, whiten
from skll.metrics import kappa

In [2]:
dataset = fetch_mldata('banana-ida')

In [3]:
def laplace(stddev, size):
    return np.random.laplace(0, stddev, size)

def laplace(stddev):
    return np.random.laplace(0, stddev)

def noisy_count(data, epsilon):
    return len(data) + laplace(1. / epsilon)

def noisy_sum(data, epsilon):
    clipped = np.clip(data, -1, 1)
    return np.sum(clipped) + laplace(1. / epsilon)

def noisy_average(data, epsilon):
    clipped = np.clip(data, -1, 1)
    tally = np.sum(clipped)
    count = len(clipped)
    
    if count == 0:
        return np.random.uniform(-1, 1)
    
    candidate = (tally + laplace(2. / epsilon)) / count
    while candidate < -1.0 or candidate > 1.0:
        candidate = (tally + laplace(2. / epsilon)) / count
    
    return candidate

In [4]:
def gen_data(dimensions, length):
    return np.random.uniform(0, 1, dimensions*length).reshape(length, dimensions)

def gen_datapoint(dimensions):
    return np.random.uniform(0, 1, dimensions)

In [11]:
def perceptron_step(x, y, normal, epsilon):
    errors = np.array([xi*yi for xi, yi in zip(x, y) if (yi*np.sum(xi*normal)) < 0]).reshape((-1, normal.shape[0]))
    newnormal = np.zeros(normal.shape)
    for i in range(len(normal)):
        newnormal[i] = normal[i] + noisy_average(errors[:, i], epsilon)
    return newnormal

def svm_step(x, y, normal, epsilon):
    errors = np.array([xi*yi for xi, yi in zip(x, y) if (yi*np.sum(xi*normal)) < 1]).reshape((-1, normal.shape[0]))
    errors = np.vstack((errors, np.array(10*[-normal])))
    newnormal = np.zeros(normal.shape)
    for i in range(len(normal)):
        newnormal[i] = normal[i] + noisy_average(errors[:, i], epsilon)
    return newnormal

def logistic_step(x, y, normal, epsilon):
    errors = np.array([xi*((yi+1)/2. - 1./(1+np.exp(np.sum(xi*normal)))) for xi, yi in zip(x, y)]).reshape((-1, normal.shape[0]))
    newnormal = np.zeros(normal.shape)
    for i in range(len(normal)):
        newnormal[i] = normal[i] + noisy_average(errors[:, i], epsilon)
    return newnormal

def fit_binary(x, y, fn, epsilon, niter=20):
    if any(abs(yi) != 1 for yi in y):
        y[y == 0] = -1
    if any(abs(yi) != 1 for yi in y):
        raise ValueError('Unrecognized class label occured')
    normal = gen_datapoint(x.shape[1])
    for _ in range(niter):
        normal = fn(x, y, normal, epsilon)
    return normal

def eval_binary(x, y, normal):
    err = y*x.dot(normal)
    err[err > 0] = 0
    err[err < 0] = 1
    return 1 - np.average(err)

def fit_ova(x, y, fn, epsilon, niter=20):
    yset = sorted(list(set(y)))
    normal = gen_data(x.shape[1], len(yset))
    labels = np.ones((len(yset), x.shape[0]), dtype=np.int) * -1
    for idx, yi in enumerate(yset):
        labels[idx, np.where(y==yi)[0]] = 1
    for _ in range(niter):
        for idx, yi in enumerate(yset):
            normal[idx, :] = fn(x, labels[idx, :], normal[idx, :], epsilon)
    return normal

def eval_ova(x, y, normal):
    yset = sorted(list(set(y)))
    ydict = dict(zip(yset, range(len(yset))))
    ytr = [ydict[yi] for yi in y]
    err = ytr - np.argmax(x.dot(normal.T), axis=1)
    err[err != 0] = 1
    return 1 - np.average(err)

def eval_ova_kappa(x, y, normal):
    yset = sorted(list(set(y)))
    ydict = dict(zip(yset, range(len(yset))))
    ytr = [ydict[yi] for yi in y]
    return kappa(ytr, np.argmax(x.dot(normal.T), axis=1))

In [6]:
derp = dataset['target']
multinormal = fit_ova(dataset['data'], dataset['target'], svm_step, 0.1, niter=50)

In [7]:
eval_ova(dataset['data'], dataset['target'], multinormal)

0.58905660377358493

In [12]:
eval_ova_kappa(dataset['data'], dataset['target'], multinormal)

0.17811320754716986

In [16]:
from sklearn.cross_validation import train_test_split

dsets = ['iris', 'diabetes_scale', 'image-ida', 'diabetes-ida', 'breast-cancer-ida', 'ringnorm-ida', 'thyroid-ida', 'usps']
fns = [(logistic_step, 'log'), (svm_step, 'svm'), (perceptron_step, 'perc')]
for dset in dsets:
    dataset = fetch_mldata(dset)
    X_train, X_test, Y_train, Y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.3, random_state=42)
    for niter in [5, 10, 15, 25]:
        for eps in [0.001, 0.01, 0.1, 0.5, 1]:
            for fn in fns:
                result = 0
                for _ in range(10):
                    multinormal = fit_ova(X_train, Y_train, fn[0], eps, niter)
                    result += eval_ova_kappa(X_test, Y_test, multinormal)
                print dset+';'+fn[1]+';'+str(niter)+';'+str(eps)+';'+str(result / 10)

iris;log;5;0.001;-0.0396422662971
iris;svm;5;0.001;0.0754717677177
iris;perc;5;0.001;-0.0979398810332
iris;log;5;0.01;0.213893674051
iris;svm;5;0.01;0.264242136984
iris;perc;5;0.01;0.106781106987
iris;log;5;0.1;0.637629554258
iris;svm;5;0.1;0.63124484114
iris;perc;5;0.1;0.553741273653
iris;log;5;0.5;0.672351278858
iris;svm;5;0.5;0.734939325381
iris;perc;5;0.5;0.641704842027
iris;log;5;1;0.665325749903
iris;svm;5;1;0.76878342246
iris;perc;5;1;0.586751522075
iris;log;10;0.001;0.239417423925
iris;svm;10;0.001;0.114321261565
iris;perc;10;0.001;-0.127584607152
iris;log;10;0.01;0.210973857351
iris;svm;10;0.01;0.297195298444
iris;perc;10;0.01;0.21316195265
iris;log;10;0.1;0.65425516403
iris;svm;10;0.1;0.714500205677
iris;perc;10;0.1;0.613813482621
iris;log;10;0.5;0.632853148171
iris;svm;10;0.5;0.748868778281
iris;perc;10;0.5;0.710688777379
iris;log;10;1;0.667708107213
iris;svm;10;1;0.738687782805
iris;perc;10;1;0.71887197264
iris;log;15;0.001;0.0879567976405
iris;svm;15;0.001;0.00645349611289

In [None]:
import seaborn

In [583]:
dsets = ['iris', 'banana-ida','diabetes_scale', 'image-ida',
         'diabetes-ida', 'breast-cancer-ida', 'ringnorm-ida', 'thyroid-ida']
for dset in dsets:
    dataset = fetch_mldata(dset)
    Y = dataset['target']
    Yset = list(set(Y))
    cnt = Counter(Y)
    guess_ratio = cnt.most_common(1)[0][1] / float(len(Y))
    print dset +';'+str(guess_ratio)

iris;0.333333333333
banana-ida;0.551698113208
diabetes_scale;0.651041666667
image-ida;0.569511025887
diabetes-ida;0.651041666667
breast-cancer-ida;0.707224334601
ringnorm-ida;0.504864864865
thyroid-ida;0.697674418605


In [547]:
from sklearn import linear_model
from sklearn.metrics import accuracy_score
clf = linear_model.SGDClassifier()
clf.fit(dataset['data'], dataset['target'])

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [548]:
pred = clf.predict(dataset['data'])
Y = dataset['target']
mtx = [1 if y1==y2 else 0 for (y1, y2) in zip(pred, Y) ]

In [549]:
np.sum(mtx)/float(len(mtx))

0.4907547169811321