In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
src_breast_cancer = 'breast_cancer/wdbc.csv'
src_ionosphere = 'ionosphere/ionosphere.csv'

bc_data = pd.read_csv(src_breast_cancer, delimiter=',')
io_data = pd.read_csv(src_ionosphere, delimiter=',')

In [2]:
# Get data as np array and split bc_classes/bc_features
bc_classes = bc_data[bc_data.columns[1]].values
bc_features = bc_data[bc_data.columns[2:]].values
print(bc_classes.shape, bc_features.shape)
io_classes = io_data[io_data.columns[-1]].values
io_features = io_data[io_data.columns[:-1]].values
print(io_classes.shape, io_features.shape)

((568,), (568, 30))
((350,), (350, 34))


In [3]:
# Process bc_features into 0 and 1 class
bc_classes[bc_classes == 'M'] = 1
bc_classes[bc_classes == 'B'] = -1
print('Number of maligne: ', np.count_nonzero(bc_classes == 1))
print('Number of benigne: ', np.count_nonzero(bc_classes == 0))
bc_classes = bc_classes.astype(np.int8)
# Process io_features into 0 and 1 class
io_classes[io_classes == 'b'] = 1
io_classes[io_classes == 'g'] = -1
print('Number of bad: ', np.count_nonzero(io_classes == 1))
print('Number of good: ', np.count_nonzero(io_classes == 0))
io_classes = io_classes.astype(np.int8)

('Number of maligne: ', 211)
('Number of benigne: ', 0)
('Number of bad: ', 126)
('Number of good: ', 0)


In [4]:
# 0-center data
bc_features -= np.mean(bc_features, axis=0)
io_features -= np.mean(io_features, axis=0)
# 1-center std
bc_features /= np.std(bc_features, axis=0)
io_features = np.divide(io_features, np.std(io_features, axis=0), where=np.std(io_features, axis=0) != 0.)

In [5]:
print(bc_classes.shape, bc_classes.dtype)
print(bc_features.shape)
print(io_classes.shape, io_classes.dtype)
print(io_features.shape)

((568,), dtype('int8'))
(568, 30)
((350,), dtype('int8'))
(350, 34)


In [6]:
# Loads cross validation framework
from sklearn.model_selection import cross_val_score, ShuffleSplit, train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn import cross_validation
import seaborn as sn
bc_xtrain, bc_xtest, bc_ytrain, bc_ytest = train_test_split(bc_features, bc_classes, test_size=.2)
io_xtrain, io_xtest, io_ytrain, io_ytest = train_test_split(io_features, io_classes, test_size=.2)



In [7]:
import numpy as np 

def logistic_loss(features, label, x, l):
    """ Computes the logistic loss for a Labeled point"""
    return np.log(1.+np.exp(-label*np.dot(features, x)))+l*np.linalg.norm(x)

In [8]:
def logistic_grad(features, label, x, l):
    """ Computes the logistic gradient for a Labeled point"""
    return (-label*features*np.exp(-label*np.dot(features, x)))/(1.+np.exp(-label*np.dot(features, x)))+2.*l*x

In [9]:
def hinge_loss(features, label, x, l):
    """ Computes the logistic loss for a Labeled point"""
    return max(0., 1.-label*np.dot(features, x))+l*np.linalg.norm(x)

In [10]:
def hinge_grad(features, label, x, l):
    """ Computes the logistic loss for a Labeled point"""
    if 1.-label*np.dot(features, x) < 0.:
        return 0.
    else:
        return -label*features+2.*l*x

In [11]:
def hinge_square_loss(features, label, x, l):
    """ Computes the logistic loss for a Labeled point"""
    return max(0., 1.-label*np.dot(features, x))**2+l*np.linalg.norm(x)

In [12]:
def hinge_square_grad(features, label, x, l):
    """ Computes the logistic loss for a Labeled point"""
    if 1.-label*np.dot(features, x) < 0.:
        return 0.
    else:
        return -2.*label*features*(1.-label*np.dot(features, x))+2.*l*x

In [13]:
def template(x_init, features, labels, loss_function, gradient_loss_function, n_epochs, lamb, learning_rate=1e-3):
    x = x_init
    n_samples = len(features)
    n_print = n_epochs // 10
    for epoch in range(n_epochs):
        ############ A REMPLIR
        grad = XXXXX
        
        x -= learning_rate * grad
        
        # Compute loss of whole dataset
        if epoch % n_print == 0:
            loss = np.mean([loss_function(f, l, x, lamb) for f, l in zip(features, labels)])
            print('Epoch ', epoch+1, ' Loss: ', loss)

In [20]:
def preds(x, tefeatures):
    p = 1. / (1. + np.exp(-np.dot(tefeatures, x)))
    p[p < .5] = -1
    p[p >= .5] = 1
    return p

In [21]:
def accuracy(real, preds):
    return np.sum(real == preds) / float(real.shape[0])

In [22]:
import time

def ggwp(algo_descent, n_epochs, reg, learning_rate=1e-3):
    for loss_type in ((logistic_loss, logistic_grad), (hinge_loss, hinge_grad), (hinge_square_loss, hinge_square_grad)):
        for i, dataset in enumerate(((bc_xtrain, bc_ytrain, bc_xtest, bc_ytest), (io_xtrain, io_ytrain, io_xtest, io_ytest))):
            loss = loss_type[0]
            grad = loss_type[1]
            trfeatures, trlabels = dataset[0], dataset[1]
            tefeatures, telabels = dataset[2], dataset[3]
            x_init = np.zeros(trfeatures[0].shape[0])
            print('XXXXXXXXXXXXXXXXXXXXXXX\nDataset', i, 'Loss type', loss.__name__)
            start = time.time()
            x = algo_descent(x_init, trfeatures, trlabels, loss, grad, n_epochs, reg)
            print('Time elapsed', time.time() - start)
            print('Accuracy', accuracy(telabels, preds(x, tefeatures)))

In [30]:
# Batch gradient descent
def batch_gd(x_init, features, labels, loss_function, gradient_loss_function, n_epochs, lamb, learning_rate=1e-3):
    x = x_init
    n_samples = len(features)
    n_print = 101010101010101010
    for epoch in range(n_epochs):
        ############ A REMPLIR
        grad = np.mean([gradient_loss_function(f, lab, x, lamb) for f, lab in zip(features, labels)], axis=0)
        x -= learning_rate * grad
        # Compute loss of whole dataset
        if epoch+1 % n_print == 0:
            loss = np.mean([loss_function(f, l, x, lamb) for f, l in zip(features, labels)])
            print('Epoch ', epoch+1, ' Loss: ', loss)
    
    return x

In [31]:
ggwp(batch_gd, 100, 1e-3, .1)

('XXXXXXXXXXXXXXXXXXXXXXX\nDataset', 0, 'Loss type', 'logistic_loss')
('Time elapsed', 1.295194149017334)
('Accuracy', 0.93859649122807021)
('XXXXXXXXXXXXXXXXXXXXXXX\nDataset', 1, 'Loss type', 'logistic_loss')
('Time elapsed', 0.8129870891571045)
('Accuracy', 0.8571428571428571)
('XXXXXXXXXXXXXXXXXXXXXXX\nDataset', 0, 'Loss type', 'hinge_loss')
('Time elapsed', 0.6045379638671875)
('Accuracy', 0.94736842105263153)
('XXXXXXXXXXXXXXXXXXXXXXX\nDataset', 1, 'Loss type', 'hinge_loss')
('Time elapsed', 0.377208948135376)
('Accuracy', 0.8571428571428571)
('XXXXXXXXXXXXXXXXXXXXXXX\nDataset', 0, 'Loss type', 'hinge_square_loss')
('Time elapsed', 0.971343994140625)
('Accuracy', 0.92982456140350878)
('XXXXXXXXXXXXXXXXXXXXXXX\nDataset', 1, 'Loss type', 'hinge_square_loss')
('Time elapsed', 0.6276111602783203)
('Accuracy', 0.84285714285714286)


In [29]:
print bc_ytrain

[-1 -1  1  1  1  1 -1 -1  1 -1 -1 -1  1 -1  1 -1 -1  1 -1 -1  1  1  1 -1 -1
 -1 -1  1 -1 -1 -1 -1 -1  1 -1  1  1 -1  1 -1 -1 -1 -1 -1 -1  1  1  1  1  1
 -1  1 -1 -1  1 -1 -1 -1 -1 -1  1  1  1 -1 -1 -1  1 -1 -1 -1  1  1  1 -1  1
 -1  1 -1  1 -1  1  1 -1  1 -1 -1  1 -1  1 -1  1  1 -1 -1  1 -1 -1 -1  1  1
 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1  1  1 -1 -1 -1 -1  1 -1 -1  1 -1  1 -1
  1  1 -1 -1 -1  1  1 -1 -1 -1  1 -1  1  1 -1 -1  1  1  1 -1 -1 -1  1 -1 -1
 -1 -1 -1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1  1 -1 -1  1  1  1  1 -1 -1 -1 -1
 -1 -1  1 -1 -1 -1  1  1 -1 -1 -1 -1 -1  1 -1 -1 -1  1 -1  1 -1 -1 -1 -1 -1
  1  1 -1 -1 -1  1 -1 -1 -1 -1  1 -1 -1  1 -1  1 -1 -1 -1  1 -1 -1 -1 -1 -1
 -1  1 -1  1  1  1  1 -1  1 -1  1  1  1 -1 -1  1 -1  1 -1  1  1 -1  1 -1  1
 -1 -1 -1  1 -1  1  1  1  1 -1 -1 -1  1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1
  1 -1 -1  1 -1  1 -1 -1 -1  1 -1 -1 -1  1  1  1  1  1 -1  1 -1  1 -1 -1 -1
  1 -1 -1 -1  1 -1 -1  1  1 -1  1 -1  1 -1 -1  1 -1 -1  1  1 -1  1 -1 -1 -1
 -1  1 -1  1