In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
src_breast_cancer = 'breast_cancer/wdbc.csv'
src_ionosphere = 'ionosphere/ionosphere.csv'

bc_data = pd.read_csv(src_breast_cancer, delimiter=',')
io_data = pd.read_csv(src_ionosphere, delimiter=',')

In [5]:
# Get data as np array and split bc_classes/bc_features
bc_classes = bc_data[bc_data.columns[1]].values
bc_features = bc_data[bc_data.columns[2:]].values
print(bc_classes.shape, bc_features.shape)
io_classes = io_data[io_data.columns[-1]].values
io_features = io_data[io_data.columns[:-1]].values
print(io_classes.shape, io_features.shape)

(568,) (568, 30)
(350,) (350, 34)


In [6]:
# Process bc_features into 0 and 1 class
bc_classes[bc_classes == 'M'] = 1
bc_classes[bc_classes == 'B'] = -1
print('Number of maligne: ', np.count_nonzero(bc_classes == 1))
print('Number of benigne: ', np.count_nonzero(bc_classes == 0))
bc_classes = bc_classes.astype(np.int8)
# Process io_features into 0 and 1 class
io_classes[io_classes == 'b'] = 1
io_classes[io_classes == 'g'] = -1
print('Number of bad: ', np.count_nonzero(io_classes == 1))
print('Number of good: ', np.count_nonzero(io_classes == 0))
io_classes = io_classes.astype(np.int8)

Number of maligne:  211
Number of benigne:  0
Number of bad:  126
Number of good:  0


In [7]:
# 0-center data
bc_features -= np.mean(bc_features, axis=0)
io_features -= np.mean(io_features, axis=0)
# 1-center std
bc_features /= np.std(bc_features, axis=0)
io_features = np.divide(io_features, np.std(io_features, axis=0), where=np.std(io_features, axis=0) != 0.)

In [8]:
print(bc_classes.shape, bc_classes.dtype)
print(bc_features.shape)
print(io_classes.shape, io_classes.dtype)
print(io_features.shape)

(568,) int8
(568, 30)
(350,) int8
(350, 34)


In [9]:
# Loads cross validation framework
from sklearn.model_selection import cross_val_score, ShuffleSplit, train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn import cross_validation
import seaborn as sn
bc_xtrain, bc_xtest, bc_ytrain, bc_ytest = train_test_split(bc_features, bc_classes, test_size=.2)
io_xtrain, io_xtest, io_ytrain, io_ytest = train_test_split(io_features, io_classes, test_size=.2)



In [10]:
import numpy as np 

def logistic_loss(features, label, x, l):
    """ Computes the logistic loss for a Labeled point"""
    return np.log(1.+np.exp(-label*np.dot(features, x)))+l*np.linalg.norm(x)

In [11]:
def logistic_grad(features, label, x, l):
    """ Computes the logistic gradient for a Labeled point"""
    return (-label*features*np.exp(-label*np.dot(features, x)))/(1.+np.exp(-label*np.dot(features, x)))+2.*l*x

In [12]:
def hinge_loss(features, label, x, l):
    """ Computes the logistic loss for a Labeled point"""
    return max(0., 1.-label*np.dot(features, x))+l*np.linalg.norm(x)

In [13]:
def hinge_grad(features, label, x, l):
    """ Computes the logistic loss for a Labeled point"""
    if 1.-label*np.dot(features, x) < 0.:
        return 0.
    else:
        return -label*features+2.*l*x

In [14]:
def hinge_square_loss(features, label, x, l):
    """ Computes the logistic loss for a Labeled point"""
    return max(0., 1.-label*np.dot(features, x))**2+l*np.linalg.norm(x)

In [15]:
def hinge_square_grad(features, label, x, l):
    """ Computes the logistic loss for a Labeled point"""
    if 1.-label*np.dot(features, x) < 0.:
        return 0.
    else:
        return -2.*label*features*(1.-label*np.dot(features, x))+2.*l*x

In [16]:
def template(x_init, features, labels, loss_function, gradient_loss_function, n_epochs, lamb, learning_rate=1e-3):
    x = x_init
    n_samples = len(features)
    n_print = n_epochs // 10
    for epoch in range(n_epochs):
        ############ A REMPLIR
        grad = XXXXX
        
        x -= learning_rate * grad
        
        # Compute loss of whole dataset
        if epoch % n_print == 0:
            loss = np.mean([loss_function(f, l, x, lamb) for f, l in zip(features, labels)])
            print('Epoch ', epoch+1, ' Loss: ', loss)

In [17]:
def preds(x, tefeatures):
    p = 1. / (1. + np.exp(-np.dot(tefeatures, x)))
    p[p < .5] = -1
    p[p >= .5] = 1
    return p

In [18]:
def accuracy(real, preds):
    return np.sum(real == preds) / float(real.shape[0])

In [50]:
import time

def ggwp(algo_descent, n_epochs, reg, momentum=None, learning_rate=1e-3):
    for loss_type in ((logistic_loss, logistic_grad), (hinge_loss, hinge_grad), (hinge_square_loss, hinge_square_grad)):
        for i, dataset in enumerate(((bc_xtrain, bc_ytrain, bc_xtest, bc_ytest), (io_xtrain, io_ytrain, io_xtest, io_ytest))):
            loss = loss_type[0]
            grad = loss_type[1]
            trfeatures, trlabels = dataset[0], dataset[1]
            tefeatures, telabels = dataset[2], dataset[3]
            x_init = np.zeros(trfeatures[0].shape[0])
            print('XXXXXXXXXXXXXXXXXXXXXXX\nDataset', i, 'Loss type', loss.__name__)
            start = time.time()
            x = algo_descent(x_init, trfeatures, trlabels, loss, grad, n_epochs, reg, momentum)
            print('Time elapsed', time.time() - start)
            print('Accuracy', accuracy(telabels, preds(x, tefeatures)))

In [59]:
# Batch gradient descent
def batch_gd(x_init, features, labels, loss_function, gradient_loss_function, n_epochs, lamb, momentum=None, learning_rate=1e-3):
    x = x_init
    n_samples = len(features)
    n_print = 10
    for epoch in range(n_epochs):
        ############ A REMPLIR
        grad = np.mean([gradient_loss_function(f, lab, x, lamb) for f, lab in zip(features, labels)], axis=0)
        if momentum:
            x = x*momentum - learning_rate*grad
        else:
            x -= learning_rate * grad
        # Compute loss of whole dataset
        if epoch+1 % n_print == 0:
            loss = np.mean([loss_function(f, l, x, lamb) for f, l in zip(features, labels)])
            print('Epoch ', epoch+1, ' Loss: ', loss)
    
    return x

In [63]:
# Batch gradient descent
def sgd(x_init, features, labels, loss_function, gradient_loss_function, n_epochs, lamb, momentum=None,learning_rate=1e-3):
    x = x_init
    n_samples = len(features)
    n_print = 10
    for epoch in range(n_epochs):
        for _ in range(n_samples):
            i = np.random.randint(n_samples)
            grad = gradient_loss_function(features[i], labels[i], x, lamb)
            if momentum:
                x = x*momentum - learning_rate*grad
            else:
                x -= learning_rate * grad
        # Compute loss of whole dataset
        if epoch+1 % n_print == 0:
            loss = np.mean([loss_function(f, l, x, lamb) for f, l in zip(features, labels)])
            print('Epoch ', epoch+1, ' Loss: ', loss)
    
    return x

In [64]:
ggwp(batch_gd, 100, 1e-3)

XXXXXXXXXXXXXXXXXXXXXXX
Dataset 0 Loss type logistic_loss
Time elapsed 1.8253300189971924
Accuracy 0.929824561404
XXXXXXXXXXXXXXXXXXXXXXX
Dataset 1 Loss type logistic_loss
Time elapsed 1.096635103225708
Accuracy 0.714285714286
XXXXXXXXXXXXXXXXXXXXXXX
Dataset 0 Loss type hinge_loss
Time elapsed 0.9767379760742188
Accuracy 0.921052631579
XXXXXXXXXXXXXXXXXXXXXXX
Dataset 1 Loss type hinge_loss
Time elapsed 0.6361169815063477
Accuracy 0.7
XXXXXXXXXXXXXXXXXXXXXXX
Dataset 0 Loss type hinge_square_loss
Time elapsed 1.651313066482544
Accuracy 0.956140350877
XXXXXXXXXXXXXXXXXXXXXXX
Dataset 1 Loss type hinge_square_loss
Time elapsed 1.0646250247955322
Accuracy 0.842857142857


In [66]:
ggwp(sgd, 100, 1e-3, 0.9)

XXXXXXXXXXXXXXXXXXXXXXX
Dataset 0 Loss type logistic_loss
Time elapsed 2.532710075378418
Accuracy 0.912280701754
XXXXXXXXXXXXXXXXXXXXXXX
Dataset 1 Loss type logistic_loss
Time elapsed 1.6538920402526855
Accuracy 0.685714285714
XXXXXXXXXXXXXXXXXXXXXXX
Dataset 0 Loss type hinge_loss
Time elapsed 1.4495658874511719
Accuracy 0.912280701754
XXXXXXXXXXXXXXXXXXXXXXX
Dataset 1 Loss type hinge_loss
Time elapsed 1.0006318092346191
Accuracy 0.842857142857
XXXXXXXXXXXXXXXXXXXXXXX
Dataset 0 Loss type hinge_square_loss
Time elapsed 2.366858959197998
Accuracy 0.964912280702
XXXXXXXXXXXXXXXXXXXXXXX
Dataset 1 Loss type hinge_square_loss
Time elapsed 1.474794864654541
Accuracy 0.7
