In [1]:
import numpy as np
import matplotlib.pyplot as plt

# For the confusion matrix
from sklearn.metrics import confusion_matrix
import itertools

#  A class will be created containing the regressor and all of its methods.


In [2]:
# 1 The logistic regressor class and its methods
class LogisticReg:
    '''
        Logitic Regression, by Wladi Arce.
        
        This class creates an instance of a logistic regressor, which allows to classify data
        into 2 categories, 0 and 1.
    '''
    def __init__(self):
        self.theta = 0.5 * np.random.randn(x_train_scaled.shape[1], 1) #Initializes randomly the weights
        
    def sigmoid(self, x):
        '''
            Performs logistic regression given the matrix of parameters
        '''
        probability = 1/(1 + np.exp(-np.matmul(x, self.theta)))
        return probability
    
    def compute_cost(self, x, y):
        '''
            Cost function
        '''
        cost = np.mean(-y*np.log(self.sigmoid(x)) - (1-y)*np.log(1-self.sigmoid(x)))
        return cost
    
    def compute_gradient(self, x, y):
        '''
            Calculates the gradient given the expected output and the parameters
        '''
        gradient = np.mean(np.matmul(x.T,self.sigmoid(x)-y))
        return gradient

    def fit(self, x, y, learning_rate = 0.01, batch_size = 10, epochs = 50):
        '''
            Fits the regressor to the data using minibatch gradient descent as follows:
            
            x
            y
            learning_rate (default 0.01)
            batch_size (default 10)
            epochs (default 50)
            
            compute number of batches
            for each epoch:
                shuffle dataset
                for i in number_of_batches:
                    x_batch = select [batch_size] of the features dataset
                    y_batch = select [batch_size] of the output dataset
                    compute gradient with x_batch, y_batch
                    apply gradient descent to update the parameters                          
        '''
        num_samples = x.shape[0]
        N_iterations = int(num_samples / batch_size) * epochs
        start = 0
        end = 0
        
        for step in range(N_iterations):
            # if new epoch, shuffle the data
            if step % (num_samples / batch_size) == 0:
                indexes = np.random.permutation(x.shape[0])
                y = y[indexes]
                x = x[indexes]
            
            # create a mini-batch of data to train on
            end = start + batch_size
            if end >= num_samples:
                end = num_samples
            x_batch = x[start:end, :]
            y_batch = y[start:end]
            start = 0 if end >= num_samples else end

            # update parameters using a x_step and y_step
            self.theta= self.theta - learning_rate * self.compute_gradient(x_batch, y_batch)
        training_cost = self.compute_cost(x[:,:], y)
        print('training cost: %f' %training_cost)
        
    def predict(self, x):
        '''
            Is basically the sigmoid function, but will convert the probabilities into binary values
        '''
        return self.sigmo

In [3]:
# 2 The confusion matrix and the accuracy calculator

def compute_accuracy(y_real, y_pred):
    '''
        Checks how many values are equal between the real data and the predicted data
    '''
    correct = y_real == y_pred
    return np.sum(correct)/correct.shape[0]

# This function has been taken from sklearn documentation
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [4]:
from sklearn.datasets import load_breast_cancer

# load the dataset
data = load_breast_cancer()
x = data.data
y = data.target

# split into training and test sets
N_train = int(0.8 * x.shape[0])

x_train = x[:N_train,:]
y_train = np.reshape(y[:N_train], (-1,1))
x_test = x[N_train:,:]
y_test = np.reshape(y[N_train:], (-1,1))

# scale features by removing mean and dividing by the standard deviation
x_train_scaled = (x_train - np.average(x_train, 0))/np.std(x_train)
x_test_scaled = (x_test - np.average(x_test, 0))/np.std(x_test)

# Add intercept terms and initialize parameters
x_train_scaled = np.hstack((np.ones((x_train_scaled.shape[0], 1)), x_train_scaled))
x_test_scaled = np.hstack((np.ones((x_test_scaled.shape[0], 1)), x_test_scaled))

In [5]:
# An instance of the Regression model will be created, and the fit method run with the train set

classifier = LogisticReg()
classifier.fit(x_train_scaled, y_train)

training cost: 0.253194


In [None]:
# Having the model fit, values for the test set will be predicted, and 
# then compared against the real output. The accuracy will be computed
# and the confusion matrix will show in a very visual way its performance

# PREDICTION
y_pred = classifier.predict(x_test_scaled)

# TEST SET COST
test_cost = classifier.compute_cost(x_test_scaled,y_test)
print('Test cost: ',test_cost)

# ACCURACY
print("Accuracy on test set: {:.2f}".format(compute_accuracy(y_test,y_pred)))

# CONFUSION MATRIX
labels = ['Benign','Malignant']
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=labels, title='Confusion matrix')
plt.show()