In [32]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import PolynomialFeatures

data=pd.read_csv('C:/Users/Virag/Desktop/Machine Learning/practice6/data/heart_disease.csv')
labels=data.values[:,-1]
labels[labels>1]=1
labels=labels.astype(int)

data=data.values[:,:-1]

In [33]:
data=PolynomialFeatures(2).fit_transform(data)
print(data.shape)

(303, 105)


In [34]:
#Standardize data (substract mean divide with std)
data=(data-np.mean(data))/np.std(data)

In [35]:
def train_test_split(data,labels,test_ratio=0.2):
    idxs=np.arange(data.shape[0])
    np.random.shuffle(idxs)
    test_idxs=idxs[:round(len(idxs)*test_ratio)]
    train_idxs=idxs[round(len(idxs)*test_ratio):]
    return data[train_idxs],labels[train_idxs],data[test_idxs],labels[test_idxs]
    

In [5]:
def visualize(data, labels,predictor):
    import matplotlib.pyplot as plt
    min1, max1 = data[:, 0].min()-data[:, 0].min()*0.1, data[:, 0].max()+data[:, 0].max()*0.1
    min2, max2 = data[:, 1].min()-data[:, 1].min()*0.1, data[:, 1].max()+data[:, 1].max()*0.1
    # define the x and y scale
    x1grid = np.arange(min1, max1, np.abs(max1-min1)*0.001)
    x2grid = np.arange(min2, max2, np.abs(max2-min2)*0.001)
    # create all of the lines and rows of the grid
    xx, yy = np.meshgrid(x1grid, x2grid)
    # flatten each grid to a vector
    r1, r2 = xx.flatten(), yy.flatten()
    r1, r2 = r1.reshape((len(r1), 1)), r2.reshape((len(r2), 1))
    # horizontal stack vectors to create x1,x2 input for the model
    grid = np.hstack((r1,r2))
    # make predictions for the grid
    yhat = predictor.predict(grid)
    # reshape the predictions back into a grid
    zz = yhat.reshape(xx.shape)
    # plot the grid of x, y and z values as a surface
    plt.contourf(xx, yy, zz, cmap='Paired')
    # create scatter plot for samples from each class
    for class_value in np.unique(labels):
        # get row indexes for samples with this class
        row_ix = np.where(labels == class_value)
        # create scatter of these samples
        plt.scatter(data[row_ix, 0], data[row_ix, 1], cmap='Paired')
    plt.tight_layout()


In [36]:
class LogisticRegression():
    def __init__(self):
        self.w = None
    def fit(self,data,labels,test_data,test_labels,max_iterations=500,lamb = 0.1):
        X=data
        Y=labels
        step_size=0.05
        batch_size=32
        N = X.shape[0]
        Y = Y.squeeze()
        assert Y.shape == (N,), (Y.shape, N)

        def gen_batches():
            inds = np.arange(N)
            np.random.shuffle(inds)
            if batch_size is None:
                yield inds
            else:
                for i in range(0, N, batch_size):
                    yield inds[i:i + batch_size]

        # Initialise w
        w = np.random.randn(X.shape[1])
        for it in range(max_iterations):
            # Train on the permuted dataset
            avg_error = 0
            for batch_inds in gen_batches():
                x = X[batch_inds, ...]
                y = Y[batch_inds]
                y_hat = self.predict_proba(x, w)
                e_in_hat = self.binary_cross_entropy(y,y_hat)
                gradient_hat = self.error_gradient(x, y,y_hat)
                avg_error += e_in_hat
                # Update
                w -= step_size * (gradient_hat + lamb) #2w in L2 and 1 in L1
            avg_error /= N
            Y_hat = (self.predict_proba(X, w) > 0.5).astype(int)
            accuracy = self.accuracy(Y,Y_hat)
            test_Y_hat= (self.predict_proba(test_data, w) > 0.5).astype(int)
            test_accuracy = self.accuracy(test_labels,test_Y_hat)
            #print(f'Iteration #{it}: average error: {avg_error:0.2f}  '
            #      f'train accuracy: {accuracy:0.02f}'
            #     f'test accuracy: {test_accuracy:0.02f}')
        self.w=w
        #print(self.w)


    def sigmoid(self,data):
        # return 1 / (1 + np.exp(-h))
        # Numerically stable version, see:
        # https://timvieira.github.io/blog/post/2014/02/11/exp-normalize-trick/
        h=data
        mask = h >= 0
        result = np.zeros_like(h)
        z = np.exp(-h[mask])
        result[mask] = 1 / (1+z)
        z = np.exp(h[~mask])
        result[~mask] = z / (1+z)
        return result

    def binary_cross_entropy(self,true,prediction):
        return np.sum(-np.log(true*prediction + (1-true) * (1-prediction) + 1e-6))

    def error_gradient(self,data,true,prediction):
        N = data.shape[0]
        assert prediction.shape == (N,), prediction.shape
        assert true.shape == (N,), true.shape
        assert np.all((true == 0) | (true == 1))
        t = prediction - true
        t.shape = (N, 1)
        elementwise_gradient = t * data
        return np.mean(elementwise_gradient, axis=0)

    def predict_proba(self,data,w=[]):
        if len(w)>0:pass
        else: w=self.w
        return self.sigmoid(data @ w)
    def predict(self,data):
        return np.rint(self.sigmoid(data @ self.w))
    def accuracy(self,true,prediction):
        return np.mean(true == prediction)
 

In [31]:
#with L1 regularization:
train_data,train_labels,test_data,test_labels = train_test_split(data,labels)
for i in [0.000001, 0.0001, 0.1]:
    lr=LogisticRegression()
    #print("lambda: " + str(i))
    lr.fit(train_data,train_labels,test_data,test_labels,lamb=i)
    prediction_in=lr.predict(train_data)
    prediction_out=lr.predict(test_data)
    print(lr.accuracy(train_labels,prediction_in))
    print(lr.accuracy(test_labels,prediction_out))

0.6735537190082644
0.6229508196721312
0.7107438016528925
0.7377049180327869
0.6074380165289256
0.6229508196721312


In [37]:
#With L2 regularization:
train_data,train_labels,test_data,test_labels = train_test_split(data,labels)
for i in [0.000001, 0.0001, 0.1]:
    lr=LogisticRegression()
    #print("lambda: " + str(i))
    lr.fit(train_data,train_labels,test_data,test_labels,lamb=i)
    prediction_in=lr.predict(train_data)
    prediction_out=lr.predict(test_data)
    print(lr.accuracy(train_labels,prediction_in))
    print(lr.accuracy(test_labels,prediction_out))

0.743801652892562
0.6885245901639344
0.7355371900826446
0.6721311475409836
0.6487603305785123
0.6065573770491803


The accuracy with L1 regularization was better than with L2 regularization.