# Logistic Regression

- We will use the dataset that we had used for the second assignment for this one also. 
- The problem is to find a linear separating hyper-plane using logistic regression.
- Use tensorflow to implement the gradient descent procedure.

In [48]:
import csv
import numpy as np
from copy import deepcopy
import tensorflow as tf

#reads the csv file and returns a numpy array dataset of dimensions Nx(d+1). N is the number of random vectors and 
#d is the dimension. dataset has N rows each one being an observation of dimension d. the (d+1)th column 
# contains the labels.
def loadCsv(filename):
    lines = csv.reader(open(filename, "rt"))#read the file
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]] #converting to floating point numbers from integers
    dataset = np.asfarray(dataset)#list to numpy array conversion
    return dataset

# splits the entire set into training and test set. training set will contain splitratio x N number of observations
# randomly chosen from the set. the rest will go to the test set.
def splitDataset(dataset, splitratio):
    trainsize = int(np.round(dataset.shape[0]*splitratio))
    trainset = np.zeros((trainsize,dataset.shape[1]))#array to store the training set.
    testset = deepcopy(dataset)#create a copy of the dataset in test set.
    for numsamples in range(trainsize):
        indx = np.random.randint(0,testset.shape[0])#random index generation
        trainset[numsamples,:] = testset[indx,:]#adding the randomly selected data vector to the training set
        testset = np.delete(testset, indx, axis = 0)#delete the selected observation from the test set.
    return trainset,testset

dataset = loadCsv("pima-indians-diabetes.csv")
trainset, testset = splitDataset(dataset,0.7)

learning_rate = 0.03
training_epochs = 1500
batch_size = 200
display_step = 100

X_train = trainset[:,0:8]
#Y_train = np.concatenate((trainset[:,-1].reshape(1,X_train.shape[0]),1-trainset[:,-1].reshape(1,X_train.shape[0]))).T
Y_train = trainset[:,-1]
X_test = testset[:,0:8]
#Y_test = np.concatenate((testset[:,-1].reshape(1,X_test.shape[0]),1-testset[:,-1].reshape(1,X_test.shape[0]))).T
Y_test = testset[:,-1]

data = tf.placeholder(tf.float32, [None, 8])
target = tf.placeholder(tf.float32, [None, 1])
#W = tf.Variable(tf.random_normal([8, 1], mean=0.0, stddev=0.05))
W = tf.Variable(tf.random_normal(shape=[8, 1]))
b = tf.Variable(tf.random_normal(shape=[1]))
z = tf.matmul(data, W) + b

cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=z,labels=target))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

prediction = tf.round(tf.sigmoid(z))
correct = tf.cast(tf.equal(prediction, target), dtype=tf.float32)
accuracy = tf.reduce_mean(correct)

init = tf.global_variables_initializer()

# Start training
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        batch_index = np.random.choice(len(X_train), size=batch_size)
        batch_train_X = X_train[batch_index]
        batch_train_y = np.matrix(Y_train[batch_index]).T
        sess.run(optimizer, feed_dict={data: batch_train_X, target: batch_train_y})
        temp_loss = sess.run(cost, feed_dict={data: batch_train_X, target: batch_train_y})
        temp_train_acc = sess.run(accuracy, feed_dict={data: X_train, target: np.matrix(Y_train).T})
        temp_test_acc = sess.run(accuracy, feed_dict={data: X_test, target: np.matrix(Y_test).T})
        if (epoch + 1) % display_step == 0:
            print('epoch: {:4d} loss: {:5f} train_acc: {:5f} test_acc: {:5f}'.format(epoch + 1, temp_loss,temp_train_acc, temp_test_acc))


epoch:  100 loss: 107.252663 train_acc: 0.353160 test_acc: 0.352174
epoch:  200 loss: 25.133774 train_acc: 0.669145 test_acc: 0.682609
epoch:  300 loss: 150.254105 train_acc: 0.351301 test_acc: 0.347826
epoch:  400 loss: 108.644943 train_acc: 0.646840 test_acc: 0.665217
epoch:  500 loss: 27.511677 train_acc: 0.591078 test_acc: 0.526087
epoch:  600 loss: 145.663071 train_acc: 0.646840 test_acc: 0.660870
epoch:  700 loss: 114.564415 train_acc: 0.646840 test_acc: 0.660870
epoch:  800 loss: 111.378418 train_acc: 0.646840 test_acc: 0.660870
epoch:  900 loss: 120.644424 train_acc: 0.644981 test_acc: 0.660870
epoch: 1000 loss: 27.503599 train_acc: 0.555762 test_acc: 0.465217
epoch: 1100 loss: 26.613510 train_acc: 0.657993 test_acc: 0.669565
epoch: 1200 loss: 95.761894 train_acc: 0.360595 test_acc: 0.352174
epoch: 1300 loss: 24.098326 train_acc: 0.557621 test_acc: 0.478261
epoch: 1400 loss: 133.615540 train_acc: 0.646840 test_acc: 0.660870
epoch: 1500 loss: 118.746330 train_acc: 0.646840 test_

When there are more than two classes then we have to modify the basic logistic regression procedure to account for the different classes. In this case $Y \in \{0,1,2,\cdots K-1 \} $ when there are $K$ classes. Here we model the conditional probability of class label to be $k$ as : $P(y=k | x) = \frac{e^{w_k^Tx}}{1+e^{w_k^Tx}}$. This makes the system more complex than logistic regression as there are $k$ sets of parameters $\{w_1,w_2,  \cdots w_K \}$. We will have to frame 
the expression for negative log likelihood interms of these parameters:
\begin{equation}
  nll = -\sum_i^n \delta(y_i,k) log(P(y_i=k | x_i) = -\sum_i^n \delta(y_i,k) \frac{e^{w_k^Tx_i}}{1+e^{w_k^Tx_i}}
\end{equation}
where $\delta(y_i,k) = 1$ if $y_i=k$ and $0$ otherwise. This is called **Softmax Regression**

We perform gradient descent on nll with respect to both parameters.

In [1]:
import numpy as np
from mnist import MNIST

'''

download mnist data from "http://yann.lecun.com/exdb/mnist/" to a directory. Extract them and rename the files as :
train-images.idx3-ubyte:  training set images   -----> train-images-idx3-ubyte
train-labels.idx1-ubyte:  training set labels   -------> train-labels-idx1-ubyte
t10k-images.idx3-ubyte:   test set images    --------> t10k-images-idx3-ubyte
t10k-labels.idx1-ubyte: test image labels ----------> t10k-labels-idx1-ubyte

Each data sample is a vector of dimensions 784. The vectors are handwritten digit images of size 28x28 vectorized to 
784 dimensional vector. The labels are 0 to 9. 

First we will do a logistic regression with two classes or two digits. Select 5 and 1. They have very dissimilar shapes
hopefully more easily classifiable than, say 5 and 6. For that use the choose_digits functions which returns two sets of 
vectors corresponding to the two chosen digits.

Using this set data we will do logistic regression and get it tested using the testing data also provided.

'''
mndata = MNIST('data_directory')
img_tr,img_label = mndata.load_training()
img_test, img_test_label = mndata.load_testing()

img_tr = np.array(img_tr)
img_label = np.array(img_label)

img_test = np.array(img_test)
img_test_label = np.array(img_test_label)

def choose_digits(digit_1, digit_2, x_data, y_data):
    x_tr_1 = x_data[y_data==digit_1,:]
    x_tr_2 = x_data[y_data==digit_2,:]
    return x_tr_1, x_tr_2

def merge(X1,X2):
    X_merged = np.zeros((X1.shape[0]+X2.shape[0],X1.shape[1]))
    X_labels = np.zeros((X1.shape[0]+X2.shape[0],1))
    counter = 0;
    i = 0
    while(counter < len(X1) and counter < len(X2)):
        X_merged[i] = X1[counter]
        X_merged[i+1] = X2[counter]
        X_labels[i+1] = 1
        i += 2
        counter += 1
    if(counter == len(X1)):
        while(counter < len(X2)):
            X_merged[i] = X2[counter]
            X_labels[i] = 1
            i += 1
            counter += 1
    else:
        while(counter < len(X1)):
            X_merged[i] = X1[counter]
            i += 1
            counter += 1
    return X_merged,X_labels

x_tr_1,x_tr_2 = choose_digits(5,1, img_tr,img_label)

X_train,Y_train = merge(x_tr_1,x_tr_2)

x_test_1,x_test_2 = choose_digits(5,1,img_test,img_test_label)

X_test,Y_test = merge(x_test_1,x_test_2)

learning_rate = 0.01
training_epochs = 25
batch_size = 100
display_step = 1

data = tf.placeholder(tf.float32, [None, 784])
target = tf.placeholder(tf.float32, [None, 1])
#W = tf.Variable(tf.random_normal([8, 2], mean=0.0, stddev=0.05))
W = tf.Variable(tf.random_normal(shape=[784, 1]))
b = tf.Variable(tf.random_normal(shape=[1]))
z = tf.matmul(data, W) + b

cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=z,labels=target))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

prediction = tf.round(tf.sigmoid(z))
correct = tf.cast(tf.equal(prediction, target), dtype=tf.float32)
accuracy = tf.reduce_mean(correct)

init = tf.global_variables_initializer()

# Start training
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        batch_index = np.random.choice(len(X_train), size=batch_size)
        batch_train_X = X_train[batch_index]
        batch_train_y = np.matrix(Y_train[batch_index])
        sess.run(optimizer, feed_dict={data: batch_train_X, target: batch_train_y})
        temp_loss = sess.run(cost, feed_dict={data: batch_train_X, target: batch_train_y})
        temp_train_acc = sess.run(accuracy, feed_dict={data: X_train, target: np.matrix(Y_train)})
        temp_test_acc = sess.run(accuracy, feed_dict={data: X_test, target: np.matrix(Y_test)})
        if (epoch + 1) % display_step == 0:
            print('epoch: {:4d} loss: {:5f} train_acc: {:5f} test_acc: {:5f}'.format(epoch + 1, temp_loss,temp_train_acc, temp_test_acc))
    parameter = sess.run(W)
    
plt.imshow(parameter.reshape((28,28)))
plt.show()

ModuleNotFoundError: No module named 'mnist'

- Assign class label y=0 to digit_1 and y=1 to digit_2
- Perform logistic regression classification on the data
- Compute the accuracy of classification
- The weight vector $w$ is 784 long vector, reshape it to 28xc28 and display it.
- Modify the program to make it softmax regression.(Now there will be two parameters $w_1$ and $w_2$)
- Display both the parameters and display them as images of size 28x28

- Modify choose_digits function to retrieve more than two digits and perform softmax regression.