In [1]:
import numpy as np
import random

In [1]:
def load_congress_data(training_ratio):
    """Load the congress data.

    Note that missing values (denoted '?', where a voter abstained) are
    instead treated as a third type of attribute. Therefore every feature
    has 3 possible attributes.

    The records are shuffled with a fixed seed (this reseeds Python's global
    ``random`` module), so the same split is produced on every call. Blank
    lines in the data file are ignored.

    Args:
        training_ratio: the ratio of examples that go into the training set
    Returns:
        a tuple of numpy matrices, the first in the tuple is the training
        data, second is test data. Each matrix row represents a data point
        as a row vector: the first element of the row vector corresponds to
        the label and the following elements correspond to attributes.
        A non-empty partition is always 2-D (even when it holds a single
        row); a partition that receives no rows is returned as None.
    Raises:
        KeyError: if a field is not one of the known label/vote tokens.
    """
    random.seed(1) # get same data every time
    label_conversions = {'republican' : 0, 'democrat' : 1,
                         'n' : 0, 'y' : 1, '?' : 2}

    # The context manager closes the file even if parsing fails further down.
    with open('data/house-votes-84.data', 'r') as f:
        # Skip blank lines (e.g. a trailing newline at the end of the file):
        # they hold no record and would fail the token lookup below.
        lines = [line for line in f if line.strip()]

    train_index = int(len(lines)*training_ratio)
    random.shuffle(lines)

    # Parse everything into plain lists first and convert once per partition,
    # instead of re-allocating the whole matrix with np.vstack for every row.
    rows = [[float(label_conversions[field.strip()]) for field in line.split(',')]
            for line in lines]

    def as_matrix(chunk):
        # np.array on a list of rows is 2-D even for a single row.
        return np.array(chunk, dtype=float) if chunk else None

    return (as_matrix(rows[:train_index]), as_matrix(rows[train_index:]))

In [10]:
def sigmoid(S):
    """Return the logistic sigmoid 1 / (1 + exp(-S)) of each element of S.

    The exponential is only ever evaluated at -|S|, so large-magnitude
    inputs cannot overflow (the direct formula overflows in exp for large
    negative S).

    Args:
        S: a scalar or a numpy array (any array-like is accepted).
    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as S.
    """
    z = np.asarray(S)
    decay = np.exp(-np.abs(z))  # never exceeds 1, so it cannot overflow
    # For z >= 0 this is the textbook form. For z < 0 the algebraically equal
    # exp(z) / (1 + exp(z)) is used, which avoids exp of a large positive number.
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # real arrays untouched, so scalar callers keep getting a scalar.
    return out[()]

def prob(w, X):
    """Return the sigmoid of the dot product between X and w.

    Args:
        w: a 1-dimensional numpy array of shape d
        X: a 2-dimensional numpy array with N data points and d features
    Returns:
        sigmoid applied to np.dot(X, w), i.e. one value per data point.
    """
    scores = np.dot(X, w)
    return sigmoid(scores)

def loss(w, X, y):
    """Return the mean logistic (cross-entropy) loss of weights w on (X, y).

    With z = X.w and g = sigmoid(z), the loss is
    -mean(y*log(g) + (1-y)*log(1-g)). It is evaluated here in the
    algebraically identical form mean(log(1 + exp(z)) - y*z), which stays
    finite when g saturates to exactly 0 or 1; the direct form produces
    0 * log(0) = nan in that case.

    Args:
        w: weight vector of shape d
        X: 2-d matrix with N data points and d features
        y: label vector of length N
    Returns:
        The loss as a single float.
    """
    z = np.dot(X, w)
    # np.logaddexp(0, z) computes log(exp(0) + exp(z)) without overflowing.
    return np.mean(np.logaddexp(0, z) - y*z)

In [21]:
def logistic_regression(w_init, X, y, ep, threshold):
    """Return the logistic weight vector (size d) after one pass over the data.

    Visits the rows of X once, in order, and takes one gradient step per
    row using only that row. After every step the loss over the whole of
    (X, y) is compared with the loss before the step; the pass stops early
    as soon as that change is smaller than threshold.

    Args:
        w_init: initial weight vector of size d (not modified in place)
        X: 2-d matrix with N data points and d features
        y: label vector of length N
        ep: step size applied to each per-row gradient
        threshold: minimum absolute change in loss required to keep going
    Returns:
        The weight vector after the last step that was taken.
    """
    w = w_init
    # Loss of the current weights; carried from one iteration to the next so
    # it is not recomputed from scratch before every comparison.
    prev_loss = loss(w, X, y)
    for i in range(X.shape[0]):
        xi = X[i]
        yi = y[i]
        gi = sigmoid(np.dot(xi, w))

        # Builds a new array each step, so the caller's w_init is untouched.
        w = w - ep*((gi - yi) * xi)
        new_loss = loss(w, X, y)
        if np.abs(prev_loss - new_loss) < threshold:
            break
        prev_loss = new_loss
    return w


def predict(training, test):
    """Return a list of predictions, one for each data point of the test set.

    Both inputs are matrices whose first column is the label and whose
    remaining columns are the features. Weights are fitted on the training
    matrix with logistic_regression, starting from weights drawn with
    np.random.randn, so the result varies between calls unless the caller
    seeds NumPy's global random state first.

    Args:
        training: 2-d matrix of training rows (label, then features)
        test: 2-d matrix of test rows (label, then features); the label
            column is ignored here
    Returns:
        A list of ints: 1 where the predicted probability is greater than
        0.5, otherwise 0.
    """
    training = np.asarray(training)
    test = np.asarray(test)
    y_train, X_train = training[:, 0], training[:, 1:]
    X_test = test[:, 1:]

    w_init = np.random.randn(X_train.shape[1])
    ep = 0.1
    threshold = 0.000001
    weight = logistic_regression(w_init, X_train, y_train, ep, threshold)

    # One matrix-vector product scores every test row at once.
    probabilities = sigmoid(np.dot(X_test, weight))
    return (probabilities > 0.5).astype(int).tolist()

def accuracy_score(test, prediction):
    """Return the accuracy score of the logistic regression.

    Args:
        test: 2-d matrix whose first column holds the true labels
        prediction: sequence of predicted labels, compared position by
            position with the first len(prediction) true labels
    Returns:
        The fraction of predictions that equal the corresponding true label.
    """
    labels = np.transpose(test)[0]
    n_total = len(prediction)
    hits = sum(1 for idx in range(n_total) if labels[idx] == prediction[idx])
    return hits/n_total

In [25]:
# predict() draws its initial weights with np.random.randn. Seed NumPy's
# global random state so the reported accuracy is the same on every
# Restart & Run All (load_congress_data only seeds Python's `random`).
np.random.seed(1)
(training, test) = load_congress_data(0.6)
acc = accuracy_score(test, predict(training, test))
print("The accuracy is ",acc)

The accuracy is  0.9080459770114943
