In [2]:
import pandas as pd
import numpy as np
import random

In [1]:
def load_congress_data(training_ratio):
    """Load the congress data and split it into training and test sets.

    Note that missing values (denoted '?', where a voter abstained) are
    instead treated as a third type of attribute. Therefore every feature
    has 3 possible attributes.

    The records are shuffled with a fixed seed, so the same split is
    produced on every call. As a side effect, the global ``random``
    module is re-seeded with 1.

    Args:
        training_ratio: the ratio of examples that go into the training set
    Returns:
        a tuple of numpy matrices, the first in the tuple is the training
        data, second is test data. Each matrix row represents a data point
        as a row vector: the first element of the row vector corresponds to
        the label and the following elements correspond to attributes.
        Both matrices are always 2-D, even when a split holds a single
        example. A split that receives no examples is returned as None.
    Raises:
        KeyError: if a field is not a known party label or vote symbol.
    """
    random.seed(1)  # get same data every time
    label_conversions = {'republican': 0, 'democrat': 1,
                         'n': 0, 'y': 1, '?': 2}

    # The context manager closes the file even if parsing fails below.
    with open('data/house-votes-84.data', 'r') as f:
        # Blank lines (e.g. a trailing empty line) carry no record; drop
        # them before counting so they neither skew the split nor crash
        # the label lookup.
        lines = [line for line in f if line.strip()]

    # Clamp so a negative ratio sends everything to the test set instead
    # of being interpreted as a from-the-end slice index.
    train_index = max(0, int(len(lines) * training_ratio))
    random.shuffle(lines)

    # Encode every record once as a list of floats; the matrices are built
    # in a single step afterwards rather than re-stacked row by row.
    rows = [
        [float(label_conversions[field.strip()]) for field in line.split(',')]
        for line in lines
    ]

    def as_matrix(chunk):
        # An empty split stays None, matching what callers already get.
        return np.array(chunk) if chunk else None

    return (as_matrix(rows[:train_index]), as_matrix(rows[train_index:]))