In [1]:
import sys, os
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [2]:
def sigmoid(scores):
    """Numerically stable logistic function, applied element-wise.

    Parameters
    ----------
    scores : array-like of float
        Raw linear scores. Any shape is accepted.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``scores`` holding
        ``1 / (1 + exp(-s))`` for every entry ``s``.
    """
    scores = np.asarray(scores, dtype=float)
    # exp(-|s|) always lies in (0, 1], so it cannot overflow whatever the
    # sign or magnitude of s.
    decay = np.exp(-np.abs(scores))
    # s >= 0:  1 / (1 + exp(-s))        -> numerator 1
    # s <  0:  exp(s) / (1 + exp(s))    -> numerator exp(s) == decay
    return np.where(scores >= 0, 1.0, decay) / (1.0 + decay)

def lr(trainingSet, testSet):
    """Fit L2-regularised logistic regression with batch gradient descent.

    Parameters
    ----------
    trainingSet : pandas.DataFrame
        Feature columns plus a ``'decision'`` label column. The labels are
        compared directly with sigmoid outputs, so they are expected to be
        0/1.
    testSet : pandas.DataFrame
        Unused; kept so the signature matches ``svm``.

    Returns
    -------
    numpy.ndarray
        Weight vector of length ``n_features + 1``. The intercept is the
        LAST entry because the bias column is appended after the features.

    Notes
    -----
    * The intercept weight is regularised together with the feature weights
      (unlike ``svm``, which skips it).
      NOTE(review): confirm this asymmetry is intended.
    * Convergence is judged on the change of ``||w||`` between iterations,
      not on ``||w_new - w_old||``.
      NOTE(review): confirm against the intended stopping rule.
    """
    # Logs the column count (label column included); single-argument
    # parenthesised form prints the same text on Python 2 and Python 3.
    print(len(trainingSet.columns))

    regularization = 0.01
    step_size = 0.01
    max_iterations = 500
    tol = 1e-6

    train_labels = trainingSet['decision']
    features = trainingSet.drop('decision', axis=1)

    w = np.zeros(len(features.columns) + 1)

    X = np.array(features)
    Y = np.array(train_labels)
    # Append the bias column of ones as the last column of X.
    X = np.hstack((X, np.ones((X.shape[0], 1))))

    count = 0
    diff = 100.0  # any value above tol, so the loop body runs at least once
    while count < max_iterations and diff > tol:
        count += 1
        norm_old = np.linalg.norm(w)

        predictions = sigmoid(np.dot(X, w))

        # Summed (not averaged) log-loss gradient plus the L2 term on every
        # weight; element-wise identical to adding regularization * w[j]
        # one index at a time.
        gradient = np.dot(X.T, predictions - Y) + regularization * w

        w -= step_size * gradient
        diff = abs(np.linalg.norm(w) - norm_old)

    return w

def svm(trainingSet, testSet):
    """Fit a linear SVM by batch sub-gradient descent on the hinge loss.

    Parameters
    ----------
    trainingSet : pandas.DataFrame
        Feature columns plus a ``'decision'`` label column. A label equal to
        0 is treated as the negative class (-1); anything else as +1.
    testSet : pandas.DataFrame
        Unused; kept so the signature matches ``lr``.

    Returns
    -------
    numpy.ndarray
        Weight vector of length ``n_features + 1``. The intercept is the
        FIRST entry because the bias column is prepended to the features.

    Notes
    -----
    Convergence is judged on the change of ``||w||`` between iterations,
    not on ``||w_new - w_old||``.
    NOTE(review): confirm against the intended stopping rule.
    """
    regularization = 0.01
    step_size = 0.50
    max_iterations = 500
    tol = 1e-6

    train_labels = trainingSet['decision']
    features = trainingSet.drop('decision', axis=1)

    w = np.zeros(len(features.columns) + 1)

    X = np.array(features)
    # Build the {-1, +1} targets in a fresh float array. Writing -1.0 back
    # into the label array in place depends on its dtype: a bool column would
    # turn every label into True and an unsigned column cannot hold -1.
    Y = np.where(np.array(train_labels) == 0, -1.0, 1.0)

    # Prepend the bias column of ones as the first column of X.
    X = np.hstack((np.ones((X.shape[0], 1)), X))
    n_samples = float(len(train_labels))

    count = 0
    diff = 100.0  # any value above tol, so the loop body runs at least once
    while count < max_iterations and diff > tol:
        count += 1
        norm_old = np.linalg.norm(w)

        # Rows with functional margin y * (x . w) below 1 contribute -y * x
        # to the hinge-loss sub-gradient; the sum is averaged over all rows.
        violators = np.dot(X, w) * Y < 1.0
        gradient = -np.dot(X[violators].T, Y[violators]) / n_samples

        # L2 penalty on the feature weights only; index 0 (intercept) is
        # left unregularised.
        gradient[1:] += regularization * w[1:]

        w -= step_size * gradient
        diff = abs(np.linalg.norm(w) - norm_old)

    return w

In [3]:
def get_accuracy_lr(w, trainingSet, testSet):
    """Print and return the test accuracy of a logistic-regression model.

    Parameters
    ----------
    w : numpy.ndarray
        Weights as returned by ``lr``: feature weights followed by the
        intercept as the last entry.
    trainingSet : pandas.DataFrame
        Unused; kept so existing callers do not need to change.
    testSet : pandas.DataFrame
        Feature columns plus a ``'decision'`` label column.

    Returns
    -------
    float
        Fraction of test rows whose predicted class equals the label
        (labels are truncated to int before comparing).

    Raises
    ------
    ZeroDivisionError
        If ``testSet`` has no rows.
    """
    test_labels = np.array(testSet['decision'])

    X = np.array(testSet.drop('decision', axis=1))
    # Bias column goes last, matching the column layout used by ``lr``.
    X = np.hstack((X, np.ones((X.shape[0], 1))))

    probabilities = sigmoid(np.dot(X, w))
    # Class 1 only when the probability is strictly above 0.5.
    predicted = (probabilities > 0.5).astype(int)

    count_test = int(np.count_nonzero(predicted == test_labels.astype(int)))
    test_accuracy = 1.0 * count_test / len(testSet)

    # One pre-formatted string prints the same text on Python 2 and Python 3.
    print('Test Accuracy LR: %.2f' % test_accuracy)
    return test_accuracy
    
def get_accuracy_svm(w, trainingSet, testSet):
    """Print and return the test accuracy of a linear SVM model.

    Parameters
    ----------
    w : numpy.ndarray
        Weights as returned by ``svm``: the intercept as the first entry,
        followed by the feature weights.
    trainingSet : pandas.DataFrame
        Unused; kept so existing callers do not need to change.
    testSet : pandas.DataFrame
        Feature columns plus a ``'decision'`` label column.

    Returns
    -------
    float
        Fraction of test rows whose predicted class equals the label
        (labels are truncated to int before comparing).

    Raises
    ------
    ZeroDivisionError
        If ``testSet`` has no rows.
    """
    test_labels = np.array(testSet['decision'])

    X = np.array(testSet.drop('decision', axis=1))
    # Bias column goes first, matching the column layout used by ``svm``.
    X = np.hstack((np.ones((X.shape[0], 1)), X))

    scores = np.dot(X, w)
    # Class 1 only for a strictly positive score; a score of exactly 0 is
    # classified as 0.
    predicted = (scores > 0.0).astype(int)

    count_test = int(np.count_nonzero(predicted == test_labels.astype(int)))
    test_accuracy = 1.0 * count_test / len(testSet)

    # One pre-formatted string prints the same text on Python 2 and Python 3.
    print('Test Accuracy SVM: %.2f' % test_accuracy)
    return test_accuracy

In [4]:
# Training and test data are read from CSV files addressed by relative path,
# i.e. resolved against the current working directory.
trainingDataFilename = 'trainingSet.csv'
testDataFilename = 'testSet.csv'
trainingSet = pd.read_csv(trainingDataFilename)
testSet = pd.read_csv(testDataFilename)
# Sampling fractions; the cross-validation loop passes each one as `frac=`
# to DataFrame.sample when drawing the training subset.
f = [0.025, 0.05, 0.075, 0.1, 0.15, 0.2]

In [5]:
# Shuffle the rows once with a fixed seed so the fold assignment is
# reproducible.
trainingSet = trainingSet.sample(frac=1, random_state=18)

# Ten consecutive 520-row slices of the shuffled frame act as the CV folds.
# NOTE(review): the fold size is hard-coded; this presumably assumes the
# training file has 5200 rows -- confirm against the data.
df_kfold = []
for i in range(10):
    lo, hi = i * 520, (i + 1) * 520
    df_kfold.append(trainingSet[lo:hi])

# Per-model accuracy lists, keyed by sampling fraction; each key gets its own
# fresh list.
nbc_res = {}
lr_res = {}
svm_res = {}
for t_frac in f:
    nbc_res[t_frac], lr_res[t_frac], svm_res[t_frac] = [], [], []

In [6]:
# 10-fold cross-validation, repeated for every sampling fraction in `f`.
for t_frac in f:
    for i in range(10):
        # Partition the train and cv folds: fold i is held out, the other
        # nine folds form the training pool.
        train_set_df = []
        for j in range(10):
            if j != i:
                train_set_df.append(df_kfold[j])
            else:
                test_set = df_kfold[j]

        # Keep only a t_frac share of the pooled training folds; the fixed
        # seed makes the subsample repeatable.
        train_set = pd.concat(train_set_df).sample(frac=t_frac, random_state=32)

        # Train and test each model on the same split.
        w = lr(train_set, test_set)
        lr_res[t_frac].append(get_accuracy_lr(w, train_set, test_set))

        w = svm(train_set, test_set)
        svm_res[t_frac].append(get_accuracy_svm(w, train_set, test_set))

# Single-argument parenthesised print behaves the same on Python 2 and 3.
print(lr_res)
print(svm_res)

# Get std error
# (containers only; nothing in this cell fills them)
nbc_stdrr = []
lr_stdrr = []
svm_stdrr = []

261
Test Accuracy LR: 0.67
Test Accuracy SVM: 0.59
261
Test Accuracy LR: 0.72
Test Accuracy SVM: 0.65
261
Test Accuracy LR: 0.68
Test Accuracy SVM: 0.52
261
Test Accuracy LR: 0.69
Test Accuracy SVM: 0.61
261
Test Accuracy LR: 0.60
Test Accuracy SVM: 0.56
261
Test Accuracy LR: 0.72
Test Accuracy SVM: 0.57
261
Test Accuracy LR: 0.58
Test Accuracy SVM: 0.53
261
Test Accuracy LR: 0.61
Test Accuracy SVM: 0.51
261
Test Accuracy LR: 0.69
Test Accuracy SVM: 0.50
261
Test Accuracy LR: 0.59
Test Accuracy SVM: 0.52
261
Test Accuracy LR: 0.62
Test Accuracy SVM: 0.56
261
Test Accuracy LR: 0.62
Test Accuracy SVM: 0.48
261
Test Accuracy LR: 0.61
Test Accuracy SVM: 0.53
261
Test Accuracy LR: 0.68
Test Accuracy SVM: 0.55
261
Test Accuracy LR: 0.64
Test Accuracy SVM: 0.54
261
Test Accuracy LR: 0.68
Test Accuracy SVM: 0.58
261
Test Accuracy LR: 0.62
Test Accuracy SVM: 0.55
261
Test Accuracy LR: 0.68
Test Accuracy SVM: 0.58
261
Test Accuracy LR: 0.70
Test Accuracy SVM: 0.57
261
Test Accuracy LR: 0.63
Test