In [1]:
# Load CSV (using python)
import csv
import numpy as np
import math

filename = 'HTRU_2.csv'
raw_data = open(filename, 'rt')
reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
x = list(reader)
data = np.array(x)

data = data.astype('float')
y = data[:, 8]
print(len(data))
print(sum(y))
X = data[:, :8]

17898
1639.0


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

feature = len(X)
prop = 0.4 # the proportion of training samples to be extracted

def get_train_X(whole):
    train = whole[:int(feature * prop),:]
    return np.asmatrix(train[:, :8])
def get_train_y(whole):
    train = whole[:int(feature * prop),:]
    return np.squeeze(np.asarray(train[:, 8]))
def get_test_X(whole):
    test = whole[int(feature * prop):,:]
    return np.asmatrix(test[:, :8])
def get_test_y(whole):
    test = whole[int(feature * prop):,:]
    return np.squeeze(np.asarray(test[:, 8]))
    
def cross_validation(whole, fold):
    error = 0
    for i in range(fold):
        np.random.shuffle(whole)
        logistic = LogisticRegression()
        logistic.fit(get_train_X(whole), get_train_y(whole))
        error = error + (1 - logistic.score(get_test_X(whole), get_test_y(whole)))
    error = error / float(fold)
    return 1 - error
    
def main():
    print("running Logistic Regression...with training proportion of ", prop)
    #prepare training set and testing set
    np.random.shuffle(data)

    logistic = LogisticRegression()
    logistic.fit(get_train_X(data), get_train_y(data))
    acc = logistic.score(get_train_X(data), get_train_y(data))
    print ("training set size ", int(feature * prop))
    print ("training accuracy ", acc * 100, "%")
    
    acc = logistic.score(get_test_X(data), get_test_y(data))
    print ("testing set size ", int(feature * (1-prop)))
    print ("testing accuracy ", acc * 100, "%")
         
    #acc = cross_val_score(logistic, X, y, cv=10).mean()
    acc = cross_validation(data, 10)
    print ("validation accuracy ", acc * 100, "%")
    
main()

running Logistic Regression...with training proportion of  0.4
training set size  7159
training accuracy  98.1841039251 %
testing set size  10738
testing accuracy  97.6068535245 %
validation accuracy  97.8619983239 %


In [6]:
from sklearn import svm

feature = len(X)
prop = 0.1 # the proportion of training samples to be extracted

def get_train_X(whole):
    train = whole[:int(feature * prop),:]
    return np.asmatrix(train[:, :8])
def get_train_y(whole):
    train = whole[:int(feature * prop),:]
    return np.squeeze(np.asarray(train[:, 8]))
def get_test_X(whole):
    test = whole[int(feature * prop):,:]
    return np.asmatrix(test[:, :8])
def get_test_y(whole):
    test = whole[int(feature * prop):,:]
    return np.squeeze(np.asarray(test[:, 8]))
    
def cross_validation(whole, fold, C):
    error = 0
    for i in range(fold):
        np.random.shuffle(whole)
        clf = svm.SVC(C=C)
        clf.fit(get_train_X(whole), get_train_y(whole))
        error = error + (1 - clf.score(get_test_X(whole), get_test_y(whole)))
    error = error / float(fold)
    return 1 - error
def find_C(whole):
    C = 1
    step = 1
    optimal_C = C
    min_acc = 0
    while (C < 10):
        acc = cross_validation(whole, 5, C)
        print("accuracy for ", C, " is ", acc)
        if acc > min_acc:
            min_acc = acc
            optimal_C = C
        C = C + step
    print("choice of C is ", optimal_C)
    return optimal_C

def main():
    print("running SVM(rbf kernel)...with training proportion of ", prop)
    #prepare training set and testing set
    np.random.shuffle(data)
    
    C = find_C(data)
    np.random.shuffle(data)
    
    print ("training set size ", int(feature * prop))
    print ("testing set size ", int(feature * (1-prop)))
    clf = clf = svm.SVC(C=1)
    clf.fit(get_train_X(data), get_train_y(data))
    acc = clf.score(get_train_X(data), get_train_y(data))
    print ("training accuracy with C = 1", acc * 100, "%")
    acc = clf.score(get_test_X(data), get_test_y(data))
    print ("testing accuracy with C = 1", acc * 100, "%")
    
    
    clf = clf = svm.SVC(C=C)
    clf.fit(get_train_X(data), get_train_y(data))
    acc = clf.score(get_train_X(data), get_train_y(data))   
    print ("training accuracy with C = ", C, " ", acc * 100, "%")
    acc = clf.score(get_test_X(data), get_test_y(data))
    print ("testing accuracy with C = ", C, acc * 100, "%")
    
    #acc = cross_val_score(clf, X, y, cv=10).mean()
    acc = cross_validation(data, 10, C)
    print ("validation accuracy ", acc * 100, "%")
main()

running SVM(rbf kernel)...with training proportion of  0.1
accuracy for  1  is  0.9077782606
accuracy for  2  is  0.908262461978
accuracy for  3  is  0.90766652182
accuracy for  4  is  0.9077161835
accuracy for  5  is  0.908038984419
accuracy for  6  is  0.907939661059
accuracy for  7  is  0.908163138618
accuracy for  8  is  0.9076541064
accuracy for  9  is  0.908101061518
choice of C is  2
training set size  1789
testing set size  16108
training accuracy with C = 1 99.7205142538 %
testing accuracy with C = 1 90.9988205351 %
training accuracy with C =  2   100.0 %
testing accuracy with C =  2 90.9739896952 %
validation accuracy  90.766652182 %


In [8]:
from sklearn.ensemble import RandomForestClassifier

feature = len(X)
prop = 0.3 # the proportion of training samples to be extracted

def get_train_X(whole):
    train = whole[:int(feature * prop),:]
    return np.asmatrix(train[:, :8])
def get_train_y(whole):
    train = whole[:int(feature * prop),:]
    return np.squeeze(np.asarray(train[:, 8]))
def get_test_X(whole):
    test = whole[int(feature * prop):,:]
    return np.asmatrix(test[:, :8])
def get_test_y(whole):
    test = whole[int(feature * prop):,:]
    return np.squeeze(np.asarray(test[:, 8]))
    
def cross_validation(whole, fold, T):
    error = 0
    for i in range(fold):
        np.random.shuffle(whole)
        clf = RandomForestClassifier(n_estimators=T)
        clf.fit(get_train_X(whole), get_train_y(whole))
        error = error + (1 - clf.score(get_test_X(whole), get_test_y(whole)))
    error = error / float(fold)
    return 1 - error

def find_T(whole):
    T = 100
    step = 10
    optimal_T = T
    min_acc = 0
    while (T < 300):
        acc = cross_validation(whole, 5, T)
        #print("accuracy for ", T, "is ", acc)
        if acc > min_acc:
            min_acc = acc
            optimal_T = T
        T = T + step
    print("choice of T is ", optimal_T)
    return optimal_T

def main():
    print("running Random Forest...with training proportion of ", prop)
    #prepare training set and testing set
    np.random.shuffle(data)
    
    T = find_T(data)
    np.random.shuffle(data)
    
    
    print ("training set size ", int(feature * prop))
    print ("testing set size ", int(feature * (1-prop)))
    clf = RandomForestClassifier(n_estimators=200)
    clf.fit(get_train_X(data), get_train_y(data))
    acc = clf.score(get_train_X(data), get_train_y(data))
    print ("training accuracy with T = 200", acc * 100, "%")
    acc = clf.score(get_test_X(data), get_test_y(data))
    print ("testing accuracy with T = 200", acc * 100, "%")
    
    
    clf = RandomForestClassifier(n_estimators=T)
    clf.fit(get_train_X(data), get_train_y(data))
    acc = clf.score(get_train_X(data), get_train_y(data))   
    print ("training accuracy with T = ", T, " ", acc * 100, "%")
    acc = clf.score(get_test_X(data), get_test_y(data))
    print ("testing accuracy with T = 200", acc * 100, "%")
    
    #acc = cross_val_score(clf, X, y, cv=10).mean()
    acc = cross_validation(data, 10, T)
    print ("validation accuracy ", acc * 100, "%")
    
main()

running Random Forest...with training proportion of  0.3
choice of T is  190
training set size  5369
testing set size  12528
training accuracy with T = 200 100.0 %
testing accuracy with T = 200 97.7731662543 %
training accuracy with T =  190   100.0 %
testing accuracy with T = 200 97.7172958736 %
validation accuracy  97.907255168 %


In [2]:
from numpy import linalg as LA
import operator

dimension = 8
feature = len(X)
prop = 0.1 # the proportion of training samples to be extracted

def get_train(whole):
    return whole[:int(feature * prop),:]
def get_test(whole):
    whole[int(feature * prop):,:]
def get_train_X(whole):
    train = whole[:int(feature * prop),:]
    return np.asmatrix(train[:, :dimension])
def get_train_y(whole):
    train = whole[:int(feature * prop),:]
    return np.squeeze(np.asarray(train[:, dimension]))
def get_test_X(whole):
    test = whole[int(feature * prop):,:]
    return np.asmatrix(test[:, :dimension])
def get_test_y(whole):
    test = whole[int(feature * prop):,:]
    return np.squeeze(np.asarray(test[:, dimension]))

def get_neighbors(train, goal_X, k):
    distances = []
    train_X = np.asmatrix(train[:, :dimension])
    for i in range(len(train_X)):
        dist = LA.norm(train_X[i] - goal_X)
        distances.append((train[i], dist))
    distances.sort(key=operator.itemgetter(1))
    
    neighbors = np.zeros((k, train.shape[1]))
    for i in range(k):
        neighbors[i] = np.copy(np.asarray(distances[i][0]))
    return neighbors

def predict(neighbors):
    count0 = 0
    count1 = 0
    for i in range(len(neighbors)):
        if(neighbors[i, dimension] == 0):
            count0 = count0 + 1
        if(neighbors[i, dimension] == 1):
            count1 = count1 + 1
    if(count0 > count1):
        return 0
    else:
        return 1

#input: testing y and neightbors
def get_error(train, test_X, test_y, k):
    result = 0
    miss = 0
    for i in range(len(test_y)):
        neighbors = get_neighbors(train, test_X[i], k)
        result = predict(neighbors)
        if result != test_y[i]:
            miss = miss + 1
    return miss/len(test_y)
       
def cross_validation(whole, fold, k):
    error = 0
    for i in range(fold):
        np.random.shuffle(whole)
        train = get_train(whole)
        test_X = get_test_X(whole)
        test_y = get_test_y(whole)
        error = error + get_error(train, test_X, test_y, k)
    error = error / float(fold)
    return error

def find_K(whole, lower, upper):
    K = lower
    optimal_K = lower
    min_error = 1
    while (K < (upper + 1)):
        error = cross_validation(whole, 10, K)
        print("error for ", K, "is ", error)
        if error < min_error:
            min_error = error
            optimal_K = K
        K = K + 2
    print("choice of K is ", optimal_K)
    return optimal_K

def main():
    print("running K-Nearest Neighbors...")
    #prepare training set and testing set
    whole = data
    np.random.shuffle(whole)
    K = find_K(whole, 1, 7)
    
    np.random.shuffle(whole)

    error = get_error(get_train(whole), get_train_X(whole), get_train_y(whole), K)
    print ("training error ", error)
    
    error = get_error(get_train(whole), get_test_X(whole), get_test_y(whole), K)
    print ("testing error ", error)
    
    error = cross_validation(whole, 10, K)
    print ("validation error ", error)
    
main()

running K-Nearest Neighbors...


KeyboardInterrupt: 

In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score

dimension = 8
feature = len(X)
prop = 0.4# the proportion of training samples to be extracted

def get_train_X(whole, prop):
    train = whole[:int(feature * prop),:]
    return np.asmatrix(train[:, :dimension])
def get_train_y(whole, prop):
    train = whole[:int(feature * prop),:]
    return np.squeeze(np.asarray(train[:, dimension]))
def get_test_X(whole, prop):
    test = whole[int(feature * prop):,:]
    return np.asmatrix(test[:, :dimension])
def get_test_y(whole, prop):
    test = whole[int(feature * prop):,:]
    return np.squeeze(np.asarray(test[:, dimension]))

def main():
    whole = data
    #prepare training set and testing set
    np.random.shuffle(whole)
    
    print("running bagging decision tree...with proportion of training", prop)
    #errors
    np.random.shuffle(whole)
    clf = BaggingClassifier(DecisionTreeClassifier(max_depth = 13))
    clf.fit(X = get_train_X(whole, prop), y = get_train_y(whole, prop))
    
    acc = clf.score(get_train_X(whole, prop), get_train_y(whole, prop))
    print ("training accuracy ", acc * 100, "%")
    
    acc = clf.score(get_test_X(whole, prop), get_test_y(whole, prop))
    print ("testing accuracy ", acc * 100, "%")
    
    acc = cross_val_score(clf, X, y, cv = 10).mean()
    print ("validation accuracy ", acc * 100, "%")
    
main()


running bagging decision tree...with proportion of training 0.4
training accuracy  99.6507892164 %
testing accuracy  97.7744668964 %
validation accuracy  97.8712896654 %


In [34]:
from sklearn import tree
from sklearn.model_selection import cross_val_score

dimension = 8
feature = len(X)
prop = 0.3 # the proportion of training samples to be extracted

def get_train_X(whole, prop):
    train = whole[:int(feature * prop),:]
    return np.asmatrix(train[:, :dimension])
def get_train_y(whole, prop):
    train = whole[:int(feature * prop),:]
    return np.squeeze(np.asarray(train[:, dimension]))
def get_test_X(whole, prop):
    test = whole[int(feature * prop):,:]
    return np.asmatrix(test[:, :dimension])
def get_test_y(whole, prop):
    test = whole[int(feature * prop):,:]
    return np.squeeze(np.asarray(test[:, dimension]))
    
def cross_validation(whole, fold, D):
    error = 0
    for num2 in range(0, fold):
        np.random.shuffle(whole)
        clf = tree.DecisionTreeClassifier(max_depth = D).fit(X = get_train_X(whole, prop), y = get_train_y(whole, prop))
        error = error + 1 - clf.score(get_test_X(whole, prop), get_test_y(whole, prop))
    error = error / float(fold)
    return error



def main():
    whole = data
    #prepare training set and testing set
    np.random.shuffle(whole)
    
    print("running decision tree...with proportion of training", prop)
    #errors
    np.random.shuffle(whole)
    clf = tree.DecisionTreeClassifier(max_depth = 13).fit(X = get_train_X(whole, prop), y = get_train_y(whole, prop))
    acc = clf.score(get_train_X(whole, prop), get_train_y(whole, prop))
    print ("training accuracy ", acc * 100, "%")
    
    acc = clf.score(get_test_X(whole, prop), get_test_y(whole, prop))
    print ("testing accuracy ", acc * 100, "%")
    
    error = cross_val_score(clf, X, y, cv = 10).mean()
    print ("validation accuracy ", acc * 100, "%")
    
main()

running decision tree...with proportion of training 0.3
training accuracy  99.9813745576 %
testing accuracy  96.6238327081 %
validation accuracy  96.6238327081 %
