In [1]:
import numpy as np
import csv

from Classifiers.Bayes_Gaussian import *
from Classifiers.Bayes_Parzen import *
from Classifiers.k_means_clustering import *
from Classifiers.k_nearest_neighbour import *
from Classifiers.logistic import *
from Classifiers.Naive_Bayes import *
from Classifiers.solver import *
from Classifiers.compute_performance import *

In [2]:
# Read the WDBC table as strings. Column 1 is the class label ('M' / 'B') and
# columns 2 onward are the numeric features. The file is opened in a `with`
# block so the handle is closed as soon as the rows are read.
with open("DataSets/Breast Cancer/wdbc.csv", newline='') as wdbc_file:
    wdbc_rows = np.array(list(csv.reader(wdbc_file, delimiter=',')))
N = wdbc_rows.shape[0]

# The builtin float/int are used instead of np.float/np.int: those aliases were
# deprecated in NumPy 1.20 and removed in 1.24, and always meant the builtins.
X, y = wdbc_rows[:, 2:].astype(float), wdbc_rows[:, 1]
y[y == 'M'] = "1"
y[y == 'B'] = "0"
y = y.astype(int)

# Shuffle the row indices; the first 469 go to training, the rest to test.
# NOTE(review): the permutation is unseeded, so the split changes on every run;
# seed NumPy's RNG here if reproducible metrics are needed.
data = {}
random_perm = np.random.permutation(N)
data['X_train'] = X[random_perm[:469]]
data['y_train'] = y[random_perm[:469]]
data['X_test'] = X[random_perm[469:]]
data['y_test'] = y[random_perm[469:]]
for k, v in data.items():
    print(('%s: ' % k, v.shape))

# D is the feature count; labels are 0..max, so the class count is max + 1.
D, num_classes = data['X_train'].shape[1], np.max(y) + 1
print(D, num_classes)

('X_train: ', (469, 30))
('y_train: ', (469,))
('X_test: ', (100, 30))
('y_test: ', (100,))
30 2


In [3]:
# Naive_Bayes: fit on the training split, predict the test split, then print
# each entry of the performance summary as a (name, value) tuple.
naive_bayes = Naive_Bayes(D, num_classes)
naive_bayes.train(data['X_train'], data['y_train'])
y_pred1 = naive_bayes.test(data['X_test'])

performance = compute_performance(y_pred1, data['y_test'], num_classes)
for k, v in performance.items():
    print(('%s: ' % k, v))

('accuracy: ', 0.92)
('precision: ', array([0.91044776, 0.93939394]))
('recall: ', array([0.96825397, 0.83783784]))
('F1: ', 0.9120879120879122)


In [4]:
# Bayes_Gaussian: same fit / predict / report sequence on the same splits.
bayes = Bayes_Gaussian(D, num_classes)
bayes.train(data['X_train'], data['y_train'])
y_pred2 = bayes.test(data['X_test'])

performance = compute_performance(y_pred2, data['y_test'], num_classes)
for k, v in performance.items():
    print(('%s: ' % k, v))

('accuracy: ', 0.95)
('precision: ', array([0.953125  , 0.94444444]))
('recall: ', array([0.96825397, 0.91891892]))
('F1: ', 0.9460683852874554)


In [5]:
# Bayes_Parzen: built from the class count only; `h` is passed at test time
# (presumably the Parzen window width -- confirm against Bayes_Parzen.test).
# NOTE(review): this rebinds `bayes` and `y_pred2`, which the Bayes_Gaussian
# cell also assigns, so that cell's model and predictions are lost once this
# one runs.
bayes = Bayes_Parzen(num_classes)
bayes.train(data['X_train'], data['y_train'])
y_pred2 = bayes.test(data['X_test'], h=10.0)

performance = compute_performance(y_pred2, data['y_test'], num_classes)
for k, v in performance.items():
    print(('%s: ' % k, v))

('accuracy: ', 0.92)
('precision: ', array([0.91044776, 0.93939394]))
('recall: ', array([0.96825397, 0.83783784]))
('F1: ', 0.9120879120879122)


In [6]:
# Re-split the first 469 shuffled rows: indices 0..399 remain the training
# set and indices 400..468 become a validation set. The test split is untouched.
data.update({
    'X_train': X[random_perm[:400]],
    'y_train': y[random_perm[:400]],
    'X_val': X[random_perm[400:469]],
    'y_val': y[random_perm[400:469]],
})

In [7]:
# k-nearest-neighbour: choose k by validation F1, then score that k on test.
knn = KNearestNeighbor()
knn.train(data['X_train'], data['y_train'])

k_choices = [3, 5, 7, 9, 11]
# best_k stays at 1 only if no candidate reaches a validation F1 above 0.
best_k, best_f1 = 1, 0.0
for k in k_choices:
    y_pred3 = knn.test(data['X_val'], k=k)
    per = compute_performance(y_pred3, data['y_val'], num_classes)
    # Strict '>' keeps the earliest (smallest) k when scores tie.
    if per['F1'] > best_f1:
        best_k, best_f1 = k, per['F1']

# Final evaluation of the selected k on the held-out test split.
y_pred3 = knn.test(data['X_test'], k=best_k)
performance = compute_performance(y_pred3, data['y_test'], num_classes)
for k, v in performance.items():
    print(('%s: ' % k, v))

('accuracy: ', 0.96)
('precision: ', array([0.96825397, 0.94594595]))
('recall: ', array([0.96825397, 0.94594595]))
('F1: ', 0.9570999570999571)


In [8]:
# K_means: train one model per candidate K, keep the model with the best
# validation F1, then evaluate that model on the test split.
K_choices = [2, 3, 5, 7, 9, 11]
# Start from -inf rather than 0.0 so that a candidate whose F1 is exactly 0 is
# still selected; otherwise best_k_means could remain None and the test call
# below would fail with an AttributeError.
best_k_means, best_f1 = None, float('-inf')
for K in K_choices:
    k_means = K_means(D, K=K)
    k_means.train(data['X_train'], data['y_train'], num_iters=20)
    y_pred4 = k_means.test(data['X_val'])
    per = compute_performance(y_pred4, data['y_val'], num_classes)
    # Strict '>' keeps the earliest candidate when scores tie.
    if per['F1'] > best_f1:
        best_k_means, best_f1 = k_means, per['F1']

# Still None only if every validation F1 was NaN (or K_choices is empty).
if best_k_means is None:
    raise RuntimeError("No K-means candidate produced a comparable validation F1 score")

y_pred4 = best_k_means.test(data['X_test'])
performance = compute_performance(y_pred4, data['y_test'], num_classes)
for k, v in performance.items():
    print(('%s: ' % k, v))

('accuracy: ', 0.9)
('precision: ', array([0.87323944, 0.96551724]))
('recall: ', array([0.98412698, 0.75675676]))
('F1: ', 0.8869289914066034)


In [9]:
# Load the stored initial parameters for the logistic model. Each file is read
# inside a `with` block so its handle is closed, and the values are converted
# with the builtin float (np.float was removed in NumPy 1.24).
with open("DataSets/Breast Cancer/logistic_weights.csv", newline='') as weights_file:
    initial_weights = np.array(list(csv.reader(weights_file, delimiter=','))).astype(float)
with open("DataSets/Breast Cancer/logistic_b.csv", newline='') as bias_file:
    initial_b = np.array(list(csv.reader(bias_file, delimiter=','))).astype(float)

logistic = Logistic(input_dim=D, num_classes=num_classes, initial_weights=initial_weights, initial_b=initial_b)

# Solver is handed the whole `data` dict and trains `logistic` through it.
solver = Solver(logistic, data, lr=1e-5, num_epochs=100, verbose=0)
solver.train()

# Evaluate the trained model on the held-out test split.
y_pred5 = logistic.test(data['X_test'])
performance = compute_performance(y_pred5, data['y_test'], num_classes)
for k, v in performance.items():
    print(('%s: ' % k, v))

('accuracy: ', 0.94)
('precision: ', array([0.92537313, 0.96969697]))
('recall: ', array([0.98412698, 0.86486486]))
('F1: ', 0.9340659340659341)
