In [240]:
from sklearn.datasets import load_breast_cancer
import numpy as np
from sklearn.metrics import euclidean_distances
import pandas as pd

In [241]:
# data = load_breast_cancer()
# x = data['data']
# y = data['target']
data = pd.read_csv(r"SVNE_Hieu.csv")
x = data.iloc[:, 0:6].values
y = data.iloc[:, 6].values

In [242]:
x

array([[9.51753998e+01, 2.34900000e-01, 7.97616000e+01, 2.32680000e+00,
        4.68900000e-01, 1.14900000e-01],
       [9.46445007e+01, 2.39500000e-01, 7.92767000e+01, 2.31640000e+00,
        4.62900000e-01, 1.21600000e-01],
       [9.79487000e+01, 2.50400000e-01, 7.92263000e+01, 2.33160000e+00,
        5.23100000e-01, 1.06900000e-01],
       ...,
       [9.11461029e+01, 2.65700000e-01, 8.94806000e+01, 2.45830000e+00,
        7.12300000e-01, 2.15000000e-02],
       [9.06225967e+01, 2.70500000e-01, 8.97530000e+01, 2.45730000e+00,
        7.27600000e-01, 2.05000000e-02],
       [9.09578018e+01, 2.76000000e-01, 9.00558000e+01, 2.45850000e+00,
        7.49900000e-01, 1.74000000e-02]])

In [243]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

# use Minmaxscaler because we use euclidean distance
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
x_train = minmax.fit_transform(x_train)
x_test = minmax.transform(x_test)

In [244]:
x_train.shape

(996, 6)

In [245]:
# parameter for LVQ
R = 10 # R is the # of initial prototype for each class
n_classes = 2
epsilon = 0.9
epsilon_dec_factor = 0.001

In [246]:
# class of prototype vectors
class prototype(object):
    def __init__(self, class_id, p_vector, epsilon):
        self.class_id = class_id
        self.p_vector = p_vector
        self.epsilon = epsilon
    def update(self, u_vector, increment = True):
        if increment:
            # Move the prototype closer to input vector
            self.p_vector = self.p_vector + self.epsilon * (u_vector - self.p_vector)
        else:
            # Move the prototype away from input vector
            self.p_vector = self.p_vector - self.epsilon * (u_vector - self.p_vector)

In [247]:
# function to find the closest prototype vector for a given vector
def find_closest(in_vector, proto_vectors):
    closest = None
    closest_distance = 99999
    for p_v in proto_vectors:
        distance = np.linalg.norm(in_vector - p_v.p_vector)
        if distance < closest_distance:
            closest_distance = distance
            closest = p_v
    return closest

In [248]:
# function to find the second closest prototype vector for a given vector
def find_runnerup(in_vector, proto_vectors):
    runnerup = None
    closest_p_vector = find_closest(in_vector, proto_vectors)
    closest_distance = 99999
    for p_v in proto_vectors:
        distance = np.linalg.norm(in_vector - p_v.p_vector)
        if (distance < closest_distance) and (p_v != closest_p_vector):
            closest_distance = distance
            runnerup = p_v
    return runnerup

In [249]:
def find_class_id(test_vector, p_vectors):
    return find_closest(test_vector, p_vectors).class_id

In [250]:
# Choose R initial prototype for each class
p_vectors = []
for i in np.array([0,1]):
    # select class i
    y_subset = np.where(y_train == i)
    # select tuple for chosen class
    x_subset = x_train[y_subset]
    # get R random indices between 0 and 50
    samples = np.random.randint(low = 0, high=len(x_subset), size=R)
    # select p_vectors, they are chosen randomly from the samples x
    for sample in samples:
        s = x_subset[sample]
        p = prototype(i, s, epsilon)
        p_vectors.append(p)

print("class id \t Initial prototype vector \n")
for p_v in p_vectors:
    print(p_v.class_id, '\t', p_v.p_vector)

class id 	 Initial prototype vector 

0 	 [0.09460952 0.30353261 0.52407766 0.         0.16830196 0.93248102]
0 	 [0.29971739 0.31983696 0.46008099 0.2337963  0.34831798 0.65920895]
0 	 [0.24142245 0.31576087 0.44628532 0.29115226 0.34871846 0.62045545]
0 	 [0.46474461 0.31684783 0.31299169 0.58487654 0.50841009 0.3088294 ]
0 	 [0.43486644 0.25380435 0.28568919 0.55169753 0.49649579 0.31322413]
0 	 [0.14850709 0.19375    0.42604592 0.25771605 0.16319584 0.73951258]
0 	 [0.24168068 0.23125    0.40418069 0.46707819 0.31157389 0.50779065]
0 	 [0.38516272 0.23315217 0.38188965 0.36908436 0.35182219 0.53136237]
0 	 [0.441935   0.37146739 0.3486109  0.62731481 0.62555066 0.21094686]
0 	 [0.40039651 0.11820652 0.24180091 0.61033951 0.27182619 0.39752297]
1 	 [0.43694008 0.29755435 0.35146056 0.65432099 0.57739287 0.20175789]
1 	 [0.4579835  0.31793478 0.31753864 0.64917695 0.55947137 0.22812625]
1 	 [0.46441803 0.33641304 0.30413602 0.58127572 0.59981978 0.24770276]
1 	 [0.55650406 0.55380435

In [251]:
x_train.shape

(996, 6)

In [252]:
while epsilon >= 0.01:
    rnd_i = np.random.randint(0, 996)
    rnd_s = x_train[rnd_i]
    target_y = y_train[rnd_i]
    
    epsilon = epsilon - epsilon_dec_factor
    
    closest_pvector = find_closest(rnd_s, p_vectors)
    second_closest_pvector = find_runnerup(rnd_s, p_vectors)
    compare_distance = np.linalg.norm(closest_pvector.p_vector - rnd_s)/np.linalg.norm(second_closest_pvector.p_vector - rnd_s)
    
    if target_y == second_closest_pvector.class_id and target_y != closest_pvector.class_id and compare_distance > 0.8 and compare_distance < 1.2:
        closest_pvector.update(rnd_s, False)
        second_closest_pvector.update(rnd_s)
    elif target_y == closest_pvector.class_id:
        closest_pvector.update(rnd_s)
    elif target_y != closest_pvector.class_id:
        closest_pvector.update(rnd_s, False)
    closest_pvector.epsilon = epsilon

print("class id \t Final prototype vector \n")
for p_v in p_vectors:
    print(p_v.class_id, '\t', p_v.p_vector)

class id 	 Final prototype vector 

0 	 [0.09605671 0.23795043 0.50089636 0.13362133 0.10551021 0.88465002]
0 	 [0.30073924 0.25530029 0.3560543  0.35555366 0.31781005 0.5815043 ]
0 	 [0.21982839 0.13366955 0.32081567 0.36711577 0.14943721 0.65600846]
0 	 [0.58066335 0.00425398 0.40562979 0.29631658 0.5160567  0.37550337]
0 	 [0.37442819 0.25442868 0.36916942 0.38525937 0.52401444 0.40371759]
0 	 [0.41917488 0.16972832 0.4172243  0.17511398 0.07958497 0.84079309]
0 	 [0.19260346 0.22332123 0.44126724 0.25077292 0.20141653 0.72599821]
0 	 [0.37021221 0.2146425  0.31990137 0.51565874 0.39947827 0.39435767]
0 	 [0.45521743 0.38023634 0.46814754 0.13846157 0.67207675 0.54165506]
0 	 [0.476329   0.22517472 0.17585344 0.6297927  0.48188416 0.25853875]
1 	 [0.39813261 0.02859952 0.8397591  0.88037099 0.18021531 0.26477103]
1 	 [0.82211302 0.63528433 0.58239541 0.58825947 0.91973484 0.01075541]
1 	 [0.33518985 0.4213712  0.49997834 0.73482032 0.81545812 0.0164919 ]
1 	 [0.54761746 0.72922467 0

In [253]:
predicted_y = [find_class_id(instance, p_vectors) for instance in x_test]

from sklearn.metrics import classification_report

print (classification_report(y_test, predicted_y, target_names=['1', '2', '3', '5', '6', '7']))

             precision    recall  f1-score   support

          1       0.88      0.79      0.83       162
          2       0.82      0.89      0.85       170

avg / total       0.85      0.84      0.84       332



  .format(len(labels), len(target_names))


In [254]:
predicted_y-y_test

array([ 0,  1, -1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
        0,  0,  0,  1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,
        0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
       -1,  0,  0,  0, -1,  0,  0,  1,  0,  0,  0,  1,  0,  0,  0,  0, -1,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  0,  0,  0,  0,  1,  0,
        0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, -1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,
        0, -1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
        1,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0, -1,  1,  0, -1,  1,  0,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  1,  1,  0

In [255]:
A = np.array(predicted_y)-y_test
len(A[A==0])/len(A)

0.8433734939759037