In [1]:
from sklearn.datasets import load_breast_cancer
import numpy as np
from sklearn.metrics import euclidean_distances

In [2]:
data = load_breast_cancer()
x = data['data']
y = data['target']

In [3]:
len(x)

569

In [None]:
from sklearn.cross_validation import train

In [55]:
# use Minmaxscaler because we use euclidean distance
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
x = minmax.fit_transform(x)

In [56]:
# parameter for LVQ
R = 6 # R is the # of initial prototype for each class
n_classes = 3
epsilon = 0.9
epsilon_dec_factor = 0.001

In [57]:
# class of prototype vectors
class prototype(object):
    def __init__(self, class_id, p_vector, epsilon):
        self.class_id = class_id
        self.p_vector = p_vector
        self.epsilon = epsilon
    def update(self, u_vector, increment = True):
        if increment:
            # Move the prototype closer to input vector
            self.p_vector = self.p_vector + self.epsilon * (u_vector - self.p_vector)
        else:
            # Move the prototype away from input vector
            self.p_vector = self.p_vector - self.epsilon * (u_vector - self.p_vector)

In [58]:
# function to find the closest prototype vector for a given vector
def find_closest(in_vector, proto_vectors):
    closest = None
    closest_distance = 99999
    for p_v in proto_vectors:
        distance = np.linalg.norm(in_vector - p_v.p_vector)
        if distance < closest_distance:
            closest_distance = distance
            closest = p_v
    return closest

In [59]:
def find_class_id(test_vector, p_vectors):
    return find_closest(test_vector, p_vectors).class_id

In [60]:
# Choose R initial prototype for each class
p_vectors = []
for i in range(n_classes):
    # select class i
    y_subset = np.where(y == i)
    # select tuple for chosen class
    x_subset = x[y_subset]
    # get R random indices between 0 and 50
    samples = np.random.randint(0, len(x_subset), R)
    # select p_vectors, they are chosen randomly from the samples x
    for sample in samples:
        s = x_subset[sample]
        p = prototype(i, s, epsilon)
        p_vectors.append(p)
print("class id \t Initial prototype vector \n")
for p_v in p_vectors:
    print(p_v.class_id, '\t', p_v.p_vector)

class id 	 Initial prototype vector 

0 	 [0.22222222 0.75       0.15254237 0.125     ]
0 	 [0.13888889 0.45833333 0.10169492 0.04166667]
0 	 [0.22222222 0.54166667 0.11864407 0.16666667]
0 	 [0.25       0.875      0.08474576 0.        ]
0 	 [0.22222222 0.625      0.06779661 0.08333333]
0 	 [0.33333333 0.91666667 0.06779661 0.04166667]
1 	 [0.25       0.29166667 0.49152542 0.54166667]
1 	 [0.63888889 0.41666667 0.57627119 0.54166667]
1 	 [0.5        0.33333333 0.62711864 0.45833333]
1 	 [0.55555556 0.125      0.57627119 0.5       ]
1 	 [0.38888889 0.25       0.42372881 0.375     ]
1 	 [0.33333333 0.16666667 0.47457627 0.41666667]
2 	 [1.         0.75       0.91525424 0.79166667]
2 	 [1.         0.75       0.91525424 0.79166667]
2 	 [0.72222222 0.45833333 0.74576271 0.83333333]
2 	 [0.66666667 0.20833333 0.81355932 0.70833333]
2 	 [0.55555556 0.33333333 0.69491525 0.58333333]
2 	 [0.58333333 0.45833333 0.76271186 0.70833333]


In [61]:
while epsilon >= 0.01:
    rnd_i = np.random.randint(0, 149)
    rnd_s = x[rnd_i]
    target_y = y[rnd_i]
    
    epsilon = epsilon - epsilon_dec_factor
    
    closest_pvector = find_closest(rnd_s, p_vectors)
    
    if target_y == closest_pvector.class_id:
        closest_pvector.update(rnd_s)
    else:
        closest_pvector.update(rnd_s, False)
    closest_pvector.epsilon = epsilon

print("class id \t Final prototype vector \n")
for p_v in p_vectors:
    print(p_v.class_id, '\t', p_v.p_vector)

class id 	 Final prototype vector 

0 	 [0.16649692 0.61065163 0.07063595 0.05133676]
0 	 [0.05549663 0.12562153 0.05088467 0.08323234]
0 	 [0.25893174 0.5978987  0.09055977 0.11513863]
0 	 [0.3432007  0.89192764 0.06447529 0.04155285]
0 	 [0.11003612 0.44121483 0.07283661 0.03611766]
0 	 [0.27646304 0.74143723 0.09950247 0.08549193]
1 	 [0.42048382 0.34734102 0.53319468 0.50366652]
1 	 [0.52110326 0.54276528 0.61084272 0.6259976 ]
1 	 [0.67572055 0.42031504 0.63069703 0.57630459]
1 	 [0.54861142 0.11787067 0.5528514  0.50924518]
1 	 [0.20573058 0.14090209 0.37361206 0.39298281]
1 	 [0.36385305 0.20770709 0.49459447 0.43242702]
2 	 [0.8067022  0.66735017 0.86523405 0.99896353]
2 	 [0.9676283  0.75       0.94488258 0.84022422]
2 	 [0.53768516 0.36226556 0.71154466 0.77401344]
2 	 [0.83233174 0.38928811 0.86702691 0.73662531]
2 	 [0.33480842 0.22883409 0.65843117 0.7573677 ]
2 	 [0.66142564 0.48897114 0.77188966 0.87701589]


In [62]:
predicted_y = [find_class_id(instance, p_vectors) for instance in x]

from sklearn.metrics import classification_report

print (classification_report(y, predicted_y, target_names=['Iris-Setosa','Iris-Versicolour', 'Iris-Virginica']))

                  precision    recall  f1-score   support

     Iris-Setosa       1.00      1.00      1.00        50
Iris-Versicolour       0.94      0.98      0.96        50
  Iris-Virginica       0.98      0.94      0.96        50

     avg / total       0.97      0.97      0.97       150

