In [53]:
from sklearn.datasets import load_breast_cancer
import numpy as np
from sklearn.metrics import euclidean_distances
import pandas as pd

In [54]:
data = pd.read_csv(r"glass.csv")
x = data.iloc[:, 1:10].values
y = data.iloc[:, 10].values

In [55]:
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64)

In [56]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [57]:
# use Minmaxscaler because we use euclidean distance
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
x_train = minmax.fit_transform(x_train)
x_test = minmax.transform(x_test)

In [58]:
# parameter for LVQ
R = 5 # R is the # of initial prototype for each class
n_classes = 7
epsilon = 0.9
epsilon_dec_factor = 0.001

In [59]:
# class of prototype vectors
class prototype(object):
    def __init__(self, class_id, p_vector, epsilon):
        self.class_id = class_id
        self.p_vector = p_vector
        self.epsilon = epsilon
    def update(self, u_vector, increment = True):
        if increment:
            # Move the prototype closer to input vector
            self.p_vector = self.p_vector + self.epsilon * (u_vector - self.p_vector)
        else:
            # Move the prototype away from input vector
            self.p_vector = self.p_vector - self.epsilon * (u_vector - self.p_vector)

In [60]:
# function to find the closest prototype vector for a given vector
def find_closest(in_vector, proto_vectors):
    closest = None
    closest_distance = 99999
    for p_v in proto_vectors:
        distance = np.linalg.norm(in_vector - p_v.p_vector)
        if distance < closest_distance:
            closest_distance = distance
            closest = p_v
    return closest

In [61]:
# function to find the second closest prototype vector for a given vector
def find_runnerup(in_vector, proto_vectors):
    runnerup = None
    closest_p_vector = find_closest(in_vector, proto_vectors)
    closest_distance = 99999
    for p_v in proto_vectors:
        distance = np.linalg.norm(in_vector - p_v.p_vector)
        if (distance < closest_distance) and (p_v != closest_p_vector):
            closest_distance = distance
            runnerup = p_v
    return runnerup

In [62]:
def find_class_id(test_vector, p_vectors):
    return find_closest(test_vector, p_vectors).class_id

In [63]:
# Choose R initial prototype for each class
p_vectors = []
for i in np.array([1, 2, 3, 5, 6, 7]):
    # select class i
    y_subset = np.where(y_train == i)
    # select tuple for chosen class
    x_subset = x_train[y_subset]
    # get R random indices between 0 and 50
    samples = np.random.randint(0, len(x_subset), R)
    # select p_vectors, they are chosen randomly from the samples x
    for sample in samples:
        s = x_subset[sample]
        p = prototype(i, s, epsilon)
        p_vectors.append(p)

print("class id \t Initial prototype vector \n")
for p_v in p_vectors:
    print(p_v.class_id, '\t', p_v.p_vector)

class id 	 Initial prototype vector 

1 	 [0.48068481 0.42610063 0.85523385 0.13395639 0.33876812 0.0273752
 0.40055762 0.         0.        ]
1 	 [0.48200176 0.50157233 0.85077951 0.05607477 0.34057971 0.01771337
 0.38475836 0.         0.        ]
1 	 [0.45522388 0.31918239 0.81291759 0.18068536 0.42210145 0.03059581
 0.41078067 0.         0.33333333]
1 	 [0.28050922 0.37264151 0.81514477 0.28037383 0.52536232 0.09178744
 0.26394052 0.         0.21568627]
1 	 [0.37357331 0.58962264 0.83518931 0.         0.38586957 0.00483092
 0.33178439 0.         0.        ]
2 	 [0.30640913 0.37893082 0.88641425 0.27725857 0.47101449 0.09339775
 0.2527881  0.         0.        ]
2 	 [0.46619842 0.42924528 0.22494432 0.33333333 0.41666667 0.0531401
 0.53066914 0.         0.        ]
2 	 [0.24539069 0.27830189 0.78396437 0.3894081  0.62862319 0.10628019
 0.2295539  0.         0.        ]
2 	 [0.32001756 0.35062893 0.83741648 0.32087227 0.45471014 0.09339775
 0.27788104 0.         0.        ]
2 	 [0.233

In [64]:
while epsilon >= 0.01:
    rnd_i = np.random.randint(0, 150)
    rnd_s = x_train[rnd_i]
    target_y = y_train[rnd_i]
    
    epsilon = epsilon - epsilon_dec_factor
    
    closest_pvector = find_closest(rnd_s, p_vectors)
    second_closest_pvector = find_runnerup(rnd_s, p_vectors)
    compare_distance = np.linalg.norm(closest_pvector.p_vector - rnd_s)/np.linalg.norm(second_closest_pvector.p_vector - rnd_s)
    
    if target_y == second_closest_pvector.class_id and target_y != closest_pvector.class_id and compare_distance > 0.8 and compare_distance < 1.2:
        closest_pvector.update(rnd_s, False)
        second_closest_pvector.update(rnd_s)
    elif target_y == closest_pvector.class_id:
        closest_pvector.update(rnd_s)
    closest_pvector.epsilon = epsilon

print("class id \t Final prototype vector \n")
for p_v in p_vectors:
    print(p_v.class_id, '\t', p_v.p_vector)

  # Remove the CWD from sys.path while we load stuff.


class id 	 Final prototype vector 

1 	 [2.85857339e-01 3.09520565e-01 7.75329892e-01 3.01213847e-01
 5.57796899e-01 9.29017848e-02 2.93116492e-01 0.00000000e+00
 1.28663082e-04]
1 	 [0.37850727 0.43866089 0.79764185 0.32073203 0.33152397 0.01931039
 0.30113904 0.23916842 0.        ]
1 	 [0.48098402 0.37005325 0.82427042 0.14662334 0.37814133 0.02789686
 0.41084434 0.         0.29965149]
1 	 [0.26099274 0.30704563 0.7753789  0.29097212 0.55130381 0.09407563
 0.29425782 0.00206716 0.3602079 ]
1 	 [0.45665148 0.43066398 0.88789    0.18627494 0.35325206 0.01601373
 0.36091723 0.         0.01284367]
2 	 [0.25014638 0.34074711 0.79764245 0.37541104 0.52795389 0.0968307
 0.24299911 0.         0.        ]
2 	 [8.01887016e-01 1.58615861e-01 7.14485087e-09 2.60150487e-01
 2.88513079e-01 3.36603354e-02 8.84156909e-01 0.00000000e+00
 3.37023280e-01]
2 	 [0.26375873 0.25984901 0.675071   0.40982414 0.57145484 0.10851119
 0.29648107 0.01626402 0.        ]
2 	 [0.50728558 0.32157914 0.84630388 0.259

In [65]:
predicted_y = [find_class_id(instance, p_vectors) for instance in x_test]

from sklearn.metrics import classification_report

print (classification_report(y_test, predicted_y, target_names=['malignant', 'benign']))

             precision    recall  f1-score   support

  malignant       0.52      0.93      0.67        14
     benign       0.85      0.46      0.59        24

avg / total       0.73      0.67      0.65        54



  .format(len(labels), len(target_names))
