In [29]:
from sklearn.datasets import load_breast_cancer
import numpy as np
from sklearn.metrics import euclidean_distances
import pandas as pd

In [30]:
data = pd.read_csv(r"data/SD-2X_rocktype.csv")
x = data.iloc[:, 0:6].values
y = data.iloc[:, 6].values

In [31]:
np.unique(y)

array([1, 2, 3, 4], dtype=int64)

In [32]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

In [33]:
# use Minmaxscaler because we use euclidean distance
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
x_train = minmax.fit_transform(x_train)
x_test = minmax.transform(x_test)

In [34]:
# parameter for LVQ
R = 5 # R is the # of initial prototype for each class
n_classes = 4
epsilon = 0.9
epsilon_dec_factor = 0.001

In [35]:
# class of prototype vectors
class prototype(object):
    def __init__(self, class_id, p_vector, epsilon):
        self.class_id = class_id
        self.p_vector = p_vector
        self.epsilon = epsilon
    def update(self, u_vector, increment = True):
        if increment:
            # Move the prototype closer to input vector
            self.p_vector = self.p_vector + self.epsilon * (u_vector - self.p_vector)
        else:
            # Move the prototype away from input vector
            self.p_vector = self.p_vector - self.epsilon * (u_vector - self.p_vector)

In [36]:
# function to find the closest prototype vector for a given vector
def find_closest(in_vector, proto_vectors):
    closest = None
    closest_distance = 99999
    for p_v in proto_vectors:
        distance = np.linalg.norm(in_vector - p_v.p_vector)
        if distance < closest_distance:
            closest_distance = distance
            closest = p_v
    return closest

In [37]:
# function to find the second closest prototype vector for a given vector
def find_runnerup(in_vector, proto_vectors):
    runnerup = None
    closest_p_vector = find_closest(in_vector, proto_vectors)
    closest_distance = 99999
    for p_v in proto_vectors:
        distance = np.linalg.norm(in_vector - p_v.p_vector)
        if (distance < closest_distance) and (p_v != closest_p_vector):
            closest_distance = distance
            runnerup = p_v
    return runnerup

In [38]:
def find_class_id(test_vector, p_vectors):
    return find_closest(test_vector, p_vectors).class_id

In [39]:
# Choose R initial prototype for each class
p_vectors = []
for i in np.array([1, 2, 3, 4]):
    # select class i
    y_subset = np.where(y_train == i)
    # select tuple for chosen class
    x_subset = x_train[y_subset]
    # get R random indices between 0 and 50
    samples = np.random.randint(0, len(x_subset), R)
    # select p_vectors, they are chosen randomly from the samples x
    for sample in samples:
        s = x_subset[sample]
        p = prototype(i, s, epsilon)
        p_vectors.append(p)

print("class id \t Initial prototype vector \n")
for p_v in p_vectors:
    print(p_v.class_id, '\t', p_v.p_vector)

class id 	 Initial prototype vector 

1 	 [0.30481193 0.70631255 0.11583125 0.82602425 0.1009     0.90651475]
1 	 [0.1999412  0.44399538 0.19491401 0.74364257 0.131      0.80046743]
1 	 [0.15746785 0.47671286 0.13949728 0.74389275 0.0766     0.88664914]
1 	 [0.22927071 0.42282525 0.25782743 0.71690311 0.1732     0.70990359]
1 	 [0.20592737 0.41397229 0.18153756 0.75178067 0.1393     0.81390593]
2 	 [0.41366713 0.55966128 0.37409966 0.77145632 0.2956     0.52673094]
2 	 [0.4106153  0.66301001 0.37483463 0.74604132 0.289      0.85422144]
2 	 [0.39449925 0.676097   0.33823313 0.69899341 0.2554     0.90885188]
2 	 [0.4123631  0.69053118 0.27767162 0.78378856 0.2927     0.90534619]
2 	 [0.4106153  0.66301001 0.37483463 0.74604132 0.289      0.85422144]
3 	 [0.42687334 0.71343341 0.29295899 0.83079232 0.3249     0.61378907]
3 	 [0.44259017 0.5504234  0.51903572 0.82602425 0.3618     0.32865907]
3 	 [0.3278468  0.55292533 0.31206821 0.73847716 0.3476     0.58282209]
3 	 [0.40520821 0.617398  

In [40]:
while epsilon >= 0.01:
    rnd_i = np.random.randint(0, 150)
    rnd_s = x_train[rnd_i]
    target_y = y_train[rnd_i]
    
    epsilon = epsilon - epsilon_dec_factor
    
    closest_pvector = find_closest(rnd_s, p_vectors)
    second_closest_pvector = find_runnerup(rnd_s, p_vectors)
    compare_distance = np.linalg.norm(closest_pvector.p_vector - rnd_s)/np.linalg.norm(second_closest_pvector.p_vector - rnd_s)
    
    if target_y == second_closest_pvector.class_id and target_y != closest_pvector.class_id and compare_distance > 0.8 and compare_distance < 1.2:
        closest_pvector.update(rnd_s, False)
        second_closest_pvector.update(rnd_s)
    elif target_y == closest_pvector.class_id:
        closest_pvector.update(rnd_s)
    closest_pvector.epsilon = epsilon

print("class id \t Final prototype vector \n")
for p_v in p_vectors:
    print(p_v.class_id, '\t', p_v.p_vector)

class id 	 Final prototype vector 

1 	 [0.35983582 0.91604025 0.04988949 0.95494817 0.18972713 0.95464563]
1 	 [0.32592883 0.41457177 0.47279003 0.59952234 0.18818411 0.64145855]
1 	 [0.12880312 0.50382559 0.09910976 0.7456629  0.04412028 0.94681108]
1 	 [0.15846758 0.41102393 0.2283468  0.66825962 0.07863287 0.7789732 ]
1 	 [0.32972049 0.59295141 0.24197829 0.82171895 0.16091045 0.74658668]
2 	 [0.5011512  0.79804153 0.68962281 0.78998978 0.24819947 0.91116092]
2 	 [0.39446158 0.5470057  0.5662295  0.63941518 0.25536756 0.69782337]
2 	 [0.39449925 0.676097   0.33823313 0.69899341 0.2554     0.90885188]
2 	 [0.4123631  0.69053118 0.27767162 0.78378856 0.2927     0.90534619]
2 	 [0.31732366 0.50788292 0.42000772 0.74162562 0.23110231 0.59575458]
3 	 [0.39469807 0.6172363  0.2891472  0.82629977 0.41902941 0.60019672]
3 	 [0.67622634 0.79349509 0.49042523 0.89124804 0.42212248 0.43526476]
3 	 [0.42084861 0.57016652 0.451211   0.7014476  0.46026933 0.39902276]
3 	 [0.15215435 0.2843335  0

In [41]:
predicted_y = [find_class_id(instance, p_vectors) for instance in x_test]

from sklearn.metrics import classification_report

print (classification_report(y_test, predicted_y, target_names=['1', '2', '3', '4']))

             precision    recall  f1-score   support

          1       0.76      0.81      0.79        16
          2       0.33      0.18      0.24        11
          3       0.80      0.89      0.84        45
          4       0.98      0.97      0.97        60

avg / total       0.84      0.86      0.85       132

