In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score, accuracy_score, confusion_matrix

%matplotlib inline

In [None]:
# Tell NumPy to stay silent on floating-point "divide by zero" and "invalid
# operation" (e.g. 0/0) conditions instead of emitting warnings.
# NOTE(review): this presumably hides the warnings produced when k_means
# averages an empty cluster — confirm that masking them is intended.
np.seterr(divide='ignore', invalid='ignore')

In [None]:
# Load the Iris dataset from Iris.csv in the current working directory.
iris_data = pd.read_csv('./Iris.csv')

In [None]:
# Preview the first three rows of the loaded frame.
iris_data.head(3)

In [None]:
# Feature frame: the four sepal/petal measurement columns used for clustering.
X = iris_data[['SepalLengthCm','SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]

In [None]:
# Target column, kept as a single-column DataFrame. The explicit .copy() makes
# y_target an independent frame, so overwriting its values in a later cell does
# not raise SettingWithCopyWarning.
y_target = iris_data[['Species']].copy()

In [None]:
# Distinct species names found in the data.
iris_species = iris_data['Species'].unique()

# Encode every species name as its position within iris_species (0, 1, 2, ...).
species_dict = {name: code for code, name in enumerate(iris_species)}

In [None]:
# Replace each species name with its integer code from species_dict.
# The encoded frame is rebuilt from iris_data in one vectorised step instead of
# assigning into y_target through a boolean mask: this keeps the cell
# idempotent on re-run, avoids SettingWithCopyWarning, and produces an integer
# column rather than a string/object column with ints written into it.
y_target = iris_data['Species'].map(species_dict).to_frame()

In [None]:
# Pull the encoded species column out as a NumPy array, then coerce it to a
# 64-bit integer array for the metric calls further down.
y_target_species = y_target['Species'].values
y_target_format = y_target_species.astype(np.int64)

In [None]:
def k_means(X, K, max_iter=300, random_state=None):
    """Cluster the rows of ``X`` into ``K`` groups with the k-means algorithm.

    Starting from ``K`` distinct randomly chosen rows as centroids, the
    function alternates between assigning every row to its nearest centroid
    (Euclidean distance) and moving each centroid to the mean of its rows,
    until the centroids stop changing or ``max_iter`` rounds have run.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data; converted to a float ndarray internally.
    K : int
        Number of clusters; must satisfy ``1 <= K <= n_samples``.
    max_iter : int, default 300
        Upper bound on the number of assignment/update rounds. Guards against
        a loop that never reaches the exact-equality stopping condition.
    random_state : int or None, default None
        Seed for the centroid initialisation. ``None`` draws from NumPy's
        global random state.

    Returns
    -------
    centroids : ndarray of shape (K, n_features)
        Final centroids.
    labels : ndarray of shape (n_samples,)
        Index of the centroid each row was assigned to in the last round.
    centroids_history : list of ndarray
        Initial centroids followed by the centroids computed in every round.
    labels_history : list of ndarray
        Labels computed in every round.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, ``K`` is outside ``[1, n_samples]`` or
        ``max_iter`` is smaller than 1.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples = X.shape[0]
    if not 1 <= K <= n_samples:
        raise ValueError(f"K must be between 1 and n_samples={n_samples}, got {K}")
    if max_iter < 1:
        raise ValueError(f"max_iter must be at least 1, got {max_iter}")

    # Sample WITHOUT replacement: picking the same row twice would create two
    # identical centroids, one of which could never attract any point.
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    rand_index = rng.choice(n_samples, K, replace=False)
    centroids = X[rand_index]

    # Keep track of history so you can see k-means in action.
    centroids_history = [centroids]
    labels_history = []

    for _ in range(max_iter):
        # Euclidean distance from every row to every centroid; argmin gives the
        # index of the closest centroid, i.e. the cluster the row belongs to.
        labels = np.argmin(cdist(X, centroids), axis=1)
        labels_history.append(labels)

        # Move each centroid to the mean of its members. A cluster with no
        # members keeps its previous centroid: averaging an empty selection
        # would yield NaN, and NaN never compares equal, so the stopping test
        # below could never succeed.
        new_centroids = centroids.copy()
        for i in range(K):
            members = X[labels == i]
            if len(members) > 0:
                new_centroids[i] = members.mean(axis=0)
        centroids_history.append(new_centroids)

        # Stop once an update leaves every centroid unchanged.
        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids

    return centroids, labels, centroids_history, labels_history

In [None]:
# Sanity check: show the container type of the encoded targets.
type(y_target_format)

In [None]:
# Display the integer-encoded species labels.
y_target_format

In [None]:
# NumPy array of the feature columns (k_means indexes rows positionally).
X_mat = X.values
# Number of clusters requested from k_means.
K_clusters = 3

In [None]:
# Run k-means once; besides the final centroids and labels this also returns
# the per-iteration centroid and label histories.
centroids, labels, centroids_history, labels_history = k_means(X_mat, K_clusters)

In [None]:
# Cluster index assigned to each row by the run above.
labels

In [None]:
def get_confusion_matrix(y_target_in, labels_in):
    """Compute, plot and return the confusion matrix of predictions vs. targets.

    Parameters
    ----------
    y_target_in : array-like
        True class codes, one per sample.
    labels_in : array-like
        Predicted codes, one per sample.

    Returns
    -------
    pandas.DataFrame
        Confusion matrix with true classes as rows and predicted classes as
        columns; both axes are named after the module-level ``iris_species``.

    Notes
    -----
    Reads the global ``iris_species`` for the row/column names and shows a
    heatmap of the matrix as a side effect.

    NOTE(review): the names are attached purely by position. They are only
    meaningful if the codes in both inputs follow the order of
    ``iris_species`` — confirm this for cluster indices coming from k_means.
    """
    conf_mat = confusion_matrix(y_target_in, labels_in)
    conf_mat = pd.DataFrame(conf_mat, index=iris_species, columns=iris_species)

    # Explicit figure/axes so ticks and the colour scale can be attached to
    # this plot; without them the heatmap cannot be read on its own.
    fig, ax = plt.subplots(figsize=(5.5, 4))
    image = ax.imshow(conf_mat.values, cmap='hot', interpolation='nearest')
    ax.set_title('Confusion Matrix for Iris Data')
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

    # One tick per class, labelled with the DataFrame's row/column names.
    tick_positions = list(range(len(conf_mat)))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(conf_mat.columns, rotation=45, ha='right')
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(conf_mat.index)

    # Colour scale so cell colours can be translated back into counts.
    fig.colorbar(image, ax=ax)
    plt.show()

    return conf_mat

In [None]:
# Fraction of rows whose k-means cluster index equals the encoded species code.
# NOTE(review): k_means numbers its clusters by the order of the randomly
# chosen initial centroids, so a cluster index need not equal the species code
# of the flowers it contains — this score presumably varies with that
# arbitrary numbering; confirm whether clusters should be matched to species
# before scoring.
calculated_accuracy_score = accuracy_score(y_target_format, labels)
print(calculated_accuracy_score)

In [None]:
# Plot the confusion matrix for the single run above, then show it as a table.
conf_mat = get_confusion_matrix(y_target_format, labels)
conf_mat

### Running the algorithm 55 times

In [None]:
#y_target_format = np.array(list(y_target_species), dtype=np.int64)

In [None]:
#y_target_format

In [None]:
# Repeat k-means from fresh random initial centroids and record every run's
# accuracy and final centroids, so the best-scoring run can be selected later.
N_RUNS = 55  # number of independent k-means runs

accuracy_dict = {}        # run number (1-based) -> accuracy and centroids of that run
accuracy_score_list = []  # accuracies in run order, used for plotting

for i in range(1, N_RUNS + 1):
    print(f'Iteration count: {i}')
    centroids, labels, centroids_history, labels_history = k_means(X_mat, K_clusters)

    calculated_accuracy_score = accuracy_score(y_target_format, labels)
    dict_data = {
        'accuracy_score_calculated': calculated_accuracy_score,
        'centroids': centroids
    }
    accuracy_dict[i] = dict_data
    accuracy_score_list.append(calculated_accuracy_score)

In [None]:
#accuracy_dict

In [None]:
# Highest accuracy reached by any of the recorded runs.
max_accuracy_score = max(
    run['accuracy_score_calculated'] for run in accuracy_dict.values()
)

In [None]:
# Display the best accuracy found across the runs.
max_accuracy_score

In [None]:
# Run numbers (keys of accuracy_dict) whose accuracy equals the maximum;
# ties produce several entries.
k = [
    run_id
    for run_id, run in accuracy_dict.items()
    if run['accuracy_score_calculated'] == max_accuracy_score
]

In [None]:
# Final centroids of the first run that reached the best accuracy.
best_centroid = accuracy_dict[k[0]]['centroids']

In [None]:
# Display the centroids of the best-scoring run.
best_centroid

In [None]:
# Scatter every run's accuracy against its run number. Runs are numbered from
# 1 so the x axis matches the keys of accuracy_dict (there is no run 0).
x_data = range(1, len(accuracy_score_list) + 1)
plt.scatter(x_data, accuracy_score_list, alpha=0.5)
plt.title('Accuracy over the number of runs')
plt.xlabel('Number of Runs')
plt.ylabel('Accuracy')
plt.show()