#### K-Means Clustering Algorithm for Palmer Archipelago penguin data
- Data from: https://www.kaggle.com/datasets/parulpandey/palmer-archipelago-antarctica-penguin-data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# loading the dataset
data = pd.read_csv('data/penguins_size.csv')
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
# data preprocessing
data = data.dropna()
data = data[data['sex'] != '.']

cleanup_nums = {"species": {"Adelie": 0, "Chinstrap": 1, "Gentoo": 2},
                "island": {"Biscoe": 0, "Dream": 1, "Torgersen": 2},
                "sex": {"MALE": 0.0, "FEMALE": 1.0}}
data = data.replace(cleanup_nums)

data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,0.0
1,0,2,39.5,17.4,186.0,3800.0,1.0
2,0,2,40.3,18.0,195.0,3250.0,1.0
4,0,2,36.7,19.3,193.0,3450.0,1.0
5,0,2,39.3,20.6,190.0,3650.0,0.0


In [4]:
x = np.array(data.drop(['species'], axis=1).copy())
y = np.array(data['species'].copy()).astype(int)

In [5]:
# data standardization

# data standardization x=(x-mean)/standard deviation
x = (x-x.mean(axis=0)) / x.std(axis=0)

print("Mean: ", x.mean()) # mean should be 0
print("Std. Deviation: ", x.std()) # std. deviation should be 1

Mean:  1.56475877744967e-16
Std. Deviation:  1.0


In [6]:
# split the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((266, 6), (67, 6), (266,), (67,))

In [7]:
# calculate the confusion matrix
def evaluator(y, y_pred):
    # confusion matrix with sklearn
    from sklearn.metrics import confusion_matrix
    
    confusion_matrix = confusion_matrix(y, y_pred)
    print('Confusion matrix:\n', confusion_matrix)
    
    
    # alternative confusion matrix with NumPy (uncomment to use this instead)
    #confusion_matrix2 = np.zeros(( len(np.unique(y)), len(np.unique(y)) ))
    #for i in range(len(y)):
    #    confusion_matrix2[y[i]][y_pred[i]] += 1
        
    #print('Confusion matrix (numpy):\n', confusion_matrix2)

In [8]:
# building a baseline K-means model with Sci-kit learn
from sklearn.cluster import KMeans
km = KMeans(n_clusters=3) # n_clusters - the number of clusters
km.fit(x_train)
y_pred = km.predict(x_train)
evaluator(y_train, y_pred)
y_pred = km.predict(x_test)
evaluator(y_test, y_pred)

#print("-----")
#print('Labels: \n', km.labels_)
#print('Centers: \n', km.cluster_centers_)

Confusion matrix:
 [[ 60   0  47]
 [ 32   0  26]
 [  0 101   0]]
Confusion matrix:
 [[13  0 26]
 [ 2  0  8]
 [ 0 18  0]]


In [9]:
# building K-means model with NumPy
class KMeans(object):
    def __init__(self, k):
        self.k = k
        
    def train(self, x, y, x_test, y_test, learning_rate, n_iters):
        # initialize the centers
        n = np.arange(x.shape[0])
        centers = x[np.random.choice(n,self.k,replace=False)] 

        # initialize the variable for the closest cluster
        closest_cluster = np.zeros(1)
        
        while True:
            closest_cluster_old = np.copy(closest_cluster)

            # update clusters
            distances = np.sum(np.sqrt((x-centers[:,np.newaxis])**2),axis=-1)
            closest_cluster = np.argmin(distances, axis=0)

            # update centers
            for i in range(self.k):
                if np.any(closest_cluster == i):
                    centers[i] = np.mean(x[closest_cluster==i],axis=0)

            # break out of while loop if clusters converge
            if all(closest_cluster_old == closest_cluster):
                break
        
        return closest_cluster, centers

    def predict(self, x):
        # initialize the centers
        n = np.arange(x.shape[0])
        centers = x[np.random.choice(n,self.k,replace=False)]
        # initialize the variable for the closest cluster
        closest_cluster = np.zeros(1)
        while True:
            closest_cluster_old = np.copy(closest_cluster)
            # update clusters
            distances = np.sum(np.sqrt((x-centers[:,np.newaxis])**2),axis=-1)
            closest_cluster = np.argmin(distances, axis=0)
            # update centers
            for i in range(self.k):
                if np.any(closest_cluster == i):
                    centers[i] = np.mean(x[closest_cluster==i],axis=0)
            # break out of while loop if clusters converge
            if all(closest_cluster_old == closest_cluster):
                break
        return closest_cluster

In [10]:
# initialize and train the model

kmeans1 = KMeans(3) # set number of clusters to 3

closest_cluster,centers = kmeans1.train(x_train, y_train, x_test, y_test, 1, 1)

#print('Labels: \n', closest_cluster)
#print('Centers: \n', centers)

In [11]:
# evaluate the model and print the confusion matrixes for both training and test sets

print('Labels: \n', closest_cluster)
print('Centers: \n', centers)
print("-----")

y_pred = kmeans1.predict(x_train)
evaluator(y_train, y_pred)
y_pred = kmeans1.predict(x_test)
evaluator(y_test, y_pred)

Labels: 
 [0 2 0 0 2 0 1 2 0 2 0 1 0 2 0 0 0 2 0 2 2 2 0 0 1 0 1 0 2 0 1 0 0 1 1 2 0
 2 2 2 2 2 1 0 0 1 2 2 0 2 2 0 0 0 1 1 2 0 1 2 0 1 1 0 1 0 2 1 1 0 0 2 2 1
 0 1 0 1 1 0 1 1 2 0 0 1 1 2 0 2 2 0 2 0 1 2 1 1 1 0 1 1 1 2 1 1 1 1 0 0 0
 0 1 0 2 1 0 2 0 1 1 1 1 1 2 0 0 0 1 0 0 2 0 1 1 2 2 2 0 2 2 0 0 2 1 1 1 2
 1 0 0 0 0 1 0 2 0 0 2 2 2 0 1 1 2 2 0 0 1 2 0 2 2 0 2 0 1 0 1 1 2 1 1 2 0
 1 0 0 1 1 2 2 0 1 1 1 1 0 0 0 1 2 1 0 0 1 1 0 1 1 0 0 2 1 0 0 1 1 0 2 1 1
 2 2 0 2 1 2 1 1 0 0 0 1 0 0 1 1 1 0 1 0 2 2 1 0 2 0 2 1 0 2 2 1 0 0 0 1 1
 2 0 0 2 1 1 2]
Centers: 
 [[-0.91313599  0.6490015  -1.12466359  1.14564997  1.09337705  0.01891077]
 [ 0.54905251  0.02580425  0.99742949 -0.4170529  -0.26936113 -0.9692912 ]
 [ 0.54571418 -0.59349274  0.24359278 -0.84042413 -0.96375576  0.98165159]]
-----
Confusion matrix:
 [[106   1   0]
 [ 57   0   1]
 [  0  51  50]]
Confusion matrix:
 [[26 13  0]
 [ 8  2  0]
 [ 0  0 18]]
