# Clustering Metrics:

**Author:** Matt Q.

## Import the necessary libraries:

In [1]:
import os
# Report the kernel's current working directory.
print("Working dir:", os.getcwd())

Working dir: c:\Users\miqui\OneDrive\Python-Projects\Thieu Metrics Collaboration\permetrics\examples\clustering


In [2]:
import numpy as np
import pandas as pd

# Dataset locations. The defaults are the original local paths; set the
# IRIS_CSV / SAMPLE_CSV environment variables to run this notebook on another
# machine without editing the cell. (`os` is imported in the first cell.)
IRIS_CSV = os.environ.get(
    "IRIS_CSV",
    "C:/Users/miqui/OneDrive/ML-DL-Datasets/IrisData.csv",
)
SAMPLE_CSV = os.environ.get(
    "SAMPLE_CSV",
    "C:/Users/miqui/OneDrive/CSU Classes/Exit Project/Clustering Project/SampleData.csv",
)

iris = pd.read_csv(IRIS_CSV)
dfSample = pd.read_csv(SAMPLE_CSV)

In [75]:
# Preparing the Iris dataset:
# Encode the species names as zero-based integer class labels (0, 1, 2).
my_dict = {"Iris-setosa": 0,
           "Iris-versicolor": 1,
           "Iris-virginica": 2}
iris["Labels"] = iris["Species"].map(my_dict)
X_iris = iris.iloc[:, 0:4]
# Select the label column by the name assigned above rather than by position,
# so the lookup does not depend on how many columns the CSV happens to have.
labels = iris["Labels"]

del my_dict

# The mapping above already yields 0, 1, 2, so no shift is applied here
# (subtracting 1 again would turn the labels into -1, 0, 1).
labels2 = np.array(labels)


# Preparing the sample dataset:
X_sample = dfSample[["V1", "V2", "V3"]]
X_array = np.array(X_sample)
# y_sample / y_labels stay single-column DataFrames; y_array is the flat vector.
y_sample = dfSample[["Labels"]]
# Shift the stored labels down by one (presumably 1-based in the CSV — confirm).
y_labels = y_sample - 1
y_array = np.array(y_labels).ravel()

**Calculating Centroids per Cluster:**

In [4]:
def get_centroids(X, labels):
    """
    Calculates the centroid (per-feature mean) of each cluster.

    Row ``i`` of the result is the mean of the rows of ``X`` whose label
    equals ``i``, for every ``i`` in ``0 .. max(labels)``.

    Args:
        X (pd.DataFrame, np.ndarray): The original data that was clustered,
            one row per sample
        labels (list, pd.Series, pd.DataFrame, np.ndarray): The predicted
            cluster assignment values, one per row of ``X``. A single-column
            frame or an (n, 1) array is flattened to a 1-D vector.

    Returns:
        centroids (np.ndarray): Array of shape ``(max(labels) + 1, n_features)``.
            A label value in that range with no members yields a row of NaN.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    x = pd.DataFrame(X)
    # Flatten so that column-shaped labels give a 1-D row mask; a 2-D boolean
    # mask passed to .iloc is not a documented way to select rows.
    labels = np.asarray(labels).ravel()
    if labels.size == 0:
        raise ValueError("labels must contain at least one cluster assignment")

    k = int(np.max(labels) + 1)
    centers = np.zeros(shape=(k, x.shape[1]))

    # Getting the centroids:
    for i in range(k):
        centers[i, :] = x.iloc[labels == i].mean(axis=0)

    return centers

In [14]:
# Sanity check on the sample data; note that y_labels is a single-column DataFrame here.
get_centroids(X_sample, y_labels)

array([[ 9. ,  2. , 18. ],
       [ 6. ,  5. , 15. ],
       [ 2.5,  8.5, 11.5]])

In [97]:
def get_centroids_new(X, labels):
    """
    Calculates the centroids from the data given, for each label

    Centroids are returned in ascending order of the distinct label values,
    so for labels ``0 .. k-1`` row ``i`` is the centroid of cluster ``i``.
    The label values themselves do not need to start at zero or be contiguous.

    Args:
        X (pd.DataFrame, np.ndarray): The original data that was clustered,
            shape (n_samples, n_features)
        labels (list, np.ndarray): The predicted cluster assignment values,
            one per row of ``X``; an (n, 1) column is flattened

    Returns:
        centroids (np.ndarray): The centroids given the input data and labels,
            shape (n_distinct_labels, n_features)

    Raises:
        ValueError: If ``X`` is not 2-dimensional.
    """
    X = np.asarray(X, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError("X must be 2-dimensional (n_samples, n_features)")
    # Convert to a 1-D array: comparing a plain list with `==` would give a
    # single scalar False instead of an element-wise mask.
    labels = np.asarray(labels).ravel()

    # Iterate over the label values actually present rather than assuming
    # they are exactly 0 .. k-1.
    classes = np.unique(labels)
    centroids = np.empty((len(classes), X.shape[1]), dtype=np.float64)

    for row, cur_class in enumerate(classes):
        # Boolean mask selecting the members of the current cluster.
        centroid_mask = labels == cur_class
        centroids[row] = X[centroid_mask].mean(axis=0)

    return centroids

In [98]:
# Same sample data, passed as NumPy arrays (X_array, y_array).
get_centroids_new(X_array, y_array)

array([[ 9. ,  2. , 18. ],
       [ 6. ,  5. , 15. ],
       [ 2.5,  8.5, 11.5]])