- Clustering algorithms are not guided by any foreknowledge, so it is described as unsupervised machine learning.
- "Clustering is a useful technique when you want to learn about the structure of a data set but you do not know ahead of time its constituent parts".

# Preliminaries

- __pstdev()__ finds the standard deviation of a population, and __stdev()__, finds the stardard deviation of a sample.

Our __zscores()__ function converts a sequence (list, tuples, string etc) of floats into z-scores, relative to all the numbers in the sequence.

We will create a file called __kmeans.py__

In [1]:
''' to go into kmeans.py
from __future__ import annotations
from typing import TypeVar, Generic, List, Sequence
from copy import deepcopy
from functools import partial
from random import uniform
from statistics import mean, pstdev
from dataclasses import dataclass

# data_point.py to be defined
from data_point import DataPoint

def zscores(original: Sequence[float]) ->List[float]:
    avg: float = mean(original)
    std: float = pstdev(original)
    if std == 0: # return all zeros if ther is no variation
        return[0] * len(original)
    return [(x - avg) / std for x in original]
'''

' to go into kmeans.py\nfrom __future__ import annotations\nfrom typing import TypeVar, Generic, List, Sequence\nfrom copy import deepcopy\nfrom functools import partial\nfrom random import uniform\nfrom statistics import mean, pstdev\nfrom dataclasses import dataclass\n\n# data_point.py to be defined\nfrom data_point import DataPoint\n\ndef zscores(original: Sequence[float]) ->List[float]:\n    avg: float = mean(original)\n    std: float = pstdev(original)\n    if std == 0: # return all zeros if ther is no variation\n        return[0] * len(original)\n    return [(x - avg) / std for x in original]\n'

We will create a class called __DataPoint__ and save it in a file called __data_point.py__.

- The reason why we have two tuples, __._ originals__ and __.dimensions__ is because we later want to replace the dimensions with z-scores by k-means.

In [2]:
''' to go into data_point.py
from __future__ import annotations
from typing import Iterator, Tuple, List, Iterable
from math import sqrt


class DataPoint:
    def __init__(self, initial: Iterable[float]) -> None:
        self._originals: Tuple[float, ...] = tuple(initial)
        self.dimensions: Tuple[float, ...] = tuple(initial)

    @property
    def num_dimensions(self) -> int:
        return len(self.dimensions)
    
    # Euclidean distance
    def distance(self, other: DataPoint) -> float:
        combined: Iterator[Tuple[float, float]] = zip(self.dimensions, other.dimensions)
        differences: List[float] = [(x - y) ** 2 for x, y in combined]
        return sqrt(sum(differences))

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DataPoint):
            return NotImplemented
        return self.dimensions == other.dimensions

    def __repr__(self) -> str:
        return self._originals.__repr__()
'''

' to go into data_point.py\nfrom __future__ import annotations\nfrom typing import Iterator, Tuple, List, Iterable\nfrom math import sqrt\n\n\nclass DataPoint:\n    def __init__(self, initial: Iterable[float]) -> None:\n        self._originals: Tuple[float, ...] = tuple(initial)\n        self.dimensions: Tuple[float, ...] = tuple(initial)\n\n    @property\n    def num_dimensions(self) -> int:\n        return len(self.dimensions)\n    \n    # Euclidean distance\n    def distance(self, other: DataPoint) -> float:\n        combined: Iterator[Tuple[float, float]] = zip(self.dimensions, other.dimensions)\n        differences: List[float] = [(x - y) ** 2 for x, y in combined]\n        return sqrt(sum(differences))\n\n    def __eq__(self, other: object) -> bool:\n        if not isinstance(other, DataPoint):\n            return NotImplemented\n        return self.dimensions == other.dimensions\n\n    def __repr__(self) -> str:\n        return self._originals.__repr__()\n'