# Implementação básica do algoritmo k-means

## Imports

In [73]:
import numpy as np
import pandas as pd

import sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import clear_output

## Criando o Dataset

- N datapoints
- Datapoints D-dimensionais
- Floats randomizados ➡ intervalo de [1, 10) (ou [1, 10], dependendo da método de arredondamento de float usado. Isto não deve ter nenhuma relevância estatística, no entanto)

In [74]:
N = 100000; D = 4

dataset  = [[np.random.uniform(1, 10) for _ in range(D)] for _ in range(N)]
dataset = pd.DataFrame(dataset, columns=[f'd{i}' for i in range(D)])
dataset

Unnamed: 0,d0,d1,d2,d3
0,9.336476,9.792662,1.346396,6.264275
1,9.879061,9.847793,1.092177,6.543734
2,5.546666,4.490165,6.150200,3.664296
3,2.957207,3.695874,4.702524,9.253130
4,6.635922,4.568868,1.968860,6.630090
...,...,...,...,...
99995,5.765507,4.839275,7.728256,8.953079
99996,6.427086,9.016328,7.868604,5.431322
99997,6.844309,9.638616,3.465673,6.048671
99998,3.675460,9.387854,3.320836,2.433438


## K-Means (CPU)

In [132]:
def kMeansCPU(dataset:pd.DataFrame, k, maxIter):
    # Gerando centróides iniciais randomicamente

    centroids = pd.concat([(dataset.apply(lambda x: float(x.sample().iloc[0]))) for _ in range(k)], axis=1)
    print(f'Centroids:\n{centroids.T}\n')
    
    # Para cada datapoint, calcular distâncias entre ele e cada centróide; depois, encontrar o centróide mais próximo e salvar seu index
    closestCent = centroids.apply(lambda x: np.sqrt(((dataset - x) ** 2).sum(axis=1))).idxmin(axis=1)
    print(f'Closest centroid index:\n{closestCent}\n')

    centroids_NEW = dataset.groupby(closestCent).apply(lambda x: np.exp(np.log(x).mean()))
    return centroids_NEW


K = 5
MAX_ITERATIONS = 500

kMeansCPU(dataset, K, MAX_ITERATIONS)

Centroids:
         d0        d1        d2        d3
0  8.558230  1.495131  2.204470  3.472749
1  7.310373  6.179121  9.420427  7.456652
2  2.302783  6.659138  1.286762  3.579423
3  7.104359  8.291331  3.864413  4.681459
4  4.282139  4.520714  2.366698  9.568329

Closest centroid index:
0        3
1        3
2        3
3        4
4        4
        ..
99995    1
99996    1
99997    3
99998    2
99999    2
Length: 100000, dtype: int64



Unnamed: 0,d0,d1,d2,d3
0,7.070065,2.381203,3.625857,3.3215
1,5.1801,4.706068,8.284833,5.715253
2,2.340841,5.190451,3.349394,3.130642
3,6.390235,7.58604,4.19468,3.942983
4,3.664282,3.808384,3.42133,8.186432
