In [1]:
import torch
import os
import numpy as np
import global_resources as gr
import random
import matplotlib.pyplot as plt
import kmc_torch.kmc as kmc

In [2]:
# Example data processing
# Read the data as a pandas DataFrame.
# The path is built from separate components so the OS-appropriate separator
# is used (a literal 'Data\...' component only resolves on Windows).
data_path = os.path.join(gr.default_dir, 'Data', 'breast-cancer-wisconsin.data')

# Clean the frame in one chained expression instead of mutating it in place:
#   1. '?' placeholders become NaN (not a number) so they can be dropped,
#   2. rows containing NaN are removed,
#   3. the 'id' column is dropped (not used as a feature),
#   4. 'bare_nuclei' is cast to int64 once the '?' rows are gone.
df = (
    gr.read_and_return_pd_df(data_path)
    .replace('?', np.nan)
    .dropna()
    .drop(columns = ['id'])
    .astype({'bare_nuclei': np.int64})
)

# Set device
device = gr.set_device()
print(f"Current device: {device.capitalize()}.")

# X: feature matrix (every column except 'class') as a float32 np.ndarray
X = df.drop(columns = ['class']).to_numpy(dtype = 'float32')
# X_gpu: the same features as a float64 torch.Tensor on the selected device
# (widening float32 -> float64 is exact, so no values change)
X_gpu = torch.tensor(X, device = device, dtype = torch.float64)

Reading files from: D:\Important Files\Repositories\Quantitative-Investment-Algorithms\Data\breast-cancer-wisconsin.data
Current device: Cuda.


In [3]:
# Report the installed PyTorch version, then the CUDA version exposed by
# torch.version.cuda, one value per line.
print(torch.__version__, torch.version.cuda, sep = "\n")

2.7.0+cu128
12.8


# initiate_centroids
(
    X: torch.Tensor, 
    k: int = 3, 
    random_seed: int = RANDOM_SEED
    ) -> torch.Tensor:

In [4]:
# Build the initial centroids for k = 3 clusters from the feature tensor.
# random_seed is left at the library default.
centroids = kmc.initiate_centroids(
    X = X_gpu,
    k = 3,
)
print(centroids)

Initiating centroids with k being 3...
tensor([[ 5.,  5.,  5.,  6.,  3., 10.,  3.,  1.,  1.],
        [ 7.,  5.,  6.,  3.,  3.,  8.,  7.,  4.,  1.],
        [ 1.,  1.,  1.,  1.,  1.,  1.,  3.,  1.,  1.]], device='cuda:0',
       dtype=torch.float64)


# optimize_centroids
(
    X: torch.Tensor, 
    k: int = 3, 
    centroids: torch.Tensor = None, 
    max_iters: int = MAX_ITERATION, 
    tol: float = TOLERANCE
    ) -> tuple[torch.Tensor, torch.Tensor]:

In [5]:
# Refine the initial centroids; k, max_iters and tol stay at their defaults.
# The call returns the optimized centroids together with one label per row.
oped_centroids, labels = kmc.optimize_centroids(
    X = X_gpu,
    centroids = centroids,
)
# Show the centroids, then the first and last ten cluster labels.
print(oped_centroids, labels[:10], labels[-10:], sep = "\n")

tensor([[7.1892, 4.8288, 5.1081, 4.6667, 4.1802, 8.5135, 5.0631, 3.9910, 1.6757],
        [7.1532, 8.4274, 8.1129, 6.6371, 6.5484, 7.2339, 6.9274, 7.7258, 3.3387],
        [3.0112, 1.2746, 1.3906, 1.3214, 2.0826, 1.2924, 2.0804, 1.2478, 1.1049]],
       device='cuda:0', dtype=torch.float64)
tensor([2, 0, 2, 0, 2, 1, 2, 2, 2, 2], device='cuda:0')
tensor([2, 2, 1, 2, 2, 2, 2, 1, 1, 1], device='cuda:0')


# calculate_variation
(
    X: torch.Tensor, 
    centroids: torch.Tensor = None, 
    labels: torch.Tensor = None
    ) -> float:

In [6]:
# Compute the variation of X_gpu for the optimized centroids and the labels
# produced alongside them by optimize_centroids.
variation = kmc.calculate_variation(
    X = X_gpu,
    centroids = oped_centroids,
    labels = labels,
)
print(variation)

16267.235662001287


# WCSS_for_single_k
(
    X: torch.Tensor, 
    k: int = 3, 
    n_restarts: int = N_RESTARTS, 
    tol: float = TOLERANCE, 
    max_iters: int = MAX_ITERATION,
    ) -> tuple:  # four values: (X, labels, centroids, variation), as unpacked in the call below

In [7]:
# Run the clustering for a single k (default k = 3) with the library's
# default restart count, tolerance and iteration cap.
# NOTE(review): this rebinds X, labels and centroids that earlier cells
# defined (X was a NumPy array; labels/centroids came from the
# optimize_centroids cell). Re-running those earlier cells after this one
# will pick up these values instead - consider distinct names if nothing
# later in the notebook relies on them.
X, labels, centroids, var = kmc.WCSS_for_single_k(X_gpu)

# Print each result on its own line: the returned data tensor, the first ten
# labels, the centroids, the variation, and finally the dtype of the data.
print(X, labels[:10], centroids, var, X.dtype, sep = "\n")

Clustering with: k = 3.
Initiating centroids with k being 3...
Initiating centroids with k being 3...
Initiating centroids with k being 3...
Initiating centroids with k being 3...
Initiating centroids with k being 3...
Initiating centroids with k being 3...
Initiating centroids with k being 3...
Initiating centroids with k being 3...
Initiating centroids with k being 3...
Initiating centroids with k being 3...
tensor([[ 5.,  1.,  1.,  ...,  3.,  1.,  1.],
        [ 5.,  4.,  4.,  ...,  3.,  2.,  1.],
        [ 3.,  1.,  1.,  ...,  3.,  1.,  1.],
        ...,
        [ 5., 10., 10.,  ...,  8., 10.,  2.],
        [ 4.,  8.,  6.,  ..., 10.,  6.,  1.],
        [ 4.,  8.,  8.,  ..., 10.,  4.,  1.]], device='cuda:0',
       dtype=torch.float64)
tensor([2, 1, 2, 1, 2, 0, 2, 2, 2, 2], device='cuda:0')
tensor([[7.0813, 8.4309, 8.1463, 6.7154, 6.5528, 7.2358, 6.9593, 7.6667, 3.3902],
        [7.3136, 4.7627, 5.0169, 4.4576, 4.1695, 8.2119, 4.9746, 4.0424, 1.7373],
        [2.9412, 1.2511, 1.3620