In [2]:
# K-Means Clustering Algorithm

# ! Caution: Assumed that the dataset has no missing value,
# !          and consists of numeric values only.

# ? How the algorithm works ?
# 1. select cluster number "k"
# 2. choose randomly k centroids
# 3. calculate distance of objects to centroids
# 4. group based on minimum distance
# 5. if no group update, end
#    else, go to step 3

import math
from random import random

# Determine the number of clusters arbitrarily
k = 2

# Initialize list for the processed dataset
processed_dataset = []

# Initialize list for the clustered dataset
clustered_dataset = []

# Initialize list for the raw dataset
raw_dataset = []

# Initialize list for the labels of dataset
labels = []

# Initialize dictionary for the centroids
# Key: index of centroid (1 for k1, 2 for k2, ...)
# Value: list of the mean data
centroids = dict()

# Initialize dictionary for column-group based average calculation 
# Key: index of centroid (1 for k1, 2 for k2, ...)
# Value: dictionary
#        index of attribute (column index): dictionary with 'sum', 'total', and 'avg' keys
group_averages = dict()

# Initialize list for control flow
# If any group change occurs, list will contain at least one True value
group_changes = [True]

# Initialize list for distance changes (as list) of each sample (row)
# distance[2]: distance changes of sample index 2 (3rd row) --> [1,1,2,2,2]
distance_changes = []

# Determine the file path of dataset
file_path = '../Custom Datasets/k-means-test.csv'

# Read all lines of the file
with open(file_path, 'r') as file:
  lines = file.readlines()

# Get the line where the labels reside
for line_number in range(0, len(lines)):
    if not lines[line_number].isspace():
        # Store the labels
        labels = lines.pop(line_number).strip().split(',')
        break

print(f"Labels:\n{labels}")
print()

# Store raw line in the dataset
for line in lines:
   if not line.isspace():
      # Store sample data after removing newline characters and splitting
      raw_dataset.append(line.strip().split(","))

print("Raw dataset:")
for sample in raw_dataset:
  print(sample)
print()

# Function for detecting float typed values
def is_float(string) -> bool:
  try:
    # If the value is integer typed, casting won't throw exception
    int(string)
    return False
  except:
    # The value is float typed, casting threw exception
    return True

# Process the raw data by type casting
for row in range(0, len(raw_dataset)):
  instance = []
  for col in range(0, len(labels)):
    value = raw_dataset[row][col]
    if is_float(value):
      instance.append(float(value))
    else:
      instance.append(int(value))
  processed_dataset.append(instance)

print("Processed dataset:")
for sample in processed_dataset:
  print(sample)
print()

# Create lists for each sample for their distance changes
for row in processed_dataset:
   distance_changes.append([0])

# Choose randomly k centroids
centroid_indices = []
temp_k = k
while temp_k > 0:   
  index = int(random() * (10**len(processed_dataset)) % len(processed_dataset))
  if index not in centroid_indices:
    centroid_indices.append(index)
    temp_k -= 1

# Store the centroids in dictionary
for index in centroid_indices:
   centroids[len(centroids.keys())+1] = processed_dataset[index]

print("Centroids:")
for centroid in centroids.items():
  print(centroid)
print()

# Initialize dictionary for each centroid for average calculation
for centroid in centroids.keys():
  group_averages[centroid] = {}
  for col in range(0, len(processed_dataset[0])):
    group_averages[centroid][col] = {
      'sum':0,
      'total':0,
      'avg':0
    }

# Function of euclidean distance
def calculate_euclidean_distance(vector_p, vector_q):
  if len(vector_p) == len(vector_q):
     d = 0
     for i in range(0, len(vector_p)):
       d += (vector_p[i] - vector_q[i])*(vector_p[i] - vector_q[i])
     return math.sqrt(d)
  return None

# Function to update the centroids
def update_centroids():
  # Calculate the column-group based averages
  for row in range(0, len(processed_dataset)):
    # Sample data
    sample = processed_dataset[row]
    # Current centroid of the sample
    current_centroid = distance_changes[row][-1]

    # Go through each attribute
    for col in range(0, len(sample)):
      # Attribute of the sample
      attribute = sample[col]
      for centroid in centroids.keys():
        # Sum attributes among same group
        if centroid == current_centroid:
          values = group_averages.get(centroid).get(col)
          values['sum'] += attribute
          values['total'] += 1
          values['avg'] = values.get('sum') / values.get('total')
  
  # Update the centroid with calculated averages
  for col in range(0, len(processed_dataset[0])):
    for row in range(0, len(processed_dataset)):
      # Current centroid of the sample
      current_centroid = distance_changes[row][-1]
      for centroid in centroids.keys():
        if centroid == current_centroid:
          means = centroids[centroid].copy()
          # Insert method inserts the value before the specified index
          means.insert(col, group_averages[centroid][col]['avg'])
          # Combine list by excluding the value previously resided at the index
          means = means[:col+1] + means[col+2:]
          centroids[centroid] = means

while group_changes.count(True) != 0:
  # Reset the list
  group_changes = []
  
  # Calculate the distances
  for index in range(0, len(processed_dataset)):
    # Sample data
    sample = processed_dataset[index]
    # Current centroid group
    current_centroid = distance_changes[index][-1]
    # Initialize list to store distances between the sample and each centroid
    distances = []

    # Calculate the distance with euclidean distance formula
    for centroid in centroids.values():
      d = calculate_euclidean_distance(sample, centroid)
      distances.append(d)
    # Check whether there is one min result
    if distances.count(min(distances)) == 1:
      # New centroid
      new_centroid = distances.index(min(distances))+1
      # Check whether the new centroid is same as the current one
      if new_centroid == current_centroid:
        # Preserve current group
        distance_changes[index].append(current_centroid)
        # No centroid change
        group_changes.append(False)
      else:
        distance_changes[index].append(new_centroid)
        group_changes.append(True)
    # There is multiple equal results
    else:
      # Store their indices
      indices = []

      for i in range(0, len(distances)):
        if min(distances) == distances[i]:
            # The indices of centroids start by 1 not 0
            indices.append(i+1)
      
      # Check whether the current centroid is in the list
      if current_centroid in indices:
        # Preserve current group
        distance_changes[index].append(current_centroid)
        # No centroid change
        group_changes.append(False)
      # Update new centroid
      else:
        # New centroid
        new_centroid = distances.index(min(distances))+1
        distance_changes[index].append(new_centroid)
        group_changes.append(True)
  
  # Update the centroids for the next iteration
  if group_changes.count(True) != 0:
    update_centroids()

print(f'Group changes:\n{group_changes}')
print()

print("Group averages:")
for group, values in group_averages.items():
  for col, means in values.items():
    print(f"Group:\t{group}\tColumn:\t{col}\tMeans:\t{means}")
print()

print("Distance changes:")
for dc in distance_changes:
  print(dc)
print()

# Append the class to labels
labels.append('Class')

# Combine the class value with the dataset
for row in range(0, len(processed_dataset)):
  sample = processed_dataset[row].copy()
  sample.append(distance_changes[row][-1])
  clustered_dataset.append(sample)

print("Dataset after clustering:")
for sample in clustered_dataset:
  print(sample)
print()

Labels:
['A1', 'A2', 'A3']

Raw dataset:
['5', '12', '35']
['6', '15', '36']
['3', '16', '31']
['4', '18', '34']
['8', '17', '30']
['25', '6', '14']
['23', '8', '16']
['26', '7', '12']
['28', '4', '11']
['24', '5', '18']

Processed dataset:
[5, 12, 35]
[6, 15, 36]
[3, 16, 31]
[4, 18, 34]
[8, 17, 30]
[25, 6, 14]
[23, 8, 16]
[26, 7, 12]
[28, 4, 11]
[24, 5, 18]

Centroids:
(1, [26, 7, 12])
(2, [24, 5, 18])

Group changes:
[False, False, False, False, False, False, False, False, False, False]

Group averages:
Group:	1	Column:	0	Means:	{'sum': 205, 'total': 8, 'avg': 25.625}
Group:	1	Column:	1	Means:	{'sum': 47, 'total': 8, 'avg': 5.875}
Group:	1	Column:	2	Means:	{'sum': 108, 'total': 8, 'avg': 13.5}
Group:	2	Column:	0	Means:	{'sum': 99, 'total': 12, 'avg': 8.25}
Group:	2	Column:	1	Means:	{'sum': 169, 'total': 12, 'avg': 14.083333333333334}
Group:	2	Column:	2	Means:	{'sum': 366, 'total': 12, 'avg': 30.5}

Distance changes:
[0, 2, 2, 2]
[0, 2, 2, 2]
[0, 2, 2, 2]
[0, 2, 2, 2]
[0, 2, 2, 2]
[0,