In [9]:
import numpy as np
import pandas as pd
from math import sqrt
import csv
import os

In [2]:
# --- Run configuration -------------------------------------------------------

# Input dataset; it is read further down as a tab-separated file without a
# header row.
data_file = "Example.tsv"

# K-means hyper-parameters.
k = 3              # number of clusters
threshold = 0.5    # stop once the cost improves by less than this between epochs
max_iter = 100     # upper bound on the number of epochs

# Cluster index -> current centroid coordinates; populated by the clustering cell.
centroids = dict()

In [3]:
# Parse the tab-separated input. There is no header row, so pandas labels the
# columns with the integers 0, 1, 2, ...
df = pd.read_csv(data_file, header=None, sep='\t')

# Clustering is unsupervised, so only the two feature columns (1 and 2) are kept.
# NOTE(review): presumably column 0 holds a label/identifier - confirm against the data file.
dropped = df.loc[:, [1, 2]]

# Plain nested list (one [col1, col2] pair per row) for the pure-Python loops below.
X = dropped.to_numpy().tolist()

# Sanity check: number of points loaded.
print(len(X))

400


In [4]:
def distance(feat_one, feat_two):
    """Return the Euclidean (L2) distance between two points.

    Args:
        feat_one: Indexable sequence of numeric coordinates; its length
            decides how many dimensions are compared.
        feat_two: Indexable sequence of numeric coordinates, read at the same
            positions as ``feat_one``.

    Returns:
        The square root of the summed squared coordinate differences, as a
        float.

    Raises:
        IndexError: If ``feat_two`` has fewer elements than ``feat_one``.
    """
    total = 0

    # Accumulate explicitly, dimension by dimension, in index order.
    for axis in range(len(feat_one)):
        delta = feat_one[axis] - feat_two[axis]
        total += delta ** 2

    return sqrt(total)

In [5]:
# K-means clustering of the points in X.
# Reads:  X, k, max_iter, threshold and distance() from earlier cells.
# Writes: centroids, centroid_points and errors.

# Build the centroid table from scratch on every run. Reusing the dict left by
# a previous execution would keep stale clusters around when k is lowered,
# which breaks the assignment step below.
# The first k data points serve as the initial centroids.
centroids = {i: X[i] for i in range(k)}

# One snapshot of all k centroids per epoch, starting with the initial placement.
centroid_points = [[centroids[i] for i in range(k)]]

# Cost J recorded at every epoch.
errors = []

for epoch in range(max_iter):
    # Cluster index -> points currently assigned to that cluster.
    classes = {i: [] for i in range(k)}

    # Assignment step: each point joins the cluster of its nearest centroid
    # (ties resolve to the lowest cluster index because index() returns the
    # first match).
    for feature in X:
        distances = [distance(feature, centroids[centroid]) for centroid in centroids]
        classification = distances.index(min(distances))
        classes[classification].append(feature)

    # Cost J: sum of squared point-to-centroid distances over all clusters.
    error = 0
    for j in classes:
        for point in classes[j]:
            diff = distance(point, centroids[j])
            error += diff * diff

    errors.append(error)

    # Terminate once the cost dropped by less than `threshold` since the
    # previous epoch.
    if epoch > 0 and (errors[epoch - 1] - error) < threshold:
        break

    # Update step: move every centroid to the mean of its members.
    for classification in classes:
        members = classes[classification]
        # A cluster can end up empty (e.g. duplicate starting points). The mean
        # of no points is NaN, which would break the next epoch, so the
        # previous centroid is kept instead.
        if members:
            centroids[classification] = np.average(members, axis=0)

    # Record all k centroids (not a fixed count of three).
    centroid_points.append([centroids[i] for i in range(k)])

In [6]:
# Show the most recent centroid snapshot, i.e. the last entry appended to
# centroid_points by the clustering loop.
print(centroid_points[-1])

[array([4.33965128, 2.21092625]), array([-2.09641599,  0.39852832]), array([ 2.91076041, -4.46278297])]


In [12]:
# Output the errors (Progr) and centroid history (Proto) as TSV files:
#   <name>-Proto.tsv  one row per recorded snapshot, one tab-separated cell per
#                     centroid, coordinates joined by commas
#   <name>-Progr.tsv  one row per epoch holding the cost

# Strip only the final extension. Splitting on the first '.' would yield an
# empty or truncated name for inputs such as "./Example.tsv".
name = os.path.splitext(data_file)[0]

# Directory prefix for the output files; empty means the names are used as-is.
output_path = ""

out_proto = name + "-Proto.tsv"
out_progr = name + "-Progr.tsv"

# newline='' lets the csv module control line endings itself.
with open(os.path.join(output_path, out_proto), mode='w', newline='') as file:
    out = csv.writer(file, delimiter='\t')

    for element in centroid_points:
        # One cell per centroid, however many clusters were used.
        out.writerow([','.join(str(elem) for elem in centroid) for centroid in element])


with open(os.path.join(output_path, out_progr), mode='w', newline='') as file:
    out = csv.writer(file, delimiter='\t')

    for elem in errors:
        out.writerow([elem])