# K Means Clustering

In [98]:
# Imports
import pandas as pd
import math
from array import *

## Read Data

In [99]:
# Load the tumor dataset from CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that file exists -- consider a configurable data directory.
df = pd.read_csv(filepath_or_buffer="/home/ktb/gitlab/kcluster/data/tumor.csv")

## Configurations

In [100]:
# Number of feature values kept per tumor record.
TOTAL_FEATURES = 9
# Number of records (rows) processed from the dataset.
TOTAL_ENTRIES = 699
# Class labels; these are the values compared against the integers read from
# the "classification" column.
BENIGN_FLAG, MALIGN_FLAG = 2, 4

## Functions

In [130]:
def euclidean_distance(f1, f2):
    """Return the Euclidean distance between two feature vectors.

    Only the first ``len(f1)`` positions are compared, so ``f2`` must be at
    least as long as ``f1``.

    Args:
        f1: Indexable sequence of numbers.
        f2: Indexable sequence of numbers.

    Returns:
        The square root of the summed squared component differences (float).
    """
    total = 0
    position = 0
    dimensions = len(f1)
    # Accumulate left to right so floating-point rounding stays deterministic.
    while position < dimensions:
        delta = f1[position] - f2[position]
        total += delta ** 2
        position += 1
    return math.sqrt(total)


def get_closest_centroid(t_features, m_features, b_features):
    """Return the flag of the centroid that lies nearest to a sample.

    Args:
        t_features: Feature vector of the sample being assigned.
        m_features: Feature vector of the malignant centroid.
        b_features: Feature vector of the benign centroid.

    Returns:
        MALIGN_FLAG when the sample is strictly closer to the malignant
        centroid, otherwise BENIGN_FLAG (ties are assigned to benign).
    """
    mal_dist = euclidean_distance(t_features, m_features)
    ben_dist = euclidean_distance(t_features, b_features)
    # The *smaller* distance identifies the closest centroid. The previous
    # `mal_dist > ben_dist` test returned the label of the farther centroid,
    # disagreeing with the `m_dist < b_dist` rule used by get_classification.
    if mal_dist < ben_dist:
        return MALIGN_FLAG
    return BENIGN_FLAG

def get_tumor_features(df, index):
    """Extract the feature vector of one record as a list of Python ints.

    Args:
        df: DataFrame containing the nine feature columns named below.
        index: Positional (``iloc``) row number of the record.

    Returns:
        A list of nine ints in the order: thickness, size, shape, adhesion,
        epithelial, nuclei, chromatin, nucleoli, mitoses.
    """
    feature_columns = (
        "thickness",
        "size",
        "shape",
        "adhesion",
        "epithelial",
        "nuclei",
        "chromatin",
        "nucleoli",
        "mitoses",
    )
    record = df.iloc[index]
    return [int(record[column]) for column in feature_columns]

def compute_new_centroids(nearest_centroids, feature_matrix):
    """Average the samples of each cluster to obtain the two new centroids.

    The number of samples and the number of features are taken from the
    arguments themselves rather than from notebook-level constants, so the
    function works for any consistently sized input.

    Args:
        nearest_centroids: One label per sample. Samples labelled MALIGN_FLAG
            go to the malignant cluster; every other label is treated as
            benign.
        feature_matrix: One feature vector per sample, aligned with
            ``nearest_centroids``. The feature count is taken from the first
            row.

    Returns:
        A ``(malignant_centroid, benign_centroid)`` tuple of lists holding the
        per-feature means of each cluster.

    Raises:
        ValueError: If the two arguments have different lengths.
        ZeroDivisionError: If either cluster received no samples, since its
            mean is undefined.
    """
    if len(nearest_centroids) != len(feature_matrix):
        raise ValueError(
            "nearest_centroids and feature_matrix must have the same length"
        )

    n_features = len(feature_matrix[0]) if feature_matrix else 0
    mal_sum = [0] * n_features
    ben_sum = [0] * n_features
    total_mal = 0
    total_ben = 0

    for label, features in zip(nearest_centroids, feature_matrix):
        if label == MALIGN_FLAG:
            target = mal_sum
            total_mal += 1
        else:
            target = ben_sum
            total_ben += 1
        for j in range(n_features):
            target[j] += features[j]

    # Same exception type the bare division would raise, but with a message
    # that explains the cause.
    if total_mal == 0 or total_ben == 0:
        raise ZeroDivisionError(
            "cannot compute a centroid for a cluster with no samples"
        )

    mal_centroid = [value / total_mal for value in mal_sum]
    ben_centroid = [value / total_ben for value in ben_sum]
    return mal_centroid, ben_centroid

def check_accuracy(class_vector, ground_truth):
    """Return the percentage of predicted labels that match the ground truth.

    A score below 50% is reported as its complement (``100 - score``).
    NOTE(review): presumably this compensates for the two cluster labels being
    swapped relative to the ground-truth labels -- confirm.

    Args:
        class_vector: Predicted labels; its length sets how many are compared.
        ground_truth: Reference labels, at least as long as ``class_vector``.

    Returns:
        Accuracy as a float percentage, never below 50.0.
    """
    total = len(class_vector)
    matches = sum(
        1 for position in range(total)
        if class_vector[position] == ground_truth[position]
    )
    percent = (matches * 100.0) / total
    return 100.0 - percent if percent < 50.0 else percent

def get_classification(m_centroid, b_centroid, feature_matrix):
    """Label every sample with the flag of its nearer centroid.

    Args:
        m_centroid: Feature vector of the malignant centroid.
        b_centroid: Feature vector of the benign centroid.
        feature_matrix: Indexable collection of sample feature vectors.

    Returns:
        A list with one label per sample: MALIGN_FLAG when the sample is
        strictly closer to ``m_centroid``, BENIGN_FLAG otherwise (ties
        included).
    """
    def label_for(sample):
        to_malign = euclidean_distance(m_centroid, sample)
        to_benign = euclidean_distance(b_centroid, sample)
        return MALIGN_FLAG if to_malign < to_benign else BENIGN_FLAG

    return [
        label_for(feature_matrix[position])
        for position in range(len(feature_matrix))
    ]

## Create Feature Matrix

In [131]:
# For each of the TOTAL_ENTRIES records, collect its feature vector and its
# ground-truth label (the integer stored in the "classification" column).
feature_matrix = [get_tumor_features(df, row) for row in range(TOTAL_ENTRIES)]
ground_truth = [
    int(df.iloc[row]["classification"]) for row in range(TOTAL_ENTRIES)
]

## Run K-Means Iterations

In [132]:
# Seed the two centroids with the feature vectors of two fixed rows (7 and 8).
# The starting points are hard-coded, not randomly drawn.
mal_features = get_tumor_features(df, 7)
ben_features = get_tumor_features(df, 8)

entries_to_compute = TOTAL_ENTRIES
NUMBER_OF_ITERATIONS = 8

# Each pass assigns every sample to a centroid, recomputes both centroids from
# that assignment, then labels the samples against the new centroids and
# reports the resulting accuracy.
# The loop variable is `iteration` (not `iter`) so the builtin `iter` is not
# shadowed for the rest of the kernel session.
for iteration in range(NUMBER_OF_ITERATIONS):
    nearest_centroids = []
    for i in range(entries_to_compute):
        t_feature = feature_matrix[i]
        nearest_centroids.append(get_closest_centroid(t_feature, mal_features, ben_features))
    mal_features, ben_features = compute_new_centroids(nearest_centroids, feature_matrix)
    class_vector = get_classification(mal_features, ben_features, feature_matrix)
    accuracy = check_accuracy(class_vector, ground_truth)
    print("Iteraion: ", iteration, "\tAccuracy: ", accuracy)


Iteraion:  0 	Accuracy:  88.2689556509299
Iteraion:  1 	Accuracy:  94.4206008583691
Iteraion:  2 	Accuracy:  95.27896995708154
Iteraion:  3 	Accuracy:  95.70815450643777
Iteraion:  4 	Accuracy:  95.85121602288984
Iteraion:  5 	Accuracy:  95.85121602288984
Iteraion:  6 	Accuracy:  95.85121602288984
Iteraion:  7 	Accuracy:  95.85121602288984


In [123]:
# Show the loaded DataFrame as this cell's output.
df

Unnamed: 0,id,thickness,size,shape,adhesion,epithelial,nuclei,chromatin,nucleoli,mitoses,classification
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4
