# K-Means 

# 1. Scratch 

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import random
import numpy as np

## 1.1. Class Member & Cluster 

In [19]:
# Member stores the information for one data point: its tf-idf vector, its newsgroup label, and its document id
class Member:
    """One data point (document) handled by the clustering code.

    Attributes:
        r_d: feature vector of the document (load_data in this file passes
            a dense NumPy array of tf-idf values).
        label: label of the document, or None when not given.
        doc_id: identifier of the document, or None when not given.
    """

    def __init__(self, r_d, label=None, doc_id=None):
        self.r_d = r_d
        self.label = label
        self.doc_id = doc_id

    @property
    def _r_d(self):
        """Read-only alias of ``r_d``.

        The KMeans class in this file reads ``member._r_d``; exposing the
        alias keeps that access working without renaming the public
        ``r_d`` attribute.
        """
        return self.r_d

    def __repr__(self):
        # The vector itself is left out: it can be as long as the vocabulary.
        return f"{type(self).__name__}(label={self.label!r}, doc_id={self.doc_id!r})"

In [20]:
# Cluster: holds one centroid and the data points (members) currently assigned to it
class Cluster:
    """A single cluster: its centroid and the members assigned to it."""

    def __init__(self):
        # The underscore prefix is meant as a hint that a variable or method starting w "_" is intended for internal use
        # https://bit.ly/3pIEL8P
        self._centroid = None
        self._member = []

    @property
    def _members(self):
        """Alias of ``_member``; KMeans in this file reads ``cluster._members``."""
        return self._member

    def reset_members(self):
        """Drop every assigned member; the centroid is left untouched."""
        self._member = []

    def add_member(self, new_member):
        """Append one member (data point) to this cluster."""
        self._member.append(new_member)

    def add_members(self, new_member):
        """Alias of add_member; KMeans in this file calls it under this name."""
        self.add_member(new_member)

    def set_centroid(self, new_centroid):
        """Replace the centroid of this cluster with *new_centroid*."""
        self._centroid = new_centroid

## 1.2. Class KMeans

In [21]:
class KMeans:
    """K-means clustering of documents, assigned by cosine similarity.

    Usage: construct with the number of clusters, call load_data() to read
    the tf-idf data, then call run() with a stopping criterion.
    """

    def __init__(self, num_cluster):
        """Create *num_cluster* empty clusters and reset all bookkeeping."""
        self._num_cluster = num_cluster
        # Every method reads the clusters through `_clusters`.
        self._clusters = [Cluster() for _ in range(self._num_cluster)]
        self._E = []            # Centroids seen at the previous check
        self._S = 0             # Overall similarity at the previous check
        self._new_S = 0         # Overall similarity of the current iteration
        self._iteration = 0     # Number of completed iterations in run()
        self._data = []         # Member objects filled in by load_data()
        self._label_count = defaultdict(int)

    # Load Data
    def load_data(self, path):
        """Read the data set into ``self._data`` and count labels.

        *path* is used as a plain string prefix: the files opened are
        ``path + "data_tf_idf.txt"`` and ``path + "word_idfs.txt"``, so a
        directory path must end with a separator.

        Each line of data_tf_idf.txt is split on ``<fff>`` into label,
        doc_id and a space-separated list of ``index:tfidf`` pairs. The
        number of lines of word_idfs.txt is used as the vocabulary size,
        i.e. the length of every dense vector.
        """
        def sparse_to_dense(sparse_r_d, vocab_size):
            # Expand "index:tfidf index:tfidf ..." into a dense vector.
            r_d = np.zeros(vocab_size)
            for index_and_tfidf in sparse_r_d.split():
                parts = index_and_tfidf.split(':')
                r_d[int(parts[0])] = float(parts[1])
            return r_d

        # Open file (newsgroup, id, context)
        with open(path + "data_tf_idf.txt") as f:
            data_lines = f.read().splitlines()
        # Get size of the TF-IDF vocabulary
        with open(path + "word_idfs.txt") as f:
            vocab_size = len(f.read().splitlines())

        self._data = []
        self._label_count = defaultdict(int)
        for d in data_lines:
            if not d.strip():
                continue  # tolerate blank lines in the data file
            features = d.split('<fff>')
            label, doc_id = int(features[0]), int(features[1])
            self._label_count[label] += 1
            r_d = sparse_to_dense(sparse_r_d=features[2], vocab_size=vocab_size)
            self._data.append(Member(r_d=r_d, label=label, doc_id=doc_id))

    # Prepare for RUN Function
    def random_init(self, seed_value):
        """Pick distinct random data points as the initial centroids.

        The draw is reproducible for a given *seed_value*. Raises
        ValueError (from NumPy) when there are fewer data points than
        clusters.
        """
        # Kept from the original: seeds the stdlib generator as well.
        random.seed(seed_value)
        # The sampling itself uses NumPy, so the seed must go to a NumPy
        # generator; a local one avoids touching NumPy's global state.
        rng = np.random.RandomState(seed_value)
        pos = rng.choice(len(self._data), self._num_cluster, replace=False)
        centroids = [self._data[i].r_d for i in pos]
        self._E = list(centroids)
        for cluster, centroid in zip(self._clusters, centroids):
            cluster._centroid = centroid

    # https://en.wikipedia.org/wiki/Cosine_similarity
    # https://www.machinelearningplus.com/nlp/cosine-similarity/
    def compute_similarity(self, member, centroid):
        """Return the cosine similarity of *member* and *centroid* as a float.

        K(X, Y) = <X, Y> / (||X|| * ||Y||)
        """
        # cosine_similarity returns a 1x1 matrix; unwrap it so the sums and
        # comparisons done by the callers work on plain scalars.
        return float(cosine_similarity([member.r_d], [centroid])[0][0])

    def select_cluster_for(self, member):
        """Assign *member* to its most similar cluster.

        Returns the similarity between the member and the chosen centroid.
        """
        best_fit_cluster = None
        max_similarity = float('-inf')
        for cluster in self._clusters:
            similarity = self.compute_similarity(member, cluster._centroid)
            if similarity > max_similarity:
                best_fit_cluster = cluster
                max_similarity = similarity
        best_fit_cluster.add_member(member)
        return max_similarity

    def update_centroid_of(self, cluster):
        """Set the centroid of *cluster* to the unit-length mean of its members."""
        if not cluster._member:
            # The mean of nothing is NaN; keep the previous centroid instead.
            return
        member_r_ds = [member.r_d for member in cluster._member]
        aver_r_d = np.mean(member_r_ds, axis=0)
        sqrt_sum_sqr = np.sqrt(np.sum(aver_r_d ** 2))
        # An all-zero mean cannot be normalised; leave it as is.
        cluster._centroid = aver_r_d / sqrt_sum_sqr if sqrt_sum_sqr > 0 else aver_r_d

    def stopping_condition(self, criterion, threshold):
        """Return True when the loop in run() should stop.

        criterion:
            'max_iters'  - stop once the iteration count reaches threshold.
            'centroid'   - stop when at most threshold centroids differ
                           from the ones seen at the previous check.
            'similarity' - stop when the overall similarity grew by at
                           most threshold since the previous check.

        The 'centroid' and 'similarity' checks also store the current
        state for the next call.
        """
        criteria = ['centroid', 'similarity', 'max_iters']
        assert criterion in criteria, f"criterion must be one of {criteria}"
        if criterion == 'max_iters':
            return self._iteration >= threshold
        if criterion == 'centroid':
            E_new = [list(cluster._centroid) for cluster in self._clusters]
            # Compare element-wise: `in` on a list of arrays would raise on
            # the ambiguous truth value of an array comparison.
            E_new_minus_E = [
                centroid for centroid in E_new
                if not any(np.array_equal(centroid, old) for old in self._E)
            ]
            self._E = E_new
            return len(E_new_minus_E) <= threshold
        new_S_minus_S = self._new_S - self._S
        self._S = self._new_S
        return new_S_minus_S <= threshold

    # Loop update cluster until convergence
    # Step 1: For each instance, assign it to nearest centroid
    # Step 2: For each cluster, recompute its centroid
    def run(self, seed_value, criterion, threshold):
        """Cluster the loaded data until stopping_condition() is met."""
        self.random_init(seed_value)
        self._iteration = 0
        self._S = 0  # start every run from a clean similarity baseline
        while True:
            # Reset clusters, retain only centroids
            for cluster in self._clusters:
                cluster.reset_members()
            self._new_S = 0
            # Assign every member to its most similar cluster
            for member in self._data:
                self._new_S += self.select_cluster_for(member)
            # Recompute centroid
            for cluster in self._clusters:
                self.update_centroid_of(cluster)

            # Stop loop with criterion
            self._iteration += 1
            if self.stopping_condition(criterion, threshold):
                break

In [57]:
# Demo of the sampling call used in random_init: with replace=False this
# draws 3 distinct values from range(5).
np.random.choice(5,3, replace = False)

array([2, 0, 1])