In [3]:
import pickle as pkl
from sklearn.cluster import KMeans

In [4]:
# Load the pre-computed TF-IDF document/term matrix.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('../data/tdidf_vector.pkl', 'rb') as f_open:
    tfidf_matrix = pkl.load(f_open)

# Baseline clustering. random_state is pinned so that centroid initialisation,
# and therefore the labels and centers shown below, are reproducible on every
# Restart & Run All.
clustering = KMeans(n_clusters=8, random_state=0).fit(tfidf_matrix.toarray())
clustering.cluster_centers_

array([[-5.14996032e-19,  7.06297091e-04,  7.58941521e-19, ...,
         2.58670332e-03, -5.42101086e-19,  4.33680869e-19],
       [ 8.13151629e-20,  1.12761410e-02, -1.08420217e-19, ...,
         1.08420217e-18,  1.67655157e-03, -1.08420217e-19],
       [ 4.87890978e-19,  7.11666893e-04,  7.99320852e-04, ...,
         6.75588570e-04, -1.62630326e-19,  8.78898247e-04],
       ...,
       [-2.71050543e-20,  3.46944695e-18,  1.08420217e-19, ...,
         1.30104261e-18, -2.16840434e-19,  3.70953831e-03],
       [-8.13151629e-20,  3.46944695e-18,  2.16840434e-19, ...,
         1.30104261e-18, -2.71050543e-19,  2.16840434e-19],
       [ 2.38292463e-03,  9.91981445e-02,  1.15335927e-03, ...,
        -2.16840434e-19, -3.25260652e-19,  5.42101086e-19]])

In [5]:
# Cluster index assigned to each row of the fitted matrix, in row order.
clustering.labels_

array([6, 6, 6, 6, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 6, 6, 2, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 4, 4, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 1,
       5, 2, 2, 2, 2, 5, 5, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 2, 2, 2,
       2, 2, 5, 5, 2, 2, 1, 2, 2, 1, 4, 4, 2, 2, 5, 2, 5, 2, 5, 5, 4, 5,
       5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 2, 2, 4, 5, 2, 4, 5, 5, 4, 4, 2, 2,
       2, 2, 1, 2, 2, 2, 2, 5, 5, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 4, 4,
       2, 2, 5, 5, 5, 5, 5, 5, 5, 2, 4, 2, 2, 1, 4, 5, 2, 4, 4, 4, 4, 4,
       4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
       4, 0, 0, 0, 0, 1, 0, 2, 2, 2, 0, 0, 0, 0, 0,

In [6]:
import numpy as np
from numpy.linalg import norm
from random  import randrange
# (n_documents, n_terms). Read the shape from the matrix itself instead of
# building a full dense copy only to inspect its dimensions.
# NOTE(review): assumes tfidf_matrix is a SciPy sparse matrix (it exposes
# .toarray()), which also exposes .shape -- confirm.
tfidf_matrix.shape

(589, 8266)

In [9]:
# Sentinel written on the diagonal of the pairwise distance matrix. Despite the
# name it is a very LARGE value (real exp(-cosine) distances never exceed e),
# so a point is never selected as its own nearest neighbour.
MIN_DIST = 1e10

class KMeans():
    """
    K-means clustering using an exponential cosine distance.

    The distance between two vectors is ``exp(-cos_sim(x, y))``: smaller
    means more similar. Each iteration assigns every row to its closest
    centroid and then moves each centroid to the mean of its members.

    NOTE: this class has the same name as ``sklearn.cluster.KMeans`` imported
    earlier in the notebook; once this cell has run, ``KMeans`` refers to this
    class, so re-running earlier cells will pick up this implementation.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (and centroids).
    iters : int
        Maximum number of assign/update iterations.
    random_state : int or None
        Seed for centroid initialisation. ``None`` (default) draws from
        NumPy's global random state, as before.

    Attributes set by ``fit``
    -------------------------
    labels_ : ndarray of int, shape (n_samples,)
    cluster_centers_ : ndarray, shape (n_clusters, n_features)
    """
    def __init__(self, n_clusters=8, iters=300, random_state=None):
        self._n_clusters = n_clusters
        self._iters = iters
        self._random_state = random_state
        self._dist_mat = None
        self._centroids = None
        self._results = None

    @staticmethod
    def _unit_rows(M):
        """Return M with every row scaled to unit L2 norm (zero rows stay zero)."""
        M = np.asarray(M, dtype=float)
        row_norms = norm(M, axis=1)
        # Avoid 0/0: a zero row has cosine similarity 0 with everything.
        row_norms[row_norms == 0] = 1.0
        return M / row_norms[:, None]

    def distance_matrix(self, X):
        """
        Compute the matrix of exponential cosine distances between all rows
        of X and store it in ``self._dist_mat``.

        The diagonal is filled with MIN_DIST so that a point is never its own
        nearest neighbour.
        """
        unit = self._unit_rows(X)
        # One matrix product yields every pairwise cosine similarity.
        self._dist_mat = np.exp(-(unit @ unit.T))
        np.fill_diagonal(self._dist_mat, MIN_DIST)

    @staticmethod
    def cos_sim_dist(X,Y):
        """
        Return the exponential cosine distance ``exp(-cos_sim(X, Y))``
        between two vectors.

        If either vector has zero norm the cosine similarity is taken to be
        0 (distance 1.0) instead of producing NaN from a division by zero.
        """
        denom = norm(X) * norm(Y)
        if denom == 0:
            return np.float64(1.0)
        return np.exp(-1 * ((X @ Y.T) / denom))

    def init_centroids(self, X):
        """
        Pick ``n_clusters`` distinct rows of X at random as initial centroids.

        Raises
        ------
        ValueError
            If X has fewer rows than ``n_clusters``.
        """
        if self._n_clusters > X.shape[0]:
            raise ValueError(
                f"n_clusters={self._n_clusters} exceeds the number of samples {X.shape[0]}"
            )
        # Use a dedicated generator only when a seed was requested; otherwise
        # keep drawing from NumPy's global state.
        rng = np.random if self._random_state is None else np.random.RandomState(self._random_state)
        picks = rng.choice(X.shape[0], self._n_clusters, replace=False)
        self._centroids = X[picks]

    def closest_centroid(self, X):
        """
        Return index of centroid closest to the given X vector
        (first one wins on ties).
        """
        dists = [self.cos_sim_dist(X, Y) for Y in self._centroids]
        return int(np.argmin(dists))

    def _assign_labels(self, x_unit):
        """Return, for each unit-normalised row, the index of its closest centroid."""
        c_unit = self._unit_rows(self._centroids)
        return np.argmin(np.exp(-(x_unit @ c_unit.T)), axis=1)

    def fit(self, X):
        """
        Cluster the rows of X.

        X may be a sparse matrix (anything exposing ``.toarray()``) or a dense
        array-like of shape (n_samples, n_features).

        Side effects: prints progress and the final clusters, stores the
        pairwise distance matrix in ``self._dist_mat``, sets ``labels_`` /
        ``cluster_centers_`` / ``_results`` and writes the clusters to disk
        via ``save()``.
        """
        X_arr = X.toarray() if hasattr(X, "toarray") else X
        X_arr = np.asarray(X_arr, dtype=float)
        self.init_centroids(X_arr)
        self.distance_matrix(X_arr)
        # Row normalisation of the data never changes; do it once.
        x_unit = self._unit_rows(X_arr)
        doc_labels = np.zeros(X_arr.shape[0], dtype=int)
        for i in range(self._iters):
            print(f"Iteration number {i}", end="\r")
            # assign closest centroid
            doc_labels = self._assign_labels(x_unit)

            # compute new centroids; a cluster that lost all its members keeps
            # its previous centroid instead of becoming NaN (which would also
            # make the convergence test below impossible to satisfy)
            new_centroids = np.array(self._centroids, dtype=float)
            for ind in range(self._n_clusters):
                members = X_arr[doc_labels == ind]
                if len(members):
                    new_centroids[ind, :] = members.mean(axis=0)

            # break if no change detected
            if np.array_equal(self._centroids, new_centroids):
                break
            self._centroids = new_centroids
        self.labels_ = doc_labels
        self.cluster_centers_ = self._centroids
        self._results = [
            np.flatnonzero(doc_labels == ind).tolist()
            for ind in range(self._n_clusters)
        ]
        print(self._results)
        self.save()

    def save(self):
        """
        Save the results in a sorted manner to ../clusters/kmeans.txt

        One line per cluster, members as comma-separated row indices in
        ascending order; clusters are ordered by their smallest member.
        Empty clusters are written last as empty lines.
        """
        import os

        out_path = "../clusters/kmeans.txt"
        sorted_results = sorted(
            (sorted(x) for x in self._results),
            key=lambda x: x[0] if x else float("inf"),
        )
        # Create the output directory on first use.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with open(out_path, 'w') as f_open:
            for result in sorted_results:
                f_open.write(','.join([str(x) for x in result]))
                f_open.write('\n')

In [None]:
# Run the custom cosine-distance K-means on the TF-IDF matrix. The tunables
# are spelled out (they equal the class defaults) so they are easy to adjust.
# fit() prints the resulting clusters and writes them to disk.
kmeans = KMeans(n_clusters=8, iters=300)
kmeans.fit(tfidf_matrix)